diff options
Diffstat (limited to 'fs')
612 files changed, 20729 insertions, 11416 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile index ff7be98f84f2..9619ccadd2fc 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o vfs_dentry.o \ v9fs.o \ fid.o \ - xattr.o \ - xattr_user.o \ - xattr_trusted.o + xattr.o 9p-$(CONFIG_9P_FSCACHE) += cache.o 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o -9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 31c010372660..9da967f38387 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) return 0; } /* get the default/access acl values and cache them */ - dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); - pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); + dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT); + pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS); if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); @@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) goto err_free_out; switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -212,42 +212,22 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, return 0; } -static int v9fs_remote_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - char *full_name; - - switch (type) { - case ACL_TYPE_ACCESS: - full_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - full_name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - BUG(); - } - return v9fs_xattr_get(dentry, full_name, buffer, size); -} - -static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int v9fs_xattr_get_acl(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct v9fs_session_info *v9ses; struct posix_acl *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * We allow set/get/list of acl when access=client is not specified */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_remote_get_acl(dentry, name, buffer, size, type); + return v9fs_xattr_get(dentry, handler->name, buffer, size); - acl = v9fs_get_cached_acl(d_inode(dentry), type); + acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -258,46 +238,23 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, return error; } -static int v9fs_remote_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, - int flags, int type) -{ - char *full_name; - - switch (type) { - case ACL_TYPE_ACCESS: - full_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - full_name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - BUG(); - } - return v9fs_xattr_set(dentry, full_name, value, size, flags); -} - - -static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, - int flags, int type) +static int v9fs_xattr_set_acl(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * set the attribute on the remote. Without even looking at the * xattr value. We leave it to the server to validate */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_remote_set_acl(dentry, name, - value, size, flags, type); + return v9fs_xattr_set(dentry, handler->name, value, size, + flags); if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -316,9 +273,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, } else acl = NULL; - switch (type) { + switch (handler->flags) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; if (acl) { umode_t mode = inode->i_mode; retval = posix_acl_equiv_mode(acl, &mode); @@ -349,7 +305,6 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, } break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; if (!S_ISDIR(inode->i_mode)) { retval = acl ? -EINVAL : 0; goto err_out; @@ -358,23 +313,23 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, default: BUG(); } - retval = v9fs_xattr_set(dentry, name, value, size, flags); + retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); if (!retval) - set_cached_acl(inode, type, acl); + set_cached_acl(inode, handler->flags, acl); err_out: posix_acl_release(acl); return retval; } const struct xattr_handler v9fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, }; const struct xattr_handler v9fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, diff --git a/fs/9p/cache.c b/fs/9p/cache.c index a69260f27555..103ca5e1267b 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) if (!v9inode->fscache) return; - spin_lock(&v9inode->fscache_lock); + mutex_lock(&v9inode->fscache_lock); if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else v9fs_cache_inode_get_cookie(inode); - spin_unlock(&v9inode->fscache_lock); + mutex_unlock(&v9inode->fscache_lock); } void v9fs_cache_inode_reset_cookie(struct inode *inode) @@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) old = v9inode->fscache; - spin_lock(&v9inode->fscache_lock); + mutex_lock(&v9inode->fscache_lock); fscache_relinquish_cookie(v9inode->fscache, 1); v9ses = v9fs_inode2v9ses(inode); @@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); - spin_unlock(&v9inode->fscache_lock); + mutex_unlock(&v9inode->fscache_lock); } int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 2f9675491095..247e47e54bcc 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -21,6 +21,7 @@ */ #ifndef _9P_CACHE_H +#define _9P_CACHE_H #ifdef CONFIG_9P_FSCACHE #include <linux/fscache.h> #include <linux/spinlock.h> diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca025019d..072e7599583a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 0923f2cf3c80..6877050384a1 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -123,7 +123,7 @@ struct v9fs_session_info { struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE - spinlock_t fscache_lock; + struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3abc447783aa..7bf835f85bc8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if ((fl->fl_flags & FL_POSIX) != FL_POSIX) BUG(); - res = posix_lock_file_wait(filp, fl); + res = locks_lock_file_wait(filp, fl); if (res < 0) goto out; @@ -231,7 +231,8 @@ out_unlock: if (res < 0 && fl->fl_type != F_UNLCK) { fl_type = fl->fl_type; fl->fl_type = F_UNLCK; - res = posix_lock_file_wait(filp, fl); + /* Even if this fails we want to return the remote error */ + locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } out: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b1dc51888048..3a08b3e6ff1d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - spin_lock_init(&v9inode->fscache_lock); + mutex_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; @@ -451,9 +451,9 @@ void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - truncate_inode_pages_final(inode->i_mapping); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - filemap_fdatawrite(inode->i_mapping); + filemap_fdatawrite(&inode->i_data); v9fs_cache_inode_put_cookie(inode); /* clunk the fid stashed in writeback_fid */ @@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) } /** - * v9fs_vfs_follow_link - follow a symlink path + * v9fs_vfs_get_link - follow a symlink path * @dentry: dentry for symlink - * @cookie: place to pass the data to put_link() + * @inode: inode for symlink + * @done: delayed call for when we are done with the return value */ -static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) +static const char *v9fs_vfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_session_info *v9ses; + struct p9_fid *fid; struct p9_wstat *st; char *res; + if (!dentry) + return ERR_PTR(-ECHILD); + + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); if (IS_ERR(fid)) @@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) p9stat_free(st); kfree(st); - return *cookie = res; + set_delayed_call(done, kfree_link, res); + return res; } /** @@ -1368,9 +1377,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde dir->i_ino, dentry, mode, MAJOR(rdev), MINOR(rdev)); - if (!new_valid_dev(rdev)) - return -EINVAL; - /* build extension */ if (S_ISBLK(mode)) sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); @@ -1455,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link, - .put_link = kfree_put_link, + .get_link = v9fs_vfs_get_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e8aa57dc8d6d..a34702c998f5 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); - if (!new_valid_dev(rdev)) - return -EINVAL; - v9ses = v9fs_inode2v9ses(dir); dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); @@ -902,26 +899,34 @@ error: } /** - * v9fs_vfs_follow_link_dotl - follow a symlink path + * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink - * @cookie: place to pass the data to put_link() + * @inode: inode for symlink + * @done: destructor for return value */ static const char * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) +v9fs_vfs_get_link_dotl(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct p9_fid *fid; char *target; int retval; + if (!dentry) + return ERR_PTR(-ECHILD); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); if (retval) return ERR_PTR(retval); - return *cookie = target; + set_delayed_call(done, kfree_link, target); + return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) @@ -987,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = kfree_put_link, + .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .setxattr = generic_setxattr, diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 0cf44b6cccd6..9dd9b47a6c1a 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -137,6 +137,44 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); } +static int v9fs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + const char *full_name = xattr_full_name(handler, name); + + return v9fs_xattr_get(dentry, full_name, buffer, size); +} + +static int v9fs_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + const char *full_name = xattr_full_name(handler, name); + + return v9fs_xattr_set(dentry, full_name, value, size, flags); +} + +static struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_handler_get, + .set = v9fs_xattr_handler_set, +}; + +static struct xattr_handler v9fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = v9fs_xattr_handler_get, + .set = v9fs_xattr_handler_set, +}; + +#ifdef CONFIG_9P_FS_SECURITY +static struct xattr_handler v9fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = v9fs_xattr_handler_get, + .set = v9fs_xattr_handler_set, +}; +#endif + const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index d3e2ea3840be..c63c3bea5de5 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -19,9 +19,6 @@ #include <net/9p/client.h> extern const struct xattr_handler *v9fs_xattr_handlers[]; -extern struct xattr_handler v9fs_xattr_user_handler; -extern struct xattr_handler v9fs_xattr_trusted_handler; -extern struct xattr_handler v9fs_xattr_security_handler; extern const struct xattr_handler v9fs_xattr_acl_access_handler; extern const struct xattr_handler v9fs_xattr_acl_default_handler; diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c deleted file mode 100644 index cb247a142a6e..000000000000 --- a/fs/9p/xattr_security.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright IBM Corporation, 2010 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include "xattr.h" - -static int v9fs_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(full_name+prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_get(dentry, full_name, buffer, size); - kfree(full_name); - return retval; -} - -static int v9fs_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(full_name + prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_set(dentry, full_name, value, size, flags); - kfree(full_name); - return retval; -} - -struct xattr_handler v9fs_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = v9fs_xattr_security_get, - .set = v9fs_xattr_security_set, -}; diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c deleted file mode 100644 index e30d33b8a3fb..000000000000 --- a/fs/9p/xattr_trusted.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright IBM Corporation, 2010 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include "xattr.h" - -static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(full_name+prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_get(dentry, full_name, buffer, size); - kfree(full_name); - return retval; -} - -static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(full_name + prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_set(dentry, full_name, value, size, flags); - kfree(full_name); - return retval; -} - -struct xattr_handler v9fs_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .get = v9fs_xattr_trusted_get, - .set = v9fs_xattr_trusted_set, -}; diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c deleted file mode 100644 index d0b701b72080..000000000000 --- a/fs/9p/xattr_user.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright IBM Corporation, 2010 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include "xattr.h" - -static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_USER_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_USER_PREFIX, prefix_len); - memcpy(full_name+prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_get(dentry, full_name, buffer, size); - kfree(full_name); - return retval; -} - -static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - int retval; - char *full_name; - size_t name_len; - size_t prefix_len = XATTR_USER_PREFIX_LEN; - - if (name == NULL) - return -EINVAL; - - if (strcmp(name, "") == 0) - return -EINVAL; - - name_len = strlen(name); - full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); - if (!full_name) - return -ENOMEM; - memcpy(full_name, XATTR_USER_PREFIX, prefix_len); - memcpy(full_name + prefix_len, name, name_len); - full_name[prefix_len + name_len] = '\0'; - - retval = v9fs_xattr_set(dentry, full_name, value, size, flags); - kfree(full_name); - return retval; -} - -struct xattr_handler v9fs_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = v9fs_xattr_user_get, - .set = v9fs_xattr_user_set, -}; diff --git a/fs/Kconfig b/fs/Kconfig index da3f32f1a4e4..9adee0d7536e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -46,6 +46,13 @@ config FS_DAX or if unsure, say N. Saying Y will increase the size of the kernel by about 5kB. +config FS_DAX_PMD + bool + default FS_DAX + depends on FS_DAX + depends on ZONE_DEVICE + depends on TRANSPARENT_HUGEPAGE + endif # BLOCK # Posix ACL utility routines @@ -67,6 +74,16 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +config MANDATORY_FILE_LOCKING + bool "Enable Mandatory file locking" + depends on FILE_LOCKING + default y + help + This option enables files appropriately marked files on appropriely + mounted filesystems to support mandatory locking. + + To the best of my knowledge this is dead code that no one cares about. + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index f79cf4043e60..79f522575cba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -63,10 +63,11 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ -obj-$(CONFIG_EXT2_FS) += ext2/ -# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 -# unless explicitly requested by rootfstype obj-$(CONFIG_EXT4_FS) += ext4/ +# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the +# ext2 driver, which doesn't know about journalling! Explicitly request ext2 +# by giving the rootfstype= parameter. +obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 24575d9d882d..ea4aba56f29d 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -45,7 +45,7 @@ struct adfs_dir_ops; struct adfs_sb_info { union { struct { struct adfs_discmap *s_map; /* bh list containing map */ - struct adfs_dir_ops *s_dir; /* directory operations */ + const struct adfs_dir_ops *s_dir; /* directory operations */ }; struct rcu_head rcu; /* used only at shutdown time */ }; @@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function, extern const struct inode_operations adfs_dir_inode_operations; extern const struct file_operations adfs_dir_operations; extern const struct dentry_operations adfs_dentry_operations; -extern struct adfs_dir_ops adfs_f_dir_ops; -extern struct adfs_dir_ops adfs_fplus_dir_ops; +extern const struct adfs_dir_ops adfs_f_dir_ops; +extern const struct adfs_dir_ops adfs_fplus_dir_ops; extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 51c279a29845..fd4cf2c48e48 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; struct adfs_dir dir; int ret = 0; @@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { int ret = -EINVAL; #ifdef CONFIG_ADFS_FS_RW - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n", @@ -129,7 +129,7 @@ static int adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) { struct super_block *sb = inode->i_sb; - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; int ret; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 4bbe853ee50a..0fbfd0b04ae0 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir) dir->sb = NULL; } -struct adfs_dir_ops adfs_f_dir_ops = { +const struct adfs_dir_ops adfs_f_dir_ops = { .read = adfs_f_read, .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 82d14cdf70f9..c92cfb638c18 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir) dir->sb = NULL; } -struct adfs_dir_ops adfs_fplus_dir_ops = { +const struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 4d4a0df8344f..c9fdfb112933 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c69a87eaf57d..cc2b2efc9211 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -138,7 +138,7 @@ extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); -extern void secs_to_datestamp(time_t secs, struct affs_date *ds); +extern void secs_to_datestamp(time64_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); __printf(3, 4) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 5fa92bc790ef..d6c7a51c93e4 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -8,6 +8,7 @@ * Please send bug reports to: hjw@zvw.de */ +#include <linux/math64.h> #include "affs.h" /* @@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) } void -secs_to_datestamp(time_t secs, struct affs_date *ds) +secs_to_datestamp(time64_t secs, struct affs_date *ds) { u32 days; u32 minute; + s32 rem; secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); if (secs < 0) secs = 0; - days = secs / 86400; - secs -= days * 86400; - minute = secs / 60; - secs -= minute * 60; + days = div_s64_rem(secs, 86400, &rem); + minute = rem / 60; + rem -= minute * 60; ds->days = cpu_to_be32(days); ds->mins = cpu_to_be32(minute); - ds->ticks = cpu_to_be32(secs * 50); + ds->ticks = cpu_to_be32(rem * 50); } umode_t diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 17349500592d..0fdb0f5b2239 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; case ST_SOFTLINK: inode->i_mode |= S_IFLNK; + inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; inode->i_data.a_ops = &affs_symlink_aops; break; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 181e05b46e72..00d3002a6780 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return -ENOSPC; inode->i_op = &affs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &affs_symlink_aops; inode->i_mode = S_IFLNK | 0777; mode_to_prot(inode); diff --git a/fs/affs/super.c b/fs/affs/super.c index 5b50c4ca43a7..2a6713b6b9f4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait) struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); lock_buffer(bh); - secs_to_datestamp(get_seconds(), &tail->disk_change); + secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change); affs_fix_checksum(sb, bh); unlock_buffer(bh); @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ea5b69a18ba9..69b03dbb792f 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; struct inode *inode = page->mapping->host; - char *link = kmap(page); + char *link = page_address(page); struct slink_front *lf; int i, j; char c; char lc; - pr_debug("follow_link(ino=%lu)\n", inode->i_ino); + pr_debug("get_link(ino=%lu)\n", inode->i_ino); bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) @@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page) link[i] = '\0'; affs_brelse(bh); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } @@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = { const struct inode_operations affs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = affs_notify_change, }; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e06f5a23352a..86cc7264c21c 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) case AFS_FTYPE_SYMLINK: inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 24a905b076fd..2853b4095344 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, if (size <= 1 || size >= PAGE_SIZE) return -EINVAL; - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto done; - kbuf[size] = 0; + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* trim to first NL */ name = memchr(kbuf, '\n', size); @@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (size <= 1 || size >= PAGE_SIZE) return -EINVAL; - ret = -ENOMEM; - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - goto nomem; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto infault; - kbuf[size] = 0; + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* trim to first NL */ s = memchr(kbuf, '\n', size); @@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (ret >= 0) ret = size; /* consume everything, always */ -infault: kfree(kbuf); -nomem: _leave(" = %d", ret); return ret; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129f7d..81afefe7d8a6 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index da0c33481bc0..84e037d1d129 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,10 +12,16 @@ #include "autofs_i.h" -static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) +static const char *autofs4_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi; + struct autofs_info *ino; + if (!dentry) + return ERR_PTR(-ECHILD); + sbi = autofs4_sbi(dentry->d_sb); + ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; @@ -23,5 +29,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) const struct inode_operations autofs4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = autofs4_follow_link + .get_link = autofs4_get_link }; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 861b1e1c4777..103f5d7c3083 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode); * Returns true if the inode in question has been marked as bad. */ -int is_bad_inode(struct inode *inode) +bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 46aedacfa6a8..cc0e08252913 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static const char *befs_follow_link(struct dentry *, void **); +static int befs_symlink_readpage(struct file *, struct page *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static const struct inode_operations befs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = befs_follow_link, - .put_link = kfree_put_link, +static const struct address_space_operations befs_symlink_aops = { + .readpage = befs_symlink_readpage, }; /* @@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &befs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { - inode->i_op = &befs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &befs_symlink_aops; } else { inode->i_link = befs_ino->i_data.symlink; inode->i_op = &simple_symlink_inode_operations; @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); @@ -463,31 +463,33 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. */ -static const char * -befs_follow_link(struct dentry *dentry, void **cookie) +static int befs_symlink_readpage(struct file *unused, struct page *page) { - struct super_block *sb = dentry->d_sb; - struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link; + char *link = page_address(page); - if (len == 0) { + if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); - return ERR_PTR(-EIO); + goto fail; } befs_debug(sb, "Follow long symlink"); - link = kmalloc(len, GFP_NOFS); - if (!link) - return ERR_PTR(-ENOMEM); if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); befs_error(sb, "Failed to read entire long symlink"); - return ERR_PTR(-EIO); + goto fail; } link[len - 1] = '\0'; - return *cookie = link; + SetPageUptodate(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + unlock_page(page); + return -EIO; } /* diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..1e5c896f6b79 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6b659967898e..3a93755e880f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include <linux/utsname.h> #include <linux/coredump.h> #include <linux/sched.h> +#include <linux/dax.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -487,7 +488,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, } /** - * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header + * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. * @state: Architecture-specific state preserved throughout the process @@ -759,16 +760,16 @@ static int load_elf_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); - retval = kernel_read(interpreter, 0, bprm->buf, - BINPRM_BUF_SIZE); - if (retval != BINPRM_BUF_SIZE) { + /* Get the exec headers */ + retval = kernel_read(interpreter, 0, + (void *)&loc->interp_elf_ex, + sizeof(loc->interp_elf_ex)); + if (retval != sizeof(loc->interp_elf_ex)) { if (retval >= 0) retval = -EIO; goto out_free_dentry; } - /* Get the exec headers */ - loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); break; } elf_ppnt++; @@ -1236,6 +1237,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (vma->vm_flags & VM_DONTDUMP) return 0; + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + /* Hugetlb memory check */ if (vma->vm_flags & VM_HUGETLB) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d3634bfb7fe1..b1adb92e69de 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -35,6 +35,7 @@ #include <linux/elf-fdpic.h> #include <linux/elfcore.h> #include <linux/coredump.h> +#include <linux/dax.h> #include <asm/uaccess.h> #include <asm/param.h> @@ -103,19 +104,36 @@ static void __exit exit_elf_fdpic_binfmt(void) core_initcall(init_elf_fdpic_binfmt); module_exit(exit_elf_fdpic_binfmt); -static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) +static int is_elf(struct elfhdr *hdr, struct file *file) { if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0) return 0; if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) return 0; - if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr)) + if (!elf_check_arch(hdr)) return 0; if (!file->f_op->mmap) return 0; return 1; } +#ifndef elf_check_fdpic +#define elf_check_fdpic(x) 0 +#endif + +#ifndef elf_check_const_displacement +#define elf_check_const_displacement(x) 0 +#endif + +static int is_constdisp(struct elfhdr *hdr) +{ + if (!elf_check_fdpic(hdr)) + return 1; + if (elf_check_const_displacement(hdr)) + return 1; + return 0; +} + /*****************************************************************************/ /* * read the program headers table into memory @@ -191,8 +209,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* check that this is a binary we know how to deal with */ retval = -ENOEXEC; - if (!is_elf_fdpic(&exec_params.hdr, bprm->file)) + if (!is_elf(&exec_params.hdr, bprm->file)) + goto error; + if (!elf_check_fdpic(&exec_params.hdr)) { +#ifdef CONFIG_MMU + /* binfmt_elf handles non-fdpic elf except on nommu */ goto error; +#else + /* nommu can only load ET_DYN (PIE) ELF */ + if (exec_params.hdr.e_type != ET_DYN) + goto error; +#endif + } /* read the program header table */ retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file); @@ -269,13 +297,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) } - if (elf_check_const_displacement(&exec_params.hdr)) + if (is_constdisp(&exec_params.hdr)) exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; /* perform insanity checks on the interpreter */ if (interpreter_name) { retval = -ELIBBAD; - if (!is_elf_fdpic(&interp_params.hdr, interpreter)) + if (!is_elf(&interp_params.hdr, interpreter)) goto error; interp_params.flags = ELF_FDPIC_FLAG_PRESENT; @@ -306,9 +334,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (stack_size == 0) - goto error; + stack_size = 131072UL; /* same as exec.c's default commit */ - if (elf_check_const_displacement(&interp_params.hdr)) + if (is_constdisp(&interp_params.hdr)) interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP; /* flush all traces of the currently running executable */ @@ -319,7 +347,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* there's now no turning back... the old userspace image is dead, * defunct, deceased, etc. */ - set_personality(PER_LINUX_FDPIC); + if (elf_check_fdpic(&exec_params.hdr)) + set_personality(PER_LINUX_FDPIC); + else + set_personality(PER_LINUX); if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -374,10 +405,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) PAGE_ALIGN(current->mm->start_brk); #else - /* create a stack and brk area big enough for everyone - * - the brk heap starts at the bottom and works up - * - the stack starts at the top and works down - */ + /* create a stack area and zero-size brk area */ stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK; if (stack_size < PAGE_SIZE * 2) stack_size = PAGE_SIZE * 2; @@ -400,8 +428,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; - current->mm->context.end_brk += - (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; current->mm->start_stack = current->mm->start_brk + stack_size; #endif @@ -1206,6 +1232,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) return 0; } + /* support for DAX */ + if (vma_is_dax(vma)) { + if (vma->vm_flags & VM_SHARED) { + dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); + kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + } else { + dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); + kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, + vma->vm_flags, dump_ok ? "yes" : "no"); + } + return dump_ok; + } + /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { if (file_inode(vma->vm_file)->i_nlink == 0) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 073bb57adab1..ba762ea07f67 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -static void bdev_write_inode(struct inode *inode) +static void bdev_write_inode(struct block_device *bdev) { + struct inode *inode = bdev->bd_inode; + int ret; + spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); - WARN_ON_ONCE(write_inode_now(inode, true)); + ret = write_inode_now(inode, true); + if (ret) { + char name[BDEVNAME_SIZE]; + pr_warn_ratelimited("VFS: Dirty inode writeback failed " + "for block device %s (err=%d).\n", + bdevname(bdev, name), ret); + } spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); @@ -147,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -329,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -340,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -381,9 +395,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; + int result = -EOPNOTSUPP; + if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + return result; + + result = blk_queue_enter(bdev->bd_queue, false); + if (result) + return result; + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + blk_queue_exit(bdev->bd_queue); + return result; } EXPORT_SYMBOL_GPL(bdev_read_page); @@ -412,14 +434,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; + result = blk_queue_enter(bdev->bd_queue, false); + if (result) + return result; + set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); if (result) end_page_writeback(page); else unlock_page(page); + blk_queue_exit(bdev->bd_queue); return result; } EXPORT_SYMBOL_GPL(bdev_write_page); @@ -427,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); /** * bdev_direct_access() - Get the address for directly-accessibly memory * @bdev: The device containing the memory - * @sector: The offset within the device - * @addr: Where to put the address of the memory - * @pfn: The Page Frame Number for the memory - * @size: The number of bytes requested + * @dax: control and output parameters for ->direct_access * * If a block device is made up of directly addressable memory, this function * will tell the caller the PFN and the address of the memory. The address @@ -441,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * Return: negative errno if an error occurs, otherwise the number of bytes * accessible at this address. */ -long bdev_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **addr, unsigned long *pfn, long size) +long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) { - long avail; + sector_t sector = dax->sector; + long avail, size = dax->size; const struct block_device_operations *ops = bdev->bd_disk->fops; /* @@ -463,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); if (!avail) return -ERANGE; + if (avail > 0 && avail & ~PAGE_MASK) + return -ENXIO; return min(avail, size); } EXPORT_SYMBOL_GPL(bdev_direct_access); @@ -567,7 +594,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) @@ -673,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -689,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -712,7 +739,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /** @@ -1019,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); static void flush_disk(struct block_device *bdev, bool kill_dirty) { if (__invalidate_device(bdev, kill_dirty)) { - char name[BDEVNAME_SIZE] = ""; - - if (bdev->bd_disk) - disk_name(bdev->bd_disk, 0, name); printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", name); + "resized disk %s\n", + bdev->bd_disk ? bdev->bd_disk->disk_name : ""); } if (!bdev->bd_disk) @@ -1048,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - char name[BDEVNAME_SIZE]; - - disk_name(disk, 0, name); printk(KERN_INFO "%s: detected capacity change from %lld to %lld\n", - name, bdev_size, disk_size); + disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); flush_disk(bdev, false); } @@ -1075,7 +1096,7 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - + blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1207,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1222,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1242,12 +1267,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1500,11 +1520,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + + bdev_write_inode(bdev); /* - * ->release can cause the queue to disappear, so flush all - * dirty data before. + * Detaching bdev inode from its wb in __destroy_inode() + * is too late: the queue which embeds its bdi (along with + * root wb) can be gone as soon as we put_disk() below. */ - bdev_write_inode(bdev->bd_inode); + inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1579,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1611,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1643,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1682,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6d1d0b93b1aa..128ce17a80b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o + uuid-tree.o props.o hash.o free-space-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ + tests/free-space-tree-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95851..6d263bb1621c 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { - value = kzalloc(size, GFP_NOFS); + value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); @@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) @@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EINVAL : 0; - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; @@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, if (acl) { size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = kmalloc(size, GFP_KERNEL); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 3e36e4adc4a3..88d9af3d4581 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -97,7 +97,7 @@ static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int limit_active, int thresh) { - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9a2ec79e8cfb..08405a3da6b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -355,13 +355,19 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, index = srcu_read_lock(&fs_info->subvol_srcu); - root = btrfs_read_fs_root_no_name(fs_info, &root_key); + root = btrfs_get_fs_root(fs_info, &root_key, false); if (IS_ERR(root)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); goto out; } + if (btrfs_test_is_dummy_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } + if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (time_seq == (u64)-1) @@ -514,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1, static int __add_missing_keys(struct btrfs_fs_info *fs_info, struct list_head *head) { - struct list_head *pos; + struct __prelim_ref *ref; struct extent_buffer *eb; - list_for_each(pos, head) { - struct __prelim_ref *ref; - ref = list_entry(pos, struct __prelim_ref, list); - + list_for_each_entry(ref, head, list) { if (ref->parent) continue; if (ref->key_for_search.type) @@ -557,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct list_head *pos1; + struct __prelim_ref *ref1; - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; + list_for_each_entry(ref1, head, list) { + struct __prelim_ref *ref2 = ref1, *tmp; - ref1 = list_entry(pos1, struct __prelim_ref, list); - - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; + list_for_each_entry_safe_continue(ref2, tmp, head, list) { struct __prelim_ref *xchg; struct extent_inode_elem *eie; - ref2 = list_entry(pos2, struct __prelim_ref, list); - if (!ref_for_same_block(ref1, ref2)) continue; if (mode == 1) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13fae2..61205e3bbefa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -192,6 +192,10 @@ struct btrfs_inode { /* File creation time. */ struct timespec i_otime; + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + long delayed_iput_count; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 541fbfaed276..861d472564c1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup( (((unsigned int)(dev_bytenr >> 16)) ^ ((unsigned int)((uintptr_t)bdev))) & (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); + struct btrfsic_block *b; + list_for_each_entry(b, h->table + hashval, collision_resolving_node) { if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) return b; } @@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( ((unsigned int)((uintptr_t)bdev_ref_to)) ^ ((unsigned int)((uintptr_t)bdev_ref_from))) & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); + struct btrfsic_block_link *l; + list_for_each_entry(l, h->table + hashval, collision_resolving_node) { BUG_ON(NULL == l->block_ref_to); BUG_ON(NULL == l->block_ref_from); if (l->block_ref_to->dev_state->bdev == bdev_ref_to && @@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( const unsigned int hashval = (((unsigned int)((uintptr_t)bdev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); + struct btrfsic_dev_state *ds; + list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { if (ds->bdev == bdev) return ds; } @@ -667,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; + return -ENOMEM; } list_for_each_entry(device, dev_head, dev_list) { @@ -845,8 +833,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", + btrfs_info_in_rcu(device->dev_root->fs_info, + "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, dev_state->name, dev_bytenr, @@ -1660,7 +1648,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) - return -1; + return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); for (i = 0; i < num_pages; i++) { @@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state, static void btrfsic_dump_database(struct btrfsic_state *state) { - struct list_head *elem_all; + const struct btrfsic_block *b_all; BUG_ON(NULL == state); printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; + list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { + const struct btrfsic_block_link *l; printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), b_all->logical_bytenr, b_all->dev_state->name, b_all->dev_bytenr, b_all->mirror_num); - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " refers %u* to" " %c @%llu (%s/%llu/%d)\n", @@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state) l->block_ref_to->mirror_num); } - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", @@ -1845,8 +1819,7 @@ again: &state->block_hashtable); if (NULL != block) { u64 bytenr = 0; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; + struct btrfsic_block_link *l, *tmp; if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) @@ -1967,13 +1940,8 @@ again: * because it still carries valueable information * like whether it was ever written and IO completed. */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry_safe(l, tmp, &block->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); l->ref_cnt--; @@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, struct btrfsic_block *const block, int recursion_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int ret = 0; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { @@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack * space is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock( const struct btrfsic_block *block, int recursion_level) { - struct list_head *elem_ref_from; + const struct btrfsic_block_link *l; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { /* refer to comment at "abort cyclic linkage (case 1)" */ @@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock( * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, const struct btrfsic_block *block, int indent_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int indent_add; static char buf[80]; int cursor_position; @@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, } cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { while (cursor_position < indent_level) { printk(" "); cursor_position++; @@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root, void btrfsic_unmount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices) { - struct list_head *elem_all; - struct list_head *tmp_all; + struct btrfsic_block *b_all, *tmp_all; struct btrfsic_state *state; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root, * just free all memory that was allocated dynamically. * Free the blocks and the block_links. */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); + list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, + all_blocks_node) { + struct btrfsic_block_link *l, *tmp; + list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 57ee8ca29b06..c473c42d7d6c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) break; - if (add_to_page_cache_lru(page, mapping, pg_index, - GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { page_cache_release(page); goto next; } @@ -745,11 +744,13 @@ out: return ret; } -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; +static struct { + struct list_head idle_ws; + spinlock_t ws_lock; + int num_ws; + atomic_t alloc_ws; + wait_queue_head_t ws_wait; +} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -761,10 +762,10 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); + spin_lock_init(&btrfs_comp_ws[i].ws_lock); + atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -778,38 +779,38 @@ static struct list_head *find_workspace(int type) int cpus = num_online_cpus(); int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); + (*num_ws)--; + spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_workspace) > cpus) { + if (atomic_read(alloc_ws) > cpus) { DEFINE_WAIT(wait); - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_ws) > cpus && !*num_ws) schedule(); - finish_wait(workspace_wait, &wait); + finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); + atomic_inc(alloc_ws); + spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); + atomic_dec(alloc_ws); + wake_up(ws_wait); } return workspace; } @@ -821,27 +822,30 @@ again: static void free_workspace(int type, struct list_head *workspace) { int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; + + spin_lock(ws_lock); + if (*num_ws < num_online_cpus()) { + list_add(workspace, idle_ws); + (*num_ws)++; + spin_unlock(ws_lock); goto wake; } - spin_unlock(workspace_lock); + spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); + atomic_dec(alloc_ws); wake: + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); + if (waitqueue_active(ws_wait)) + wake_up(ws_wait); } /* @@ -853,11 +857,11 @@ static void free_workspaces(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; + while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { + workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); + atomic_dec(&btrfs_comp_ws[i].alloc_ws); } } } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f745eadf77d..769e0ff1b4ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) btrfs_set_lock_blocking(parent); @@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root, u64 target; u64 nread = 0; u64 gen; - int direction = path->reada; struct extent_buffer *eb; u32 nr; u32 blocksize; @@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD) { nr++; if (nr >= nritems) break; } - if (path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; @@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); free_extent_buffer(tmp); - if (p->reada) + if (p->reada != READA_NONE) reada_for_search(root, p, level, slot, key->objectid); btrfs_release_path(p); @@ -4940,8 +4939,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 938efe33be80..97ad9bbeb35d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ #include <linux/btrfs.h> #include <linux/workqueue.h> #include <linux/security.h> +#include <linux/sizes.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -96,6 +97,9 @@ struct btrfs_ordered_sum; /* for storing items that use the BTRFS_UUID_KEY* types */ #define BTRFS_UUID_TREE_OBJECTID 9ULL +/* tracks free space in block groups. */ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -174,7 +178,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4 }; +static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) -#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M -#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * The key defines the order in the tree, and so it also defines (optimal) @@ -500,6 +504,8 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) + #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) @@ -526,7 +532,10 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -590,14 +599,15 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ +enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; - int reada; + u8 locks[BTRFS_MAX_LEVEL]; + u8 reada; /* keep some upper locks as we walk down */ - int lowest_level; + u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks @@ -823,8 +833,18 @@ struct btrfs_disk_balance_args { */ __le64 profiles; - /* usage filter */ - __le64 usage; + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; /* devid filter */ __le64 devid; @@ -846,10 +866,27 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - /* BTRFS_BALANCE_ARGS_LIMIT value */ - __le64 limit; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; - __le64 unused[7]; + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; } __attribute__ ((__packed__)); /* @@ -1061,6 +1098,13 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) + #define BTRFS_QGROUP_LEVEL_SHIFT 48 static inline u64 btrfs_qgroup_level(u64 qgroupid) { @@ -1154,6 +1198,10 @@ struct btrfs_space_info { delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -1228,6 +1276,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* We did a full search and couldn't create a cluster */ + bool fragmented; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -1262,6 +1313,9 @@ struct btrfs_caching_control { atomic_t count; }; +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) + struct btrfs_io_ctl { void *cur, *orig; struct page *page; @@ -1287,8 +1341,20 @@ struct btrfs_block_group_cache { u64 delalloc_bytes; u64 bytes_super; u64 flags; - u64 sectorsize; u64 cache_generation; + u32 sectorsize; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because @@ -1344,6 +1410,15 @@ struct btrfs_block_group_cache { struct list_head io_list; struct btrfs_io_ctl io_ctl; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; }; /* delayed seq elem */ @@ -1395,6 +1470,7 @@ struct btrfs_fs_info { struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; + struct btrfs_root *free_space_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1782,6 +1858,8 @@ struct btrfs_fs_info { * and will be latter freed. Protected by fs_info->chunk_mutex. */ struct list_head pinned_chunks; + + int creating_free_space_tree; }; struct btrfs_subvolume_writers { @@ -1943,6 +2021,9 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshoted; + + /* For qgroup metadata space reserve */ + atomic_t qgroup_meta_rsv; }; struct btrfs_ioctl_defrag_range_args { @@ -2055,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. + * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + #define BTRFS_DEV_EXTENT_KEY 204 #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 @@ -2145,6 +2247,9 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) +#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2169,6 +2274,18 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +#ifdef CONFIG_BTRFS_DEBUG +static inline int +btrfs_should_fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + return (btrfs_test_opt(root, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(root, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Requests for changes that need to be done during transaction commit. * @@ -2455,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags, BTRFS_SETGET_STACK_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); @@ -3316,7 +3438,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { - return mapping_gfp_mask(mapping) & ~__GFP_FS; + return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* extent-tree.c */ @@ -3365,6 +3487,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); +void btrfs_get_block_group(struct btrfs_block_group_cache *cache); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, @@ -3379,7 +3502,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, @@ -3398,7 +3522,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota); + u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, int delalloc); @@ -3411,7 +3535,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int no_quota); + u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3427,6 +3551,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, + const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start, struct extent_map *em); @@ -3449,8 +3576,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -3466,8 +3596,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); @@ -3514,6 +3644,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3678,6 +3811,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->csum_root); kfree(fs_info->quota_root); kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); @@ -3847,7 +3981,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; - int wait; int delay_iput; struct completion completion; struct list_head list; @@ -3855,7 +3988,7 @@ struct btrfs_delalloc_work { }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput); + int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, @@ -3965,7 +4098,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); - +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff); /* file.c */ int btrfs_auto_defrag_init(void); @@ -3996,6 +4130,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -4004,8 +4143,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -4039,14 +4178,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold @@ -4100,16 +4327,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) -static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. @@ -4127,14 +4436,7 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ +#define btrfs_std_error(fs_info, errno, fmt, args...) \ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2ae42720a6a..0be47e4b8136 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); - delayed_node->count = 0; - delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); - delayed_node->index_cnt = 0; INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); - delayed_node->bytes_reserved = 0; - memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( @@ -132,7 +127,7 @@ again: if (node) return node; - node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); @@ -463,6 +458,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); + + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if ((atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && waitqueue_active(&delayed_root->wait)) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ac3e81da6d4e..914ac13bd92f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } +static bool merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + bool done = false; + + next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (!done && &next->list != &head->ref_list) { + int mod; + struct btrfs_delayed_ref_node *next2; + + next2 = list_next_entry(next, list); + + if (next == ref) + goto next; + + if (seq && next->seq >= seq) + goto next; + + if (next->type != ref->type) + goto next; + + if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), + btrfs_delayed_node_to_tree_ref(next), + ref->type)) + goto next; + if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || + ref->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(ref), + btrfs_delayed_node_to_data_ref(next))) + goto next; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = true; + } else { + /* + * Can't have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } +next: + next = next2; + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + u64 seq = 0; + + assert_spin_locked(&head->lock); + + if (list_empty(&head->ref_list)) + return; + + /* We don't have too many refs to merge for data. */ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (&ref->list != &head->ref_list) { + if (seq && ref->seq >= seq) + goto next; + + if (merge_ref(trans, delayed_refs, head, ref, seq)) { + if (list_empty(&head->ref_list)) + break; + ref = list_first_entry(&head->ref_list, + struct btrfs_delayed_ref_node, + list); + continue; + } +next: + ref = list_next_entry(ref, list); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->no_quota != ref->no_quota || - exist->seq != ref->seq) + if (exist->type != ref->type || exist->seq != ref->seq) goto add_tail; if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -381,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, memcpy(&existing_ref->extent_op->key, &ref->extent_op->key, sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; + existing_ref->extent_op->update_key = true; } if (ref->extent_op->update_flags) { existing_ref->extent_op->flags_to_set |= ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; + existing_ref->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(ref->extent_op); } @@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, int action, int is_data) + u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, + int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, int count_mod = 1; int must_insert_reserved = 0; + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + /* * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. @@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + head_ref->qgroup_reserved = 0; + head_ref->qgroup_ref_root = 0; /* Record qgroup extent info if provided */ if (qrecord) { + if (ref_root && reserved) { + head_ref->qgroup_ref_root = ref_root; + head_ref->qgroup_reserved = reserved; + } + qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { + WARN_ON(ref_root && reserved && existing->qgroup_ref_root + && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly @@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int no_quota) + int action) { struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int no_quota) + u64 offset, int action) { struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 0); + bytenr, num_bytes, 0, 0, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - no_quota); + num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); return 0; @@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 1); + bytenr, num_bytes, ref_root, reserved, + action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, no_quota); + action); spin_unlock(&delayed_refs->lock); return 0; } +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *ref_head; + int ret = 0; + + if (!fs_info->quota_enabled || !is_fstree(ref_root)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); + if (!ref_head) { + ret = -ENOENT; + goto out; + } + WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); + ref_head->qgroup_ref_root = ref_root; + ref_head->qgroup_reserved = num_bytes; +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 13fb5e6090fe..c24b653c7343 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -76,11 +75,11 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; + u8 level; + bool update_key; + bool update_flags; + bool is_data; u64 flags_to_set; - int level; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; }; /* @@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * For qgroup reserved space freeing. + * + * ref_root and reserved will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * run_delayed_refs() time. + */ + u64 qgroup_ref_root; + u64 qgroup_reserved; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd5905cee..1e668fb7dd4c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - /* - * Here we commit the transaction to make sure commit_total_bytes - * of all the devices are updated. - */ - trans = btrfs_attach_transaction(root); - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - } else if (PTR_ERR(trans) != -ENOENT) { - return PTR_ERR(trans); - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, @@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) return ret; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); - if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ @@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - printk_in_rcu(KERN_ERR - "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info->fs_devices, src_device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data) progress = status_args->status.progress_1000; kfree(status_args); progress = div_u64(progress, 10); - printk_in_rcu(KERN_INFO - "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", dev_replace->srcdev->missing ? "<missing disk>" : rcu_str_deref(dev_replace->srcdev->name), dev_replace->srcdev->devid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e60d00d4ea7..e99ccd6ffb2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" @@ -319,9 +320,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_WARNING - "BTRFS: %s checksum verify failed on %llu wanted %X found %X " - "level %d\n", + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %X found %X " + "level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -362,15 +363,15 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state); + &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; } - printk_ratelimited(KERN_ERR - "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", - eb->fs_info->sb->s_id, eb->start, + btrfs_err_rl(eb->fs_info, + "parent transid verify failed on %llu wanted %llu found %llu", + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -629,15 +630,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " - "%llu %llu\n", - eb->fs_info->sb->s_id, found_start, eb->start); + btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root->fs_info, eb)) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", - eb->fs_info->sb->s_id, eb->start); + btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } @@ -802,6 +802,9 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -921,7 +924,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1265,6 +1268,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1647,6 +1651,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_UUID_TREE_OBJECTID) return fs_info->uuid_root ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return fs_info->free_space_root ? fs_info->free_space_root : + ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { @@ -1759,6 +1766,7 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; + set_freezable(); do { again = 0; @@ -2144,6 +2152,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->uuid_root); if (chunk_root) free_root_extent_buffers(info->chunk_root); + free_root_extent_buffers(info->free_space_root); } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) @@ -2348,8 +2357,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); + btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2364,12 +2372,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); if (IS_ERR(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; @@ -2377,7 +2385,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_error(tree_root->fs_info, ret, + btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2445,6 +2453,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info, fs_info->uuid_root = root; } + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; + } + return 0; } @@ -2572,7 +2589,7 @@ int open_ctree(struct super_block *sb, fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, @@ -2653,8 +2670,8 @@ int open_ctree(struct super_block *sb, * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; + if (IS_ERR(bh)) { + err = PTR_ERR(bh); goto fail_alloc; } @@ -2665,6 +2682,7 @@ int open_ctree(struct super_block *sb, if (btrfs_check_super_csum(bh->b_data)) { printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; + brelse(bh); goto fail_alloc; } @@ -2806,7 +2824,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + SZ_4M / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -2937,7 +2955,7 @@ retry_root_backup: goto fail_fsdev_sysfs; } - ret = btrfs_sysfs_add_one(fs_info); + ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_fsdev_sysfs; @@ -3048,6 +3066,18 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: creating free space tree\n"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { @@ -3073,6 +3103,18 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); + if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: clearing free space tree\n"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to clear free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); @@ -3117,7 +3159,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -3179,8 +3221,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " - "I/O error on %s\n", + btrfs_warn_rl_in_rcu(device->dev_root->fs_info, + "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors @@ -3192,6 +3234,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret) +{ + struct buffer_head *bh; + struct btrfs_super_block *super; + u64 bytenr; + + bytenr = btrfs_sb_offset(copy_num); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + return -EINVAL; + + bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. + */ + if (!bh) + return -EIO; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + return -EINVAL; + } + + *bh_ret = bh; + return 0; +} + + struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; @@ -3199,7 +3272,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) struct btrfs_super_block *super; int i; u64 transid = 0; - u64 bytenr; + int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3207,21 +3280,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) + ret = btrfs_read_dev_one_super(bdev, i, &bh); + if (ret) continue; super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - continue; - } if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); @@ -3231,6 +3294,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) brelse(bh); } } + + if (!latest) + return ERR_PTR(ret); + return latest; } @@ -3299,8 +3366,9 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "BTRFS: couldn't get super " - "buffer head for bytenr %Lu\n", bytenr); + btrfs_err(device->dev_root->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); errors++; continue; } @@ -3449,22 +3517,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { - if ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) - return 0; + int raid_type; + int min_tolerated = INT_MAX; - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) - return 1; + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min(min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); - if (flags & BTRFS_BLOCK_GROUP_RAID6) - return 2; + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_group[raid_type])) + continue; + min_tolerated = min(min_tolerated, + btrfs_raid_array[raid_type]. + tolerated_failures); + } - pr_warn("BTRFS: unknown raid type: %llu\n", flags); - return 0; + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + min_tolerated = 0; + } + + return min_tolerated; } int btrfs_calc_num_tolerated_disk_barrier_failures( @@ -3548,7 +3625,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3588,7 +3665,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3606,7 +3683,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3742,6 +3819,9 @@ void close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* wait for the qgroup rescan worker to stop */ + btrfs_qgroup_wait_for_completion(fs_info); + /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al., set sem back to initial state */ @@ -3792,7 +3872,7 @@ void close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -3861,11 +3941,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, return !ret; } -int btrfs_set_buffer_uptodate(struct extent_buffer *buf) -{ - return set_extent_buffer_uptodate(buf); -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_root *root; @@ -3921,7 +3996,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } - return; } void btrfs_btree_balance_dirty(struct btrfs_root *root) @@ -4290,25 +4364,6 @@ again: return 0; } -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); -} - void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4320,7 +4375,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bdfb479ea859..8e79d0070bcf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -19,7 +19,7 @@ #ifndef __DISKIO__ #define __DISKIO__ -#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 #define BTRFS_SUPER_MIRROR_MAX 3 @@ -35,7 +35,7 @@ enum btrfs_wq_endio_type { static inline u64 btrfs_sb_offset(int mirror) { - u64 start = 16 * 1024; + u64 start = SZ_16K; if (mirror) return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); return BTRFS_SUPER_INFO_OFFSET; @@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); @@ -114,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 601d7d45d164..60cc1399c64f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "math.h" #include "sysfs.h" #include "qgroup.h" @@ -95,8 +96,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -125,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) { atomic_inc(&cache->count); } @@ -332,13 +332,34 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) { u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -375,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -387,19 +407,28 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = -ENOMEM; + int ret; + bool wakeup = true; - caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - goto out; + return -ENOMEM; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -408,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto err; + goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -441,17 +466,20 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto err; + goto out; if (ret) break; leaf = path->nodes[0]; @@ -464,7 +492,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -489,9 +518,10 @@ next: else last = key.objectid + key.offset; - if (total_found > (1024 * 1024 * 2)) { + if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -503,25 +533,58 @@ next: block_group->key.offset); caching_ctl->progress = (u64)-1; +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + spin_lock(&block_group->lock); block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); -err: - btrfs_free_path(path); - up_read(&fs_info->commit_root_sem); - - free_excluded_extents(extent_root, block_group); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; - mutex_unlock(&caching_ctl->mutex); -out: - if (ret) { + spin_lock(&block_group->space_info->lock); spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_ERROR; + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); } +#endif + + caching_ctl->progress = (u64)-1; + + up_read(&fs_info->commit_root_sem); + free_excluded_extents(fs_info->extent_root, block_group); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -607,6 +670,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -617,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } else { /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. */ spin_lock(&cache->lock); if (load_cache_only) { @@ -2009,8 +2088,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2022,12 +2100,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2048,16 +2126,12 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2083,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2196,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); @@ -2291,8 +2365,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -2345,6 +2418,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2433,7 +2511,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2834,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (trans->aborted) return 0; + if (root->fs_info->creating_free_space_tree) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2912,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -3109,7 +3204,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3150,15 +3245,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -3253,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * 1024 * 1024)) { + if (block_group->key.offset < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3339,28 +3433,47 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, SZ_256M); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3590,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; /* - * We don't need the lock here since we are protected by the transaction - * commit. We want to do the cache_save_setup first and then run the + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ + spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, @@ -3606,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * finish and then do it all again */ if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } /* @@ -3618,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * on any pending IO */ list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); @@ -3642,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, root, path, + cache); + } if (ret) btrfs_abort_transaction(trans, root, ret); } @@ -3649,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } + spin_unlock(&cur_trans->dirty_bgs_lock); while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, @@ -3746,6 +3893,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3822,7 +3970,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3840,31 +3989,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) @@ -3903,11 +4047,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4006,7 +4146,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -4028,38 +4169,86 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4103,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); if (num_bytes - num_allocated < thresh) return 1; } - if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) return 0; return 1; } @@ -4310,7 +4498,7 @@ out: * transaction. */ if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + trans->chunk_bytes_reserved >= (u64)SZ_2M) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4408,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) return nr; } -#define EXTENT_SIZE_PER_ITEM (256 * 1024) +#define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc @@ -4613,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim; - to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, - 16 * 1024 * 1024); + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, BTRFS_RESERVE_FLUSH_ALL)) { @@ -4625,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, 1024 * 1024, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4891,13 +5077,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -5186,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + block_rsv->size = min_t(u64, num_bytes, SZ_512M); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -5340,7 +5522,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5358,10 +5540,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5522,15 +5702,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -5653,41 +5833,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per root over-reserve method. + * o add to the inodes->delalloc_bytes * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the len of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. This is * called in the case that we don't need the metadata AND data reservations @@ -5696,11 +5883,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -5777,19 +5965,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - /* - * No longer have used bytes in this block group, queue - * it for deletion. - */ - if (old_val == 0) { - spin_lock(&info->unused_bgs_lock); - if (list_empty(&cache->bg_list)) { - btrfs_get_block_group(cache); - list_add_tail(&cache->bg_list, - &info->unused_bgs); - } - spin_unlock(&info->unused_bgs_lock); - } } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -5801,6 +5976,22 @@ static int update_block_group(struct btrfs_trans_handle *trans, } spin_unlock(&trans->transaction->dirty_bgs_lock); + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -6065,6 +6256,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = SZ_2M; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = SZ_64K; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6072,7 +6291,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6081,8 +6303,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6095,12 +6323,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6233,7 +6476,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6242,14 +6484,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -6472,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + ret = add_to_free_space_tree(trans, root->fs_info, bytenr, + num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); @@ -6570,7 +6816,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6618,7 +6864,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6641,13 +6887,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -6833,7 +7079,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6843,6 +7089,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -6858,36 +7106,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -6922,6 +7181,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -6955,6 +7216,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6969,7 +7231,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -7095,6 +7357,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7127,8 +7399,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7169,6 +7439,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7185,7 +7459,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any unached bgs and we've alrelady done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7203,6 +7490,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7219,6 +7515,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -7227,11 +7532,20 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } @@ -7280,7 +7594,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; @@ -7415,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7429,8 +7748,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7496,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + num_bytes); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7511,7 +7834,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7520,7 +7844,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -7576,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); - btrfs_set_buffer_uptodate(buf); + set_extent_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; @@ -7722,19 +8047,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -7850,21 +8172,47 @@ reada: } /* - * TODO: Modify related function to add related node/leaf to dirty_extent_root, - * for later qgroup accounting. - * - * Current, this function does nothing. + * These may not be seen by the usual inc/dec ref code so we have to + * add them here. */ +static int record_one_subtree_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_qgroup_extent_record *qrecord; + struct btrfs_delayed_ref_root *delayed_refs; + + qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); + if (!qrecord) + return -ENOMEM; + + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + kfree(qrecord); + spin_unlock(&delayed_refs->lock); + + return 0; +} + static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type; + int i, extent_type, ret; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; + /* We can be called directly from walk_up_proc() */ + if (!root->fs_info->quota_enabled) + return 0; + for (i = 0; i < nr; i++) { btrfs_item_key_to_cpu(eb, &key, i); @@ -7883,6 +8231,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + if (ret) + return ret; } return 0; } @@ -7951,8 +8303,6 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. - * TODO: Modify this function to mark all (including complete shared node) - * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -8030,6 +8380,11 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = record_one_subtree_extent(trans, root, child_bytenr, + root->nodesize); + if (ret) + goto out; } if (level == 0) { @@ -8275,14 +8630,15 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); @@ -8367,10 +8723,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ @@ -8692,7 +9049,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -8831,7 +9188,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) if ((sinfo->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && !force) - min_allocable_bytes = 1 * 1024 * 1024; + min_allocable_bytes = SZ_1M; else min_allocable_bytes = 0; @@ -8880,7 +9237,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); @@ -9363,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->full_stripe_len = btrfs_full_stripe_len(root, &root->fs_info->mapping_tree, start); + set_free_space_tree_thresholds(cache); + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -9374,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); return cache; } @@ -9398,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && @@ -9584,6 +9944,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + add_block_group_free_space(trans, root->fs_info, block_group); + /* already aborted the transaction if it failed. */ next: list_del_init(&block_group->bg_list); } @@ -9614,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; ret = exclude_super_stripes(root, cache); if (ret) { /* @@ -9630,6 +9993,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. @@ -9976,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, unlock_chunks(root); + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) + goto out; + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9991,6 +10366,47 @@ out: return ret; } +struct btrfs_trans_handle * +btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, + const u64 chunk_offset) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = (struct map_lookup *)em->bdev; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, + num_items, 1); +} + /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. @@ -10014,22 +10430,25 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group_cache, bg_list); - space_info = block_group->space_info; list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + if (ret || btrfs_mixed_space_info(space_info)) { btrfs_put_block_group(block_group); continue; } spin_unlock(&fs_info->unused_bgs_lock); - mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->delete_unused_bgs_mutex); /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); if (block_group->reserved || btrfs_block_group_used(&block_group->item) || - block_group->ro) { + block_group->ro || + list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do @@ -10054,8 +10473,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - /* 1 for btrfs_orphan_reserve_metadata() */ - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->key.objectid); if (IS_ERR(trans)) { btrfs_dec_block_group_ro(root, block_group); ret = PTR_ERR(trans); @@ -10135,17 +10554,21 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * until transaction commit to do the actual discard. */ if (trimming) { - WARN_ON(!list_empty(&block_group->bg_list)); - spin_lock(&trans->transaction->deleted_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. + */ list_move(&block_group->bg_list, &trans->transaction->deleted_bgs); - spin_unlock(&trans->transaction->deleted_bgs_lock); + spin_unlock(&fs_info->unused_bgs_lock); btrfs_get_block_group(block_group); } end_trans: btrfs_end_transaction(trans, root); next: - mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } @@ -10370,8 +10793,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* - * Make sure counter is updated before we wake up - * waiters. + * Make sure counter is updated before we wake up waiters. */ smp_mb(); if (waitqueue_active(&root->subv_writers->wait)) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/fs/btrfs/extent-tree.h +++ /dev/null diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3915c9473e94..2e7c97a3f344 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, inode = tree->mapping->host; isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - printk_ratelimited(KERN_DEBUG - "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", caller, btrfs_ino(inode), isize, start, end); } } @@ -131,6 +131,25 @@ struct extent_page_data { unsigned int sync_io:1; }; +static void add_extent_changeset(struct extent_state *state, unsigned bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return; + if (set && (state->state & bits) == bits) + return; + if (!set && (state->state & bits) == 0) + return; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + /* ENOMEM */ + BUG_ON(ret < 0); +} + static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) @@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits); + struct extent_state *state, unsigned *bits, + struct extent_changeset *changeset); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree, state->start = start; state->end = end; - set_state_bits(tree, state, bits); + set_state_bits(tree, state, bits, changeset); node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { @@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake) + unsigned *bits, int wake, + struct extent_changeset *changeset) { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; @@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); + add_extent_changeset(state, bits_to_clear, changeset, 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -671,7 +693,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, + changeset); goto next; } goto search_again; @@ -692,13 +715,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -718,7 +741,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -789,7 +812,7 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; @@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree, u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } + add_extent_changeset(state, bits_to_set, changeset, 1); state->state |= bits_to_set; } @@ -835,7 +859,7 @@ static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { prealloc = alloc_extent_state(mask); BUG_ON(!prealloc); } @@ -873,7 +897,7 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -899,7 +923,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -945,7 +969,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -980,7 +1004,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1008,7 +1032,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1028,7 +1052,7 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; } @@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); + cached_state, mask, NULL); } @@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && (mask & __GFP_WAIT)) { + if (!prealloc && gfpflags_allow_blocking(mask)) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1111,7 +1135,7 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1130,9 +1154,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1171,9 +1195,10 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, + NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1208,7 +1233,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1233,9 +1258,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0); + clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1253,81 +1278,49 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (gfpflags_allow_blocking(mask)) cond_resched(); first_iteration = false; goto again; } /* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) { - return set_extent_bit(tree, start, end, bits, NULL, - NULL, mask); -} - -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); -} - -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); -} - -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); -} + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + BUG_ON(bits & EXTENT_LOCKED); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + changeset); } -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, - NULL, mask); + return __clear_extent_bit(tree, start, end, bits, wake, delete, + cached, mask, NULL); } -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + BUG_ON(bits & EXTENT_LOCKED); -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + changeset); } /* @@ -1335,15 +1328,15 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state) + struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1354,18 +1347,13 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, 0, NULL); -} - int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1375,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) return 1; } -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); -} - -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); -} - -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1401,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1418,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* * helper function to set both pages and extents in the tree writeback */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1437,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* find the first state struct with 'bits' set after 'start', and @@ -1737,7 +1709,7 @@ again: BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); + lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1757,7 +1729,7 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) @@ -1772,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (page_ops == 0) - return 0; + return; if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) mapping_set_error(inode->i_mapping, -EIO); @@ -1806,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, index += ret; cond_resched(); } - return 0; } /* @@ -2078,8 +2049,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); bio_put(bio); @@ -2453,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, /* lots and lots of room for performance fixes in the end_bio funcs */ -int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; @@ -2474,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) ret = ret < 0 ? ret : -EIO; mapping_set_error(page->mapping, ret); } - return 0; } /* @@ -2516,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, bio->bi_error, start, end)) - continue; - + end_extent_writepage(page, bio->bi_error, start, end); end_page_writeback(page); } @@ -3070,8 +3038,12 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (parent_locked) + free_extent_state(cached); + else + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -4259,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state); + lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -4319,8 +4291,8 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; - if ((mask & __GFP_WAIT) && - page->mapping->host->i_size > 16 * 1024 * 1024) { + if (gfpflags_allow_blocking(mask) && + page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { len = end - start + 1; @@ -4469,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4730,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { struct extent_buffer *eb; - unsigned long len; unsigned long num_pages; unsigned long i; - if (!fs_info) { - /* - * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 - */ - len = 4096; - } else { - len = fs_info->tree_root->nodesize; - } - num_pages = num_extent_pages(0, len); + num_pages = num_extent_pages(start, len); eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) @@ -4770,6 +4732,24 @@ err: return NULL; } +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + unsigned long len; + + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + + return __alloc_dummy_extent_buffer(fs_info, start, len); +} + static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -5160,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -int clear_extent_buffer_uptodate(struct extent_buffer *eb) +void clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5173,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) if (page) ClearPageUptodate(page); } - return 0; } -int set_extent_buffer_uptodate(struct extent_buffer *eb) +void set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5188,7 +5167,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) page = eb->pages[i]; SetPageUptodate(page); } - return 0; } int extent_buffer_uptodate(struct extent_buffer *eb) @@ -5527,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } +/* + * The extent buffer bitmap operations are done with byte granularity because + * bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +/* + * eb_bitmap_offset() - calculate the page and offset of the byte containing the + * given bit number + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains the + * given bit number + * @page_offset: return offset into the page given by page_index + * + * This helper hides the ugliness of finding the byte in an extent buffer which + * contains a given bit. + */ +static inline void eb_bitmap_offset(struct extent_buffer *eb, + unsigned long start, unsigned long nr, + unsigned long *page_index, + size_t *page_offset) +{ + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t byte_offset = BIT_BYTE(nr); + size_t offset; + + /* + * The byte we want is the offset of the extent buffer + the offset of + * the bitmap item in the extent buffer + the offset of the byte in the + * bitmap item. + */ + offset = start_offset + start + byte_offset; + + *page_index = offset >> PAGE_CACHE_SHIFT; + *page_offset = offset & (PAGE_CACHE_SIZE - 1); +} + +/** + * extent_buffer_test_bit - determine whether a bit in a bitmap item is set + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test + */ +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long nr) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + + eb_bitmap_offset(eb, start, nr, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); +} + +/** + * extent_buffer_bitmap_set - set an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set + */ +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_set) { + kaddr[offset] |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] |= mask_to_set; + } +} + + +/** + * extent_buffer_bitmap_clear - clear an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear + */ +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_clear) { + kaddr[offset] &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] &= ~mask_to_clear; + } +} + static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; @@ -5566,13 +5693,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus src_offset %lu move " + "len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus dst_offset %lu move " + "len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); } @@ -5612,13 +5741,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move " + "len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move " + "len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c668f36898d3..0377413bd4b9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include "ulist.h" /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) @@ -18,6 +19,7 @@ #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) +#define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -161,6 +163,17 @@ struct extent_buffer { #endif }; +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist *range_changed; +}; + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -186,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); @@ -208,33 +223,105 @@ void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + GFP_NOFS); +} + +static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); +} + +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); @@ -263,8 +350,10 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -309,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long pos); +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); +void set_extent_buffer_uptodate(struct extent_buffer *eb); +void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); @@ -338,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, int mirror_num); int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset); -int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558430..a67e1c828d0f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; + path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (search_commit) { path->skip_locking = 1; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8c6f247ba81d..83d7859d7619 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -756,8 +756,16 @@ next_slot: } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid > ino || - key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY) { + ASSERT(del_nr == 0); + path->slots[0]++; + goto next_slot; + } + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) break; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -776,8 +784,8 @@ next_slot: btrfs_file_extent_inline_len(leaf, path->slots[0], fi); } else { - WARN_ON(1); - extent_end = search_start; + /* can't happen */ + BUG(); } /* @@ -847,7 +855,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +933,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1212,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1239,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1256,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1283,7 +1291,8 @@ out: * on error we return an unlocked page and the error value * on success we return a locked page and 0 */ -static int prepare_uptodate_page(struct page *page, u64 pos, +static int prepare_uptodate_page(struct inode *inode, + struct page *page, u64 pos, bool force_uptodate) { int ret = 0; @@ -1298,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos, unlock_page(page); return -EIO; } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return -EAGAIN; + } } return 0; } @@ -1316,6 +1329,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int faili; for (i = 0; i < num_pages; i++) { +again: pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); if (!pages[i]) { @@ -1325,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, } if (i == 0) - err = prepare_uptodate_page(pages[i], pos, + err = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); - if (i == num_pages - 1) - err = prepare_uptodate_page(pages[i], + if (!err && i == num_pages - 1) + err = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); if (err) { page_cache_release(pages[i]); + if (err == -EAGAIN) { + err = 0; + goto again; + } faili = i - 1; goto fail; } @@ -1376,7 +1394,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, 0, cached_state); + start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && @@ -1469,7 +1487,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1485,8 +1502,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), @@ -1510,12 +1525,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); - if (ret == -ENOSPC && - (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC))) { + + if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) { ret = check_can_nocow(inode, pos, &write_bytes); + if (ret < 0) + break; if (ret > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ only_release_metadata = true; /* * our prealloc extent may be smaller than @@ -1524,20 +1544,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = 0; - } else { - ret = -ENOSPC; + goto reserve_metadata; } } - - if (ret) + ret = btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, - reserve_bytes); + btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1603,12 +1622,17 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - if (only_release_metadata) + if (only_release_metadata) { btrfs_delalloc_release_metadata(inode, release_bytes); - else - btrfs_delalloc_release_space(inode, + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, release_bytes); + } } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; @@ -1660,7 +1684,7 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, release_bytes); + btrfs_delalloc_release_space(inode, pos, release_bytes); } } @@ -1868,8 +1892,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; - const u64 len = end - start + 1; + u64 len; + /* + * The range length can be represented by u64, we have to do the typecasts + * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() + */ + len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); /* @@ -2057,8 +2086,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } } if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, - end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { btrfs_end_transaction(trans, root); goto out; @@ -2266,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - int rsv_count; + unsigned int rsv_count; bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; @@ -2370,7 +2398,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, lockend); /* @@ -2488,6 +2516,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). @@ -2541,17 +2582,61 @@ out_only_mutex: return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. + */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2567,11 +2652,12 @@ static long btrfs_fallocate(struct file *file, int mode, return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; mutex_lock(&inode->i_mutex); @@ -2579,6 +2665,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. + */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); @@ -2612,7 +2705,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state); + locked_end, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -2637,10 +2730,10 @@ static long btrfs_fallocate(struct file *file, int mode, } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2653,57 +2746,82 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, - NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans, root); - else - ret = btrfs_end_transaction(trans, - root); + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); + break; } + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - if (ret < 0) - break; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. + */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } @@ -2734,7 +2852,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); while (start < inode->i_size) { @@ -2816,6 +2934,9 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif + .copy_file_range = btrfs_copy_file_range, + .clone_file_range = btrfs_clone_file_range, + .dedupe_file_range = btrfs_dedupe_file_range, }; void btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abe3a66bd3ba..8f835bfa1bdd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -30,7 +30,7 @@ #include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +#define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { u64 start; @@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & - ~(__GFP_FS | __GFP_HIGHMEM)); + mapping_gfp_constraint(inode->i_mapping, + ~(__GFP_FS | __GFP_HIGHMEM))); return inode; } @@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "BTRFS: space cache generation " - "(%Lu) does not match inode (%Lu)\n", *gen, - generation); + btrfs_err_rl(io_ctl->root->fs_info, + "space cache generation (%llu) does not match inode (%llu)", + *gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " - "space cache\n"); + btrfs_err_rl(io_ctl->root->fs_info, + "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -891,7 +891,7 @@ out: spin_unlock(&block_group->lock); ret = 0; - btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", block_group->key.objectid); } @@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root, static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - + list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; @@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode) static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); + list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); - } } static void noinline_for_stack @@ -1215,7 +1209,7 @@ out: * @offset - the offset for the key we'll insert * * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, + * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, @@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - if (size < 1024 * 1024 * 1024) + if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else - max_bytes = MAX_CACHE_BYTES_PER_GIG * - div_u64(size, 1024 * 1024 * 1024); + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1730,7 +1723,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, - u64 *bytes) + u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; @@ -1738,11 +1731,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, unsigned long next_zero; unsigned long extent_bits; + /* + * Skip searching the bitmap if we don't have a contiguous section that + * is large enough for this allocation. + */ + if (for_alloc && + bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; @@ -1762,6 +1770,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; return -1; } @@ -1813,7 +1822,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (entry->bitmap) { u64 size = *bytes; - ret = search_bitmap(ctl, entry, &tmp, &size); + ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; @@ -1874,7 +1883,8 @@ again: search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); if (ret < 0 || search_start != *offset) return -EINVAL; @@ -1919,7 +1929,7 @@ again: search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); + &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; @@ -1943,6 +1953,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -1951,12 +1967,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, + block_group)) + forced = true; +#endif /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (ctl->free_extents < ctl->extents_thresh) { + if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want @@ -1986,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_space_op = { +static const struct btrfs_free_space_op free_space_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2459,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ - ctl->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); + ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* @@ -2661,7 +2683,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes); + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { if (search_bytes > *max_extent_size) *max_extent_size = search_bytes; @@ -2775,6 +2797,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; + unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; @@ -2784,6 +2807,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. + */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { @@ -2791,13 +2821,19 @@ again: BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; break; } + if (next_zero - i > max_bits) + max_bits = next_zero - i; i = next_zero; } - if (!found_bits) + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; + } if (!total_found) { start = i; @@ -2928,7 +2964,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; + struct btrfs_free_space *entry = NULL; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); @@ -2939,8 +2975,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * The bitmap that covers offset won't be in the list unless offset * is just its start offset. */ - entry = list_first_entry(bitmaps, struct btrfs_free_space, list); - if (entry->offset != bitmap_offset) { + if (!list_empty(bitmaps)) + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + + if (!entry || entry->offset != bitmap_offset) { entry = tree_search_offset(ctl, bitmap_offset, 1, 0); if (entry && list_empty(&entry->list)) list_add(&entry->list, bitmaps); @@ -3056,6 +3094,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; + cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } @@ -3223,7 +3262,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, } bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3376,7 +3415,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) u64 count = 1; int ret; - ret = search_bitmap(ctl, entry, &offset, &count); + ret = search_bitmap(ctl, entry, &offset, &count, true); /* Logic error; Should be empty if it can't find anything */ ASSERT(!ret); @@ -3532,6 +3571,7 @@ again: spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; + info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) @@ -3559,6 +3599,7 @@ again: } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); @@ -3602,7 +3643,7 @@ have_info: bit_off = offset; bit_bytes = ctl->unit; - ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a16a029ad3b1..33178c490ace 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -23,6 +23,7 @@ struct btrfs_free_space { struct rb_node offset_index; u64 offset; u64 bytes; + u64 max_extent_size; unsigned long *bitmap; struct list_head list; }; @@ -36,7 +37,7 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; - struct btrfs_free_space_op *op; + const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c new file mode 100644 index 000000000000..393e36bd5845 --- /dev/null +++ b/fs/btrfs/free-space-tree.c @@ -0,0 +1,1591 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "free-space-tree.h" +#include "transaction.h" + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +{ + u32 bitmap_range; + size_t bitmap_size; + u64 num_bitmaps, total_bitmap_size; + + /* + * We convert to bitmaps when the disk space required for using extents + * exceeds that required for using bitmaps. + */ + bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, + bitmap_range); + bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; + total_bitmap_size = num_bitmaps * bitmap_size; + cache->bitmap_high_thresh = div_u64(total_bitmap_size, + sizeof(struct btrfs_item)); + + /* + * We allow for a small buffer between the high threshold and low + * threshold to avoid thrashing back and forth between the two formats. + */ + if (cache->bitmap_high_thresh > 100) + cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; + else + cache->bitmap_low_thresh = 0; +} + +static int add_new_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_info); + btrfs_set_free_space_extent_count(leaf, info, 0); + btrfs_set_free_space_flags(leaf, info, 0); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret != 0) { + btrfs_warn(fs_info, "missing free space info for %llu\n", + block_group->key.objectid); + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + return btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_free_space_info); +} + +/* + * btrfs_search_slot() but we're looking for the greatest key less than the + * passed key. + */ +static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + int ret; + + ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); + if (ret < 0) + return ret; + + if (ret == 0) { + ASSERT(0); + return -EIO; + } + + if (p->slots[0] == 0) { + ASSERT(0); + return -EIO; + } + p->slots[0]--; + + return 0; +} + +static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +{ + return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); +} + +static unsigned long *alloc_bitmap(u32 bitmap_size) +{ + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} + +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + char *bitmap_cursor; + u64 start, end; + u64 bitmap_range, i; + u32 bitmap_size, flags, expected_extent_count; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + u64 first, last; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + first = div_u64(found_key.objectid - start, + block_group->sectorsize); + last = div_u64(found_key.objectid + found_key.offset - start, + block_group->sectorsize); + bitmap_set(bitmap, first, last - first); + + extent_count++; + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + bitmap_cursor = (char *)bitmap; + bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + i = start; + while (i < end) { + unsigned long ptr; + u64 extent_size; + u32 data_size; + + extent_size = min(end - i, bitmap_range); + data_size = free_space_bitmap_size(extent_size, + block_group->sectorsize); + + key.objectid = i; + key.type = BTRFS_FREE_SPACE_BITMAP_KEY; + key.offset = extent_size; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + data_size); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + i += extent_size; + bitmap_cursor += data_size; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + u64 start, end; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 offset; + u32 bitmap_size, flags, expected_extent_count; + int prev_bit = 0, bit, bitnr; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + unsigned long ptr; + char *bitmap_cursor; + u32 bitmap_pos, data_size; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + bitmap_pos = div_u64(found_key.objectid - start, + block_group->sectorsize * + BITS_PER_BYTE); + bitmap_cursor = ((char *)bitmap) + bitmap_pos; + data_size = free_space_bitmap_size(found_key.offset, + block_group->sectorsize); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + read_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + offset = start; + bitnr = 0; + while (offset < end) { + bit = !!test_bit(bitnr, bitmap); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = offset - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + bitnr++; + } + if (prev_bit == 1) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = end - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int update_free_space_extent_count(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + int new_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + u32 extent_count; + int ret = 0; + + if (new_extents == 0) + return 0; + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + extent_count += new_extents; + btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + + if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count > block_group->bitmap_high_thresh) { + ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, + path); + } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count < block_group->bitmap_low_thresh) { + ret = convert_free_space_to_extents(trans, fs_info, block_group, + path); + } + +out: + return ret; +} + +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 found_start, found_end; + unsigned long ptr, i; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(offset >= found_start && offset < found_end); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + i = div_u64(offset - found_start, block_group->sectorsize); + return !!extent_buffer_test_bit(leaf, ptr, i); +} + +static void free_space_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + int bit) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 end = *start + *size; + u64 found_start, found_end; + unsigned long ptr, first, last; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(*start >= found_start && *start < found_end); + ASSERT(end > found_start); + + if (end > found_end) + end = found_end; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + first = div_u64(*start - found_start, block_group->sectorsize); + last = div_u64(end - found_start, block_group->sectorsize); + if (bit) + extent_buffer_bitmap_set(leaf, ptr, first, last - first); + else + extent_buffer_bitmap_clear(leaf, ptr, first, last - first); + btrfs_mark_buffer_dirty(leaf); + + *size -= end - *start; + *start = end; +} + +/* + * We can't use btrfs_next_item() in modify_free_space_bitmap() because + * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy + * tree walking in btrfs_next_leaf() anyways because we know exactly what we're + * looking for. + */ +static int free_space_next_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p) +{ + struct btrfs_key key; + + if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { + p->slots[0]++; + return 0; + } + + btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); + btrfs_release_path(p); + + key.objectid += key.offset; + key.type = (u8)-1; + key.offset = (u64)-1; + + return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); +} + +/* + * If remove is 1, then we are removing free space, thus clearing bits in the + * bitmap. If remove is 0, then we are adding free space, thus setting bits in + * the bitmap. + */ +static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size, int remove) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 end = start + size; + u64 cur_start, cur_size; + int prev_bit, next_bit; + int new_extents; + int ret; + + /* + * Read the bit for the block immediately before the extent of space if + * that block is within the block group. + */ + if (start > block_group->key.objectid) { + u64 prev_block = start - block_group->sectorsize; + + key.objectid = prev_block; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = free_space_test_bit(block_group, path, prev_block); + + /* The previous block may have been in the previous bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (start >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + } else { + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = -1; + } + + /* + * Iterate over all of the bitmaps overlapped by the extent of space, + * clearing/setting bits as required. + */ + cur_start = start; + cur_size = size; + while (1) { + free_space_set_bits(block_group, path, &cur_start, &cur_size, + !remove); + if (cur_size == 0) + break; + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + /* + * Read the bit for the block immediately after the extent of space if + * that block is within the block group. + */ + if (end < block_group->key.objectid + block_group->key.offset) { + /* The next block may be in the next bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (end >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + next_bit = free_space_test_bit(block_group, path, end); + } else { + next_bit = -1; + } + + if (remove) { + new_extents = -1; + if (prev_bit == 1) { + /* Leftover on the left. */ + new_extents++; + } + if (next_bit == 1) { + /* Leftover on the right. */ + new_extents++; + } + } else { + new_extents = 1; + if (prev_bit == 1) { + /* Merging with neighbor on the left. */ + new_extents--; + } + if (next_bit == 1) { + /* Merging with neighbor on the right. */ + new_extents--; + } + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +static int remove_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = -1; + int ret; + + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(start >= found_start && end <= found_end); + + /* + * Okay, now that we've found the free space extent which contains the + * free space that we are removing, there are four cases: + * + * 1. We're using the whole extent: delete the key we found and + * decrement the free space extent count. + * 2. We are using part of the extent starting at the beginning: delete + * the key we found and insert a new key representing the leftover at + * the end. There is no net change in the number of extents. + * 3. We are using part of the extent ending at the end: delete the key + * we found and insert a new key representing the leftover at the + * beginning. There is no net change in the number of extents. + * 4. We are using part of the extent in the middle: delete the key we + * found and insert two new keys representing the leftovers on each + * side. Where we used to have one extent, we now have two, so increment + * the extent count. We may need to convert the block group to bitmaps + * as a result. + */ + + /* Delete the existing key (cases 1-4). */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* Add a key for leftovers at the beginning (cases 3 and 4). */ + if (start > found_start) { + key.objectid = found_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = start - found_start; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + /* Add a key for leftovers at the end (cases 2 and 4). */ + if (end < found_end) { + key.objectid = end; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = found_end - end; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 1); + } else { + return remove_free_space_extent(trans, fs_info, block_group, + path, start, size); + } +} + +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, + start, size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +static int add_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key, new_key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = 1; + int ret; + + /* + * We are adding a new extent of free space, but we need to merge + * extents. There are four cases here: + * + * 1. The new extent does not have any immediate neighbors to merge + * with: add the new key and increment the free space extent count. We + * may need to convert the block group to bitmaps as a result. + * 2. The new extent has an immediate neighbor before it: remove the + * previous key and insert a new key combining both of them. There is no + * net change in the number of extents. + * 3. The new extent has an immediate neighbor after it: remove the next + * key and insert a new key combining both of them. There is no net + * change in the number of extents. + * 4. The new extent has immediate neighbors on both sides: remove both + * of the keys and insert a new key combining all of them. Where we used + * to have two extents, we now have one, so decrement the extent count. + */ + + new_key.objectid = start; + new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + new_key.offset = size; + + /* Search for a neighbor on the left. */ + if (start == block_group->key.objectid) + goto right; + key.objectid = start - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto right; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT(found_start < start && found_end <= start); + + /* + * Delete the neighbor on the left and absorb it into the new key (cases + * 2 and 4). + */ + if (found_end == start) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.objectid = found_start; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +right: + /* Search for a neighbor on the right. */ + if (end == block_group->key.objectid + block_group->key.offset) + goto insert; + key.objectid = end; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto insert; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT((found_start < start && found_end <= start) || + (found_start >= end && found_end > end)); + + /* + * Delete the neighbor on the right and absorb it into the new key + * (cases 3 and 4). + */ + if (found_start == end) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +insert: + /* Insert the new key (cases 1-4). */ + ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); + if (ret) + goto out; + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 0); + } else { + return add_free_space_extent(trans, fs_info, block_group, path, + start, size); + } +} + +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start, + size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +/* + * Populate the free space tree by walking the extent tree. Operations on the + * extent tree that happen as a result of writes to the free space tree will go + * through the normal add/remove hooks. + */ +static int populate_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path, *path2; + struct btrfs_key key; + u64 start, end; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + path2 = btrfs_alloc_path(); + if (!path2) { + btrfs_free_path(path); + return -ENOMEM; + } + + ret = add_new_free_space_info(trans, fs_info, block_group, path2); + if (ret) + goto out; + + mutex_lock(&block_group->free_space_lock); + + /* + * Iterate through all of the extent and metadata items in this block + * group, adding the free space between them and the free space at the + * end. Note that EXTENT_ITEM and METADATA_ITEM are less than + * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's + * contained in. + */ + key.objectid = block_group->key.objectid; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); + if (ret < 0) + goto out_locked; + ASSERT(ret == 0); + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + if (key.objectid >= end) + break; + + if (start < key.objectid) { + ret = __add_to_free_space_tree(trans, fs_info, + block_group, + path2, start, + key.objectid - + start); + if (ret) + goto out_locked; + } + start = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + start += fs_info->tree_root->nodesize; + else + start += key.offset; + } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + if (key.objectid != block_group->key.objectid) + break; + } + + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + goto out_locked; + if (ret) + break; + } + if (start < end) { + ret = __add_to_free_space_tree(trans, fs_info, block_group, + path2, start, end - start); + if (ret) + goto out_locked; + } + + ret = 0; +out_locked: + mutex_unlock(&block_group->free_space_lock); +out: + btrfs_free_path(path2); + btrfs_free_path(path); + return ret; +} + +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root; + struct btrfs_block_group_cache *block_group; + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + fs_info->creating_free_space_tree = 1; + free_space_root = btrfs_create_tree(trans, fs_info, + BTRFS_FREE_SPACE_TREE_OBJECTID); + if (IS_ERR(free_space_root)) { + ret = PTR_ERR(free_space_root); + goto abort; + } + fs_info->free_space_root = free_space_root; + + node = rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + ret = populate_free_space_tree(trans, fs_info, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->creating_free_space_tree = 0; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + fs_info->creating_free_space_tree = 0; + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int clear_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int nr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + nr = btrfs_header_nritems(path->nodes[0]); + if (!nr) + break; + + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root = fs_info->free_space_root; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->free_space_root = NULL; + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key); + if (ret) + goto abort; + + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); + clean_tree_block(trans, tree_root->fs_info, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, free_space_root, free_space_root->node, + 0, 1); + + free_extent_buffer(free_space_root->node); + free_extent_buffer(free_space_root->commit_root); + kfree(free_space_root); + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + u64 start, end; + int ret; + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + block_group->needs_free_space = 0; + + ret = add_new_free_space_info(trans, fs_info, block_group, path); + if (ret) + return ret; + + return __add_to_free_space_tree(trans, fs_info, block_group, path, + block_group->key.objectid, + block_group->key.offset); +} + +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path = NULL; + int ret = 0; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + mutex_lock(&block_group->free_space_lock); + if (!block_group->needs_free_space) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = __add_block_group_free_space(trans, fs_info, block_group, path); + +out: + btrfs_free_path(path); + mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + u64 start, end; + int done = 0, nr; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + if (block_group->needs_free_space) { + /* We never added this block group to the free space tree. */ + return 0; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + nr++; + path->slots[0]--; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || + found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int prev_bit = 0, bit; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 end, offset; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(block_group, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + total_found += add_new_free_space(block_group, + fs_info, + extent_start, + offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + } + } + if (prev_bit == 1) { + total_found += add_new_free_space(block_group, fs_info, + extent_start, end); + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + u64 end; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + total_found += add_new_free_space(block_group, fs_info, + key.objectid, + key.objectid + key.offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_free_space_info *info; + struct btrfs_path *path; + u32 extent_count, flags; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Just like caching_thread() doesn't want to deadlock on the extent + * tree, we don't want to deadlock on the free space tree. + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + flags = btrfs_free_space_flags(path->nodes[0], info); + + /* + * We left path pointing to the free space info item, so now + * load_free_space_foo can just iterate through the free space tree from + * there. + */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) + ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + else + ret = load_free_space_extents(caching_ctl, path, extent_count); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h new file mode 100644 index 000000000000..54ffced3bce8 --- /dev/null +++ b/fs/btrfs/free-space-tree.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_TREE +#define __BTRFS_FREE_SPACE_TREE + +/* + * The default size for new free space bitmap items. The last bitmap in a block + * group may be truncated, and none of the free space tree code assumes that + * existing bitmaps are this size. + */ +#define BTRFS_FREE_SPACE_BITMAP_SIZE 256 +#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); + +/* Exposed for testing. */ +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow); +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset); + +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 265e03c73f4d..be4d22a5022f 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT); + btrfs_std_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d4a582ac3f73..8b57c17b3fb3 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -48,7 +48,7 @@ static int caching_kthread(void *data) /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.offset = 0; @@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } } -#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) #define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) /* @@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_ino_op = { +static const struct btrfs_free_space_op free_ino_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, return false; } -static struct btrfs_free_space_op pinned_free_ino_op = { +static const struct btrfs_free_space_op pinned_free_ino_op = { .recalc_thresholds = pinned_recalc_thresholds, .use_bitmap = pinned_use_bitmap, }; @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, prealloc); + btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, prealloc); + btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d73e80..247830107686 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, @@ -310,6 +316,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + /* + * Don't forget to free the reserved space, as for inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ + btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -407,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode, unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; + unsigned long max_compressed = SZ_128K; + unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = root->fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && + if ((end - start + 1) < SZ_16K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -423,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode, again: will_compress = 0; nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -937,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode, disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && + if (num_bytes < SZ_64K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -1096,8 +1109,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; + /* + * atomic_sub_return implies a barrier for waitqueue_active + */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < - 5 * 1024 * 1024 && + 5 * SZ_1M && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1122,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1024; + int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1138,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else - cur_end = min(end, start + 512 * 1024 - 1); + cur_end = min(end, start + SZ_512K - 1); async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); @@ -1294,8 +1310,14 @@ next_slot: num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid > ino || - found_key.type > BTRFS_EXTENT_DATA_KEY || + if (found_key.objectid > ino) + break; + if (WARN_ON_ONCE(found_key.objectid < ino) || + found_key.type < BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + goto next_slot; + } + if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; @@ -1766,7 +1788,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space_noquota(inode, + state->start, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1861,15 +1884,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; + enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; int ret = 0; int skip_sum; - int metadata = 0; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(inode)) - metadata = 2; + metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); @@ -1972,7 +1995,7 @@ again: page_start = page_offset(page); page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ @@ -1989,7 +2012,8 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2115,7 +2139,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); + btrfs_ino(inode), file_pos, + ram_bytes, &ins); + /* + * Release the reserved range from inode dirty range map, as it is + * already moved into delayed_ref_head + */ + btrfs_qgroup_release_data(inode, file_pos, ram_bytes); out: btrfs_free_path(path); @@ -2458,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - 0, &cached); + &cached); ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); if (ordered) { @@ -2573,7 +2603,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, - new->file_pos, 0); /* start - extent_offset */ + new->file_pos); /* start - extent_offset */ if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_free_path; @@ -2599,7 +2629,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) return; list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); kfree(old); } kfree(new); @@ -2824,6 +2853,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + /* + * For mwrite(mmap + memset to write) case, we still reserve + * space for NOCOW range. + * As NOCOW won't cause a new delayed ref, just free the space + */ + btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -2843,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); + &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, @@ -3018,8 +3055,6 @@ static int __readpage_endio_check(struct inode *inode, char *kaddr; u32 csum_expected; u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3032,9 +3067,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - if (__ratelimit(&_rs)) - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3078,55 +3112,47 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; - - spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; down_read(&fs_info->delayed_iput_sem); - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } - + spin_unlock(&fs_info->delayed_iput_lock); up_read(&root->fs_info->delayed_iput_sem); } @@ -3323,7 +3349,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = -ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -3522,10 +3548,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int scanned = 0; if (!xattr_access) { - xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); - xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)); + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; @@ -3746,6 +3772,7 @@ cache_acl: break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; break; default: @@ -4018,9 +4045,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, */ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - int ret; /* * 1 for the possible orphan item @@ -4029,27 +4054,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the inode ref * 1 for the inode */ - trans = btrfs_start_transaction(root, 5); - if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) - return trans; - - if (PTR_ERR(trans) == -ENOSPC) { - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return trans; - ret = btrfs_cond_migrate_bytes(root->fs_info, - &root->fs_info->trans_block_rsv, - num_bytes, 5); - if (ret) { - btrfs_end_transaction(trans, root); - return ERR_PTR(ret); - } - trans->block_rsv = &root->fs_info->trans_block_rsv; - trans->bytes_reserved = num_bytes; - } - return trans; + return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4217,6 +4222,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } +static int truncate_inline_extent(struct inode *inode, + struct btrfs_path *path, + struct btrfs_key *found_key, + const u64 item_end, + const u64 new_size) +{ + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u32 size = (u32)(new_size - found_key->offset); + struct btrfs_root *root = BTRFS_I(inode)->root; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { + loff_t offset = new_size; + loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + + /* + * Zero out the remaining of the last page of our inline extent, + * instead of directly truncating our inline extent here - that + * would be much more complex (decompressing all the data, then + * compressing the truncated data, which might be bigger than + * the size of the inline extent, resize the extent, etc). + * We release the path because to get the page we might need to + * read the extent item from disk (data not in the page cache). + */ + btrfs_release_path(path); + return btrfs_truncate_page(inode, offset, page_end - offset, 0); + } + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - new_size); + + return 0; +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4270,7 +4316,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -4301,7 +4347,7 @@ search_again: * up a huge file in a single leaf. Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { if (btrfs_should_end_transaction(trans, root)) { err = -EAGAIN; goto error; @@ -4411,27 +4457,40 @@ search_again: * special encodings */ if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - - new_size); /* - * update the ram bytes to properly reflect - * the new size of our item + * Need to release path in order to truncate a + * compressed extent. So delete any accumulated + * extent items so far. */ - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root, path, size, 1); + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE && pending_del_nr) { + err = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, + root, + err); + goto error; + } + pending_del_nr = 0; + } + + err = truncate_inline_extent(inode, path, + &found_key, + item_end, + new_size); + if (err) { + btrfs_abort_transaction(trans, + root, err); + goto error; + } } else if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); + inode_sub_bytes(inode, item_end + 1 - new_size); } } delete: @@ -4461,7 +4520,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, @@ -4531,7 +4590,7 @@ error: btrfs_free_path(path); - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; @@ -4575,14 +4634,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, + round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, + round_down(from, PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE); ret = -ENOMEM; goto out; } @@ -4605,7 +4667,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -4650,7 +4712,8 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -4735,7 +4798,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); ordered = btrfs_lookup_ordered_range(inode, hole_start, block_end - hole_start); @@ -5047,7 +5110,19 @@ static void evict_inode_truncate_pages(struct inode *inode) end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, 0, &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref. + * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidatepage, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state->state & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, start, end - start + 1); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -5228,7 +5303,6 @@ void btrfs_evict_inode(struct inode *inode) no_delete: btrfs_remove_delayed_node(inode); clear_inode(inode); - return; } /* @@ -5677,7 +5751,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -6268,9 +6342,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - if (!new_valid_dev(rdev)) - return -EINVAL; - /* * 2 for inode item and ref * 2 for dir items @@ -6408,7 +6479,7 @@ out_unlock_inode: static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); u64 index; @@ -6434,6 +6505,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; goto fail; } @@ -6467,9 +6539,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); fail: + if (trans) + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -6614,7 +6687,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, } static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, + struct page *page, size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { @@ -6711,7 +6784,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -6810,8 +6883,7 @@ next: if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, + ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) { err = ret; @@ -7307,7 +7379,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, cached_state); + cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -7335,25 +7407,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; + break; } cond_resched(); @@ -7409,10 +7477,27 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; +static void adjust_dio_outstanding_extents(struct inode *inode, + struct btrfs_dio_data *dio_data, + const u64 len) +{ + unsigned num_extents; + + num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (dio_data->outstanding_extents) { + dio_data->outstanding_extents -= num_extents; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents += num_extents; + spin_unlock(&BTRFS_I(inode)->lock); + } +} static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -7449,8 +7534,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) - return -ENOTBLK; + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, + create)) { + ret = -ENOTBLK; + goto err; + } em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { @@ -7568,22 +7656,11 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. - */ - if (dio_data->outstanding_extents) { - (dio_data->outstanding_extents)--; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } - - btrfs_free_reserved_data_space(inode, len); + adjust_dio_outstanding_extents(inode, dio_data, len); + btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -7607,8 +7684,17 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); +err: if (dio_data) current->journal_info = dio_data; + /* + * Compensate the delalloc release we do in btrfs_direct_IO() when we + * write less data then expected, so that we don't underflow our inode's + * outstanding extents counter. + */ + if (create && dio_data) + adjust_dio_outstanding_extents(inode, dio_data, len); + return ret; } @@ -7897,22 +7983,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -7925,13 +8011,22 @@ out_test: * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); @@ -8239,6 +8334,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8267,24 +8377,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). - */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() @@ -8371,7 +8472,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, count); + ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; dio_data.outstanding_extents = div64_u64(count + @@ -8384,6 +8485,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8400,10 +8503,23 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, - dio_data.reserve); + btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); } out: @@ -8439,15 +8555,28 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - + struct inode *inode = page->mapping->host; + int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } + + /* + * If we are under memory pressure we will call this directly from the + * VM, we need to make sure we have the inode referenced for the ordered + * extent. If not just return like we didn't do anything. + */ + if (!igrab(inode)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + btrfs_add_delayed_iput(inode); + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -8519,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* @@ -8557,11 +8686,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, + lock_extent_bits(tree, page_start, page_end, &cached_state); } } + /* + * Qgroup reserved space handler + * Page here will be either + * 1) Already written to disk + * In this case, its reserved space is released from data rsv map + * and will be freed by delayed_ref handler finally. + * So even we call qgroup_free_data(), it won't decrease reserved + * space. + * 2) Not written to disk + * This means the reserved space should be freed here. + */ + btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8612,7 +8753,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; sb_start_pagefault(inode->i_sb); - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8631,8 +8776,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) again: lock_page(page); size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; if ((page->mapping != inode->i_mapping) || (page_start >= size)) { @@ -8641,7 +8784,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); /* @@ -8709,7 +8852,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -8915,6 +9058,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -8939,6 +9083,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -8998,6 +9143,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: @@ -9042,15 +9188,14 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; @@ -9078,13 +9223,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9308,14 +9446,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; - if (delalloc_work->wait) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } if (delalloc_work->delay_iput) btrfs_add_delayed_iput(inode); @@ -9325,18 +9459,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) } struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput) + int delay_iput) { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->wait = wait; work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, @@ -9348,7 +9481,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* @@ -9384,7 +9517,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + work = btrfs_alloc_delalloc_work(inode, delay_iput); if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); @@ -9526,9 +9659,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, /* * 2 items for inode item and ref * 2 items for dir items + * 1 item for updating parent inode item + * 1 item for the inline extent item * 1 item for xattr if selinux is on */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 7); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -9559,10 +9694,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - goto out_unlock_inode; - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -9595,10 +9726,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); + /* + * Last step, add directory indexes for our symlink inode. This is the + * last step to avoid extra cleanup of these indexes if an error happens + * elsewhere above. + */ + if (!err) + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -9634,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 cur_offset = start; u64 i_size; u64 cur_bytes; + u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; @@ -9648,8 +9788,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. + */ + cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { @@ -9658,6 +9805,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } + last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, @@ -9875,7 +10023,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .setattr = btrfs_setattr, .mknod = btrfs_mknod, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -9904,7 +10052,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, @@ -9952,7 +10100,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -9966,7 +10114,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, @@ -9975,13 +10123,12 @@ static const struct inode_operations btrfs_special_inode_operations = { }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .update_time = btrfs_update_time, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d20f3b1cab0..2a47a3148ec8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -655,22 +655,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_NOFS); + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { + ret = -ENOMEM; + goto free_pending; + } + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto out; + goto dec_and_free; btrfs_wait_ordered_extents(root, -1); - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - goto out; - } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -686,7 +692,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto free; + goto dec_and_free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -737,11 +743,14 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -free: - kfree(pending_snapshot); -out: +dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); +free_pending: + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); + return ret; } @@ -992,7 +1001,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) u64 end = start + len - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, 0, &cached); + lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); @@ -1016,7 +1025,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) + (em->block_len > SZ_128K && next->block_len > SZ_128K)) ret = false; free_extent_map(next); @@ -1120,7 +1129,8 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; @@ -1139,7 +1149,7 @@ again: page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, @@ -1199,7 +1209,7 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state); + page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, @@ -1210,7 +1220,8 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1235,7 +1246,9 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1258,9 +1271,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; - u64 new_align = ~((u64)128 * 1024 - 1); + u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; if (isize == 0) @@ -1277,7 +1290,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } if (extent_thresh == 0) - extent_thresh = 256 * 1024; + extent_thresh = SZ_256K; /* * if we were not given a file, allocate a readahead @@ -1309,7 +1322,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_than) { ret = find_new_extents(root, inode, newer_than, - &newer_off, 64 * 1024); + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; /* @@ -1342,7 +1355,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + btrfs_debug(root->fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -1399,9 +1412,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); - ret = find_new_extents(root, inode, - newer_than, &newer_off, - 64 * 1024); + ret = find_new_extents(root, inode, newer_than, + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; @@ -1567,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size + new_size; } - if (new_size < 256 * 1024 * 1024) { + if (new_size < SZ_256M) { ret = -EINVAL; goto out_free; } @@ -1579,7 +1591,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -2081,7 +2093,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", + btrfs_err(info, "could not find root %llu", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2156,7 +2168,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, struct inode *inode; int ret; size_t buf_size; - const size_t buf_limit = 16 * 1024 * 1024; + const size_t buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2221,7 +2233,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + btrfs_err(info, "could not find root %llu", tree_id); ret = -ENOENT; goto out; } @@ -2699,7 +2711,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; @@ -2711,7 +2722,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } @@ -2959,7 +2970,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, flush_dcache_page(dst_page); if (memcmp(addr, dst_addr, cmp_len)) - ret = BTRFS_SAME_DATA_DIFFERS; + ret = -EBADE; kunmap_atomic(addr); kunmap_atomic(dst_addr); @@ -3093,55 +3104,18 @@ out_unlock: return ret; } -#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +#define BTRFS_MAX_DEDUPE_LEN SZ_16M -static long btrfs_ioctl_file_extent_same(struct file *file, - struct btrfs_ioctl_same_args __user *argp) +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff) { - struct btrfs_ioctl_same_args *same = NULL; - struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - unsigned long size; + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - bool is_admin = capable(CAP_SYS_ADMIN); - u16 count; - - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - if (get_user(count, &argp->dest_count)) { - ret = -EFAULT; - goto out; - } - - size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); + ssize_t res; - same = memdup_user(argp, size); - - if (IS_ERR(same)) { - ret = PTR_ERR(same); - same = NULL; - goto out; - } - - off = same->logical_offset; - len = same->length; - - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > BTRFS_MAX_DEDUPE_LEN) - len = BTRFS_MAX_DEDUPE_LEN; + if (olen > BTRFS_MAX_DEDUPE_LEN) + olen = BTRFS_MAX_DEDUPE_LEN; if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { /* @@ -3149,93 +3123,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file, * result, btrfs_cmp_data() won't correctly handle * this situation without an update. */ - ret = -EINVAL; - goto out; - } - - ret = -EISDIR; - if (S_ISDIR(src->i_mode)) - goto out; - - ret = -EACCES; - if (!S_ISREG(src->i_mode)) - goto out; - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = 0; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct inode *dst; - struct fd dst_file = fdget(info->fd); - if (!dst_file.file) { - info->status = -EBADF; - continue; - } - dst = file_inode(dst_file.file); - - if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { - info->status = -EINVAL; - } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { - info->status = -EXDEV; - } else if (S_ISDIR(dst->i_mode)) { - info->status = -EISDIR; - } else if (!S_ISREG(dst->i_mode)) { - info->status = -EACCES; - } else { - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - } - fdput(dst_file); + return -EINVAL; } - ret = copy_to_user(argp, same, size); - if (ret) - ret = -EFAULT; - -out: - mnt_drop_write_file(file); - kfree(same); - return ret; -} - -/* Helper to check and see if this root currently has a ref on the given disk - * bytenr. If it does then we need to update the quota for this root. This - * doesn't do anything if quotas aren't enabled. - */ -static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 disko) -{ - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); - struct ulist *roots; - struct ulist_iterator uiter; - struct ulist_node *root_node = NULL; - int ret; - - if (!root->fs_info->quota_enabled) - return 1; - - btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - ret = btrfs_find_all_roots(trans, root->fs_info, disko, - tree_mod_seq_elem.seq, &roots); - if (ret < 0) - goto out; - ret = 0; - ULIST_ITER_INIT(&uiter); - while ((root_node = ulist_next(roots, &uiter))) { - if (root_node->val == root->objectid) { - ret = 1; - break; - } - } - ulist_free(roots); -out: - btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - return ret; + res = btrfs_extent_same(src, loff, olen, dst, dst_loff); + if (res) + return res; + return olen; } static int clone_finish_inode_update(struct btrfs_trans_handle *trans, @@ -3328,6 +3222,150 @@ static void clone_update_extent_map(struct inode *inode, &BTRFS_I(inode)->runtime_flags); } +/* + * Make sure we do not end up inserting an inline extent into a file that has + * already other (non-inline) extents. If a file has an inline extent it can + * not have any other extents and the (single) inline extent must start at the + * file offset 0. Failing to respect these rules will lead to file corruption, + * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc + * + * We can have extents that have been already written to disk or we can have + * dirty ranges still in delalloc, in which case the extent maps and items are + * created only when we run delalloc, and the delalloc ranges might fall outside + * the range we are currently locking in the inode's io tree. So we check the + * inode's i_size because of that (i_size updates are done while holding the + * i_mutex, which we are holding here). + * We also check to see if the inode has a size not greater than "datal" but has + * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are + * protected against such concurrent fallocate calls by the i_mutex). + * + * If the file has no extents but a size greater than datal, do not allow the + * copy because we would need turn the inline extent into a non-inline one (even + * with NO_HOLES enabled). If we find our destination inode only has one inline + * extent, just overwrite it with the source inline extent if its size is less + * than the source extent's size, or we could copy the source inline extent's + * data into the destination inode's inline extent if the later is greater then + * the former. + */ +static int clone_copy_inline_extent(struct inode *src, + struct inode *dst, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 skip, + const u64 size, + char *inline_data) +{ + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + root->sectorsize); + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) + return -EOPNOTSUPP; + + key.objectid = btrfs_ino(dst); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) { + ASSERT(key.offset > 0); + return -EOPNOTSUPP; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + u64 ext_len; + + /* + * If the file size is <= datal, make sure there are no other + * extents following (can happen do to an fallocate call with + * the flag FALLOC_FL_KEEP_SIZE). + */ + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent, it can not have other extents + * following it. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + if (ext_len > aligned_end) + return -EOPNOTSUPP; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + return ret; + } else if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) + return -EOPNOTSUPP; + } + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * If the destination inode has an inline extent... + * This would require copying the data from the source inline + * extent into the beginning of the destination's inline extent. + * But this is really complex, both extents can be compressed + * or just one of them, which would require decompressing and + * re-compressing data (which could increase the new compressed + * size, not allowing the compressed data to fit anymore in an + * inline extent). + * So just don't support this case for now (it should be rare, + * we are not really saving space when cloning inline extents). + */ + return -EOPNOTSUPP; + } + + btrfs_release_path(path); + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + return ret; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + return ret; + + if (skip) { + const u32 start = btrfs_file_extent_calc_inline_size(0); + + memmove(inline_data + start, inline_data + start + skip, datal); + } + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + + return 0; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3352,9 +3390,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - int no_quota; const u64 len = olen_aligned; - u64 last_disko = 0; u64 last_dest_end = destoff; ret = -ENOMEM; @@ -3368,7 +3404,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; } - path->reada = 2; + path->reada = READA_FORWARD; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -3400,7 +3436,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, nritems = btrfs_header_nritems(path->nodes[0]); process_slot: - no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -3552,35 +3587,13 @@ process_slot: btrfs_set_file_extent_num_bytes(leaf, extent, datal); - /* - * We need to look up the roots that point at - * this bytenr and see if the new root does. If - * it does not we need to make sure we update - * quotas appropriately. - */ - if (disko && root != BTRFS_I(src)->root && - disko != last_disko) { - no_quota = check_ref(trans, root, - disko); - if (no_quota < 0) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - ret = no_quota; - goto out; - } - } - if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - no_quota); + new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, root, @@ -3594,21 +3607,6 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; - u64 aligned_end = 0; - - /* - * Don't copy an inline extent into an offset - * greater than zero. Having an inline extent - * at such an offset results in chaos as btrfs - * isn't prepared for such cases. Just skip - * this case for the same reasons as commented - * at btrfs_ioctl_clone(). - */ - if (last_dest_end > 0) { - ret = -EOPNOTSUPP; - btrfs_end_transaction(trans, root); - goto out; - } if (off > key.offset) { skip = off - key.offset; @@ -3626,42 +3624,22 @@ process_slot: size -= skip + trim; datal -= skip + trim; - aligned_end = ALIGN(new_key.offset + datal, - root->sectorsize); - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - aligned_end, - 1); + ret = clone_copy_inline_extent(src, inode, + trans, path, + &new_key, + drop_start, + datal, + skip, size, buf); if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); + root, + ret); btrfs_end_transaction(trans, root); goto out; } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); } /* If we have an implicit hole (NO_HOLES feature). */ @@ -3727,17 +3705,16 @@ out: return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) { struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); struct btrfs_root *root = BTRFS_I(inode)->root; - struct fd src_file; - struct inode *src; int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; - int same_inode = 0; + int same_inode = src == inode; /* * TODO: @@ -3750,49 +3727,20 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * be either compressed or non-compressed. */ - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) - return -EINVAL; - if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write_file(file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - ret = -EXDEV; - if (src_file.file->f_path.mnt != file->f_path.mnt) - goto out_fput; - - src = file_inode(src_file.file); - - ret = -EINVAL; - if (src == inode) - same_inode = 1; - - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; + if (file_src->f_path.mnt != file->f_path.mnt || + src->i_sb != inode->i_sb) + return -EXDEV; /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; + return -EINVAL; - ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; + return -EISDIR; if (!same_inode) { btrfs_double_inode_lock(src, inode); @@ -3869,21 +3817,25 @@ out_unlock: btrfs_double_inode_unlock(src, inode); else mutex_unlock(&src->i_mutex); -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) { - struct btrfs_ioctl_clone_range_args args; + ssize_t ret; - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, - args.src_length, args.dest_offset); + ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); + if (ret == 0) + ret = len; + return ret; +} + +int btrfs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) +{ + return btrfs_clone_files(dst_file, src_file, off, len, destoff); } /* @@ -4095,7 +4047,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) return -ENOMEM; space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_NOFS); + dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; @@ -4472,7 +4424,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 64 * 1024); + size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4621,7 +4573,7 @@ locked: goto out_bargs; } - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_bargs; @@ -4707,7 +4659,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root, goto out; } - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; @@ -4814,7 +4766,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) @@ -4967,7 +4919,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); if (!qsa) return -ENOMEM; @@ -5097,7 +5049,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, goto out; } - args64 = kmalloc(sizeof(*args64), GFP_NOFS); + args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; @@ -5234,7 +5186,7 @@ out_unlock: static int btrfs_ioctl_get_supported_features(struct file *file, void __user *arg) { - static struct btrfs_ioctl_feature_flags features[3] = { + static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) @@ -5433,10 +5385,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: @@ -5514,8 +5462,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); - case BTRFS_IOC_FILE_EXTENT_SAME: - return btrfs_ioctl_file_extent_same(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(file, argp); case BTRFS_IOC_GET_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d7e6baf1b205..d13128c70ddd 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) atomic_dec(&eb->spinning_readers); read_unlock(&eb->lock); } - return; } /* @@ -79,6 +78,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_writers) && waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); @@ -86,11 +88,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); } - return; } /* @@ -229,6 +233,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -280,6 +287,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52170cf1757e..8c27292ea9ea 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -409,6 +412,9 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, spin_lock_irq(&log->log_extents_lock[index]); while (!list_empty(&log->logged_list[index])) { + struct inode *inode; ordered = list_first_entry(&log->logged_list[index], struct btrfs_ordered_extent, log_list); list_del_init(&ordered->log_list); + inode = ordered->inode; spin_unlock_irq(&log->log_extents_lock[index]); if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - struct inode *inode = ordered->inode; u64 start = ordered->file_offset; u64 end = ordered->file_offset + ordered->len - 1; @@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, &ordered->flags)); /* - * If our ordered extent completed it means it updated the - * fs/subvol and csum trees already, so no need to make the - * current transaction's commit wait for it, as we end up - * holding memory unnecessarily and delaying the inode's iput - * until the transaction commit (we schedule an iput for the - * inode when the ordered extent's refcount drops to 0), which - * prevents it from being evictable until the transaction - * commits. + * In order to keep us from losing our ordered extent + * information when committing the transaction we have to make + * sure that any logged extents are completed when we go to + * commit the transaction. To do this we simply increase the + * current transactions pending_ordered counter and decrement it + * when the ordered extent completes. */ - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) - btrfs_put_ordered_extent(ordered); - else - list_add_tail(&ordered->trans_list, &trans->ordered); - + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&tree->lock); + } + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + bool dec_pending_ordered = false; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); @@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) + dec_pending_ordered = true; spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (dec_pending_ordered) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&root->fs_info->trans_lock); + trans = root->fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7176cc0fe43f..23c96059cef2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -73,6 +73,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent * in the logging code. */ +#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to + * complete in the current transaction. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dca137b04095..f9e60231f685 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = { .extract = prop_compression_extract, .inheritable = 1 }, - { - .xattr_name = NULL - } }; void __init btrfs_props_init(void) { - struct prop_handler *p; + int i; hash_init(prop_handlers_ht); - for (p = &prop_handlers[0]; p->xattr_name; p++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); @@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) { - const struct prop_handler *h; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int i; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; - for (h = &prop_handlers[0]; h->xattr_name; h++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d904ee1c5349..5279fdae7142 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; - spin_lock(&fs_info->qgroup_lock); fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + btrfs_qgroup_wait_for_completion(fs_info); + spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -1461,6 +1462,8 @@ struct btrfs_qgroup_extent_record struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; + assert_spin_locked(&delayed_refs->lock); + while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, @@ -1652,10 +1655,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, } } - /* For exclusive extent, free its reserved bytes too */ - if (nr_old_roots == 0 && nr_new_roots == 1 && - cur_new_count == nr_new_roots) - qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } @@ -2035,7 +2034,7 @@ out: return ret; } -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2116,14 +2115,13 @@ out: return ret; } -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; - struct btrfs_fs_info *fs_info = root->fs_info; struct ulist_node *unode; struct ulist_iterator uiter; - u64 ref_root = root->root_key.objectid; int ret = 0; if (!is_fstree(ref_root)) @@ -2169,6 +2167,11 @@ out: spin_unlock(&fs_info->qgroup_lock); } +static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) @@ -2188,17 +2191,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans) { struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; int slot; int ret; - path->leave_spinning = 1; mutex_lock(&fs_info->qgroup_rescan_lock); ret = btrfs_search_slot_for_read(fs_info->extent_root, &fs_info->qgroup_rescan_progress, @@ -2229,7 +2231,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + extent_buffer_get(scratch_leaf); + btrfs_tree_read_lock(scratch_leaf); + btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2255,6 +2265,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, goto out; } out: + if (scratch_leaf) { + btrfs_tree_read_unlock_blocking(scratch_leaf); + free_extent_buffer(scratch_leaf); + } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); return ret; @@ -2266,19 +2280,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); - if (!scratch_leaf) - goto out; err = 0; - while (!err) { + while (!err && !btrfs_fs_closing(fs_info)) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -2287,8 +2297,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans, - scratch_leaf); + err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2297,11 +2306,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } out: - kfree(scratch_leaf); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { @@ -2330,7 +2339,9 @@ out: } btrfs_end_transaction(trans, fs_info->quota_root); - if (err >= 0) { + if (btrfs_fs_closing(fs_info)) { + btrfs_info(fs_info, "qgroup scan paused"); + } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", err > 0 ? " (inconsistency flag cleared)" : ""); } else { @@ -2378,12 +2389,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_rescan_progress.objectid = progress_objectid; + init_completion(&fs_info->qgroup_rescan_completion); spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - init_completion(&fs_info->qgroup_rescan_completion); - memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, @@ -2486,3 +2496,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } + +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or doing + * nothing if the range is already reserved. + * + * Return 0 for successful reserve + * Return <0 for error (including -EQUOT) + * + * NOTE: this function may sleep for memory allocation. + */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + len == 0) + return 0; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + trace_btrfs_qgroup_reserve_data(inode, start, len, + changeset.bytes_changed, + QGROUP_RESERVE); + if (ret < 0) + goto cleanup; + ret = qgroup_reserve(root, changeset.bytes_changed); + if (ret < 0) + goto cleanup; + + ulist_free(changeset.range_changed); + return ret; + +cleanup: + /* cleanup already reserved ranges */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(changeset.range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, + GFP_NOFS); + ulist_free(changeset.range_changed); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (!changeset.range_changed) + return -ENOMEM; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto out; + + if (free) { + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + trace_op = QGROUP_FREE; + } + trace_btrfs_qgroup_release_data(inode, start, len, + changeset.bytes_changed, trace_op); +out: + ulist_free(changeset.range_changed); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * + * For data written to disk, use btrfs_qgroup_release_data(). + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 0); +} + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +{ + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + ret = qgroup_reserve(root, num_bytes); + if (ret < 0) + return ret; + atomic_add(num_bytes, &root->qgroup_meta_rsv); + return ret; +} + +void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +{ + int reserved; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + if (reserved == 0) + return; + qgroup_free(root, reserved); +} + +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +{ + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); + atomic_sub(num_bytes, &root->qgroup_meta_rsv); + qgroup_free(root, num_bytes); +} + +/* + * Check qgroup reserved space leaking, normally at destory inode + * time + */ +void btrfs_qgroup_check_reserved_leak(struct inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (WARN_ON(!changeset.range_changed)) + return; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(changeset.range_changed, &iter))) { + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", + inode->i_ino, unode->val, unode->aux); + } + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + } + ulist_free(changeset.range_changed); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 6387dcfa354c..ecb2c143ef75 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * For qgroup event trace points only + */ +#define QGROUP_RESERVE (1<<0) +#define QGROUP_RELEASE (1<<1) +#define QGROUP_FREE (1<<2) + int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, @@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes); +/* + * TODO: Add proper trace point for it, as btrfs_qgroup_free() is + * called by everywhere, can't provide good trace for delayed ref case. + */ +static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) +{ + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_free_meta_all(struct btrfs_root *root); +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcf7265ca46f..6d707545f775 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) } spin_unlock_irqrestore(&table->cache_lock, flags); - return; } /* @@ -810,7 +809,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { + /* + * The barrier for this waitqueue_active is not needed, + * we're protected by h->lock and can't miss a wakeup. + */ + } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); wake_up(&h->wait); @@ -902,7 +905,6 @@ static void raid_write_end_io(struct bio *bio) err = -EIO; rbio_orig_end_io(rbio, err); - return; } /* diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 4645cd16d5ba..619f92963e27 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, rec = kzalloc(sizeof(*rec), GFP_NOFS); if (!rec) { reada_extent_put(root->fs_info, re); - return -1; + return -ENOMEM; } rec->rc = rc; @@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int level; + int ret; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - if (reada_add_block(rc, start, &max_key, level, generation)) { + ret = reada_add_block(rc, start, &max_key, level, generation); + if (ret) { kfree(rc); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 303babeef505..ef6d8fc85853 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = 1; - path2->reada = 2; + path1->reada = READA_FORWARD; + path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1900,23 +1900,21 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2132,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -2418,7 +2416,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start, 0); + ret = btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); + btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; @@ -3529,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3919,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { @@ -4345,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728a639f..7cf8509deda7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "BTRFS: mismatching " + btrfs_warn(eb->fs_info, + "mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " "older kernel. Resetting all " - "new fields.\n"); + "new fields."); } need_reset = 1; } @@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; - int old_len; + u32 old_len; path = btrfs_alloc_path(); if (!path) @@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a39f5d1144e8..0c981ebe2acb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -248,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size, int retry_failed_mirror); -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size); + struct scrub_block *sblock, + int retry_failed_mirror); +static void scrub_recheck_block_checksum(struct scrub_block *sblock); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good); static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, @@ -580,9 +575,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, + "length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, @@ -592,9 +587,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, + "resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); @@ -649,10 +644,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - printk_in_rcu(KERN_WARNING - "BTRFS: %s at logical %llu on dev %s, " + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, + "%llu", errstr, swarn.logical, rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", @@ -850,8 +845,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -889,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct btrfs_fs_info *fs_info; u64 length; u64 logical; - u64 generation; unsigned int failed_mirror_index; unsigned int is_metadata; unsigned int have_csum; - u8 *csum; struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ struct scrub_block *sblock_bad; int ret; @@ -918,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; - generation = sblock_to_check->pagev[0]->generation; BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; is_metadata = !(sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->pagev[0]->have_csum; - csum = sblock_to_check->pagev[0]->csum; dev = sblock_to_check->pagev[0]->dev; if (sctx->is_dev_replace && !is_metadata && !have_csum) { @@ -987,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_bad = sblocks_for_recheck + failed_mirror_index; /* build and submit the bios for the failed mirror, check checksums */ - scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size, 1); + scrub_recheck_block(fs_info, sblock_bad, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -1101,9 +1091,7 @@ nodatasum_case: sblock_other = sblocks_for_recheck + mirror_index; /* build and submit the bios, check checksums */ - scrub_recheck_block(fs_info, sblock_other, is_metadata, - have_csum, csum, generation, - sctx->csum_size, 0); + scrub_recheck_block(fs_info, sblock_other, 0); if (!sblock_other->header_error && !sblock_other->checksum_error && @@ -1215,9 +1203,7 @@ nodatasum_case: * is verified, but most likely the data comes out * of the page cache. */ - scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sctx->csum_size, 1); + scrub_recheck_block(fs_info, sblock_bad, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -1230,8 +1216,8 @@ corrected_error: sctx->stat.corrected_errors++; sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: fixed up error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } } else { @@ -1239,8 +1225,8 @@ did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } @@ -1318,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = original_sblock->page_count * PAGE_SIZE; u64 logical = original_sblock->pagev[0]->logical; + u64 generation = original_sblock->pagev[0]->generation; + u64 flags = original_sblock->pagev[0]->flags; + u64 have_csum = original_sblock->pagev[0]->have_csum; struct scrub_recover *recover; struct btrfs_bio *bbio; u64 sublen; @@ -1372,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; + page = kzalloc(sizeof(*page), GFP_NOFS); if (!page) { leave_nomem: @@ -1383,7 +1373,15 @@ leave_nomem: } scrub_page_get(page); sblock->pagev[page_index] = page; + page->sblock = sblock; + page->flags = flags; + page->generation = generation; page->logical = logical; + page->have_csum = have_csum; + if (have_csum) + memcpy(page->csum, + original_sblock->pagev[0]->csum, + sctx->csum_size); scrub_stripe_index_and_offset(logical, bbio->map_type, @@ -1474,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, * the pages that are errored in the just handled mirror can be repaired. */ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size, int retry_failed_mirror) + struct scrub_block *sblock, + int retry_failed_mirror) { int page_num; sblock->no_io_error_seen = 1; - sblock->header_error = 0; - sblock->checksum_error = 0; for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; @@ -1518,11 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(fs_info, sblock, is_metadata, - have_csum, csum, generation, - csum_size); - - return; + scrub_recheck_block_checksum(sblock); } static inline int scrub_check_fsid(u8 fsid[], @@ -1535,61 +1526,16 @@ static inline int scrub_check_fsid(u8 fsid[], return !ret; } -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size) +static void scrub_recheck_block_checksum(struct scrub_block *sblock) { - int page_num; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - void *mapped_buffer; - - WARN_ON(!sblock->pagev[0]->page); - if (is_metadata) { - struct btrfs_header *h; - - mapped_buffer = kmap_atomic(sblock->pagev[0]->page); - h = (struct btrfs_header *)mapped_buffer; - - if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || - !scrub_check_fsid(h->fsid, sblock->pagev[0]) || - memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) { - sblock->header_error = 1; - } else if (generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; - } - csum = h->csum; - } else { - if (!have_csum) - return; - - mapped_buffer = kmap_atomic(sblock->pagev[0]->page); - } - - for (page_num = 0;;) { - if (page_num == 0 && is_metadata) - crc = btrfs_csum_data( - ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, - crc, PAGE_SIZE - BTRFS_CSUM_SIZE); - else - crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); - - kunmap_atomic(mapped_buffer); - page_num++; - if (page_num >= sblock->page_count) - break; - WARN_ON(!sblock->pagev[page_num]->page); - - mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page); - } + sblock->header_error = 0; + sblock->checksum_error = 0; + sblock->generation_error = 0; - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, csum, csum_size)) - sblock->checksum_error = 1; + if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) + scrub_checksum_data(sblock); + else + scrub_checksum_tree_block(sblock); } static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, @@ -1626,9 +1572,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING "BTRFS: " + btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) " - "is unexpected!\n"); + "is unexpected"); return -EIO; } @@ -1833,6 +1779,18 @@ static int scrub_checksum(struct scrub_block *sblock) u64 flags; int ret; + /* + * No need to initialize these stats currently, + * because this function only use return value + * instead of these stats value. + * + * Todo: + * always use stats + */ + sblock->header_error = 0; + sblock->generation_error = 0; + sblock->checksum_error = 0; + WARN_ON(sblock->page_count < 1); flags = sblock->pagev[0]->flags; ret = 0; @@ -1858,7 +1816,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct page *page; void *buffer; u32 crc = ~(u32)0; - int fail = 0; u64 len; int index; @@ -1889,9 +1846,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) btrfs_csum_final(crc, csum); if (memcmp(csum, on_disk_csum, sctx->csum_size)) - fail = 1; + sblock->checksum_error = 1; - return fail; + return sblock->checksum_error; } static int scrub_checksum_tree_block(struct scrub_block *sblock) @@ -1907,8 +1864,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) u64 mapped_size; void *p; u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; u64 len; int index; @@ -1923,19 +1878,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) - ++fail; + sblock->header_error = 1; - if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) - ++fail; + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) - ++fail; + sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) - ++fail; + sblock->header_error = 1; len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; @@ -1960,9 +1916,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) btrfs_csum_final(crc, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) - ++crc_fail; + sblock->checksum_error = 1; - return fail || crc_fail; + return sblock->header_error || sblock->checksum_error; } static int scrub_checksum_super(struct scrub_block *sblock) @@ -2176,40 +2132,28 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; - unsigned int is_metadata; - unsigned int have_csum; - u8 *csum; - u64 generation; u64 logical; struct btrfs_device *dev; - is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock->pagev[0]->have_csum; - csum = sblock->pagev[0]->csum; - generation = sblock->pagev[0]->generation; logical = sblock->pagev[0]->logical; dev = sblock->pagev[0]->dev; - if (sblock->no_io_error_seen) { - scrub_recheck_block_checksum(fs_info, sblock, is_metadata, - have_csum, csum, generation, - sctx->csum_size); - } + if (sblock->no_io_error_seen) + scrub_recheck_block_checksum(sblock); if (!sblock->no_io_error_seen) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: I/O error rebulding logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "IO error rebuilding logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "failed to rebuild valid logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else { scrub_write_block_to_dev_replace(sblock); @@ -2500,8 +2444,7 @@ static void scrub_block_complete(struct scrub_block *sblock) } } -static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, - u8 *csum) +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; unsigned long index; @@ -2565,7 +2508,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, if (flags & BTRFS_EXTENT_FLAG_DATA) { /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, l, csum); + have_csum = scrub_find_csum(sctx, logical, csum); if (have_csum == 0) ++sctx->stat.no_csum; if (sctx->is_dev_replace && !have_csum) { @@ -2703,7 +2646,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, if (flags & BTRFS_EXTENT_FLAG_DATA) { /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, l, csum); + have_csum = scrub_find_csum(sctx, logical, csum); if (have_csum == 0) goto skip; } @@ -3012,6 +2955,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, logic_start + map->stripe_len)) { btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu", key.objectid, logic_start); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); goto next; } again: @@ -3361,6 +3307,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, "scrub: tree block %llu spanning " "stripes, ignored. logical=%llu", key.objectid, logical); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); goto next; } @@ -3481,7 +3430,9 @@ out: static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, - u64 dev_offset, int is_dev_replace) + u64 dev_offset, + struct btrfs_block_group_cache *cache, + int is_dev_replace) { struct btrfs_mapping_tree *map_tree = &sctx->dev_root->fs_info->mapping_tree; @@ -3494,8 +3445,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); read_unlock(&map_tree->map_tree.lock); - if (!em) - return -EINVAL; + if (!em) { + /* + * Might have been an unused block group deleted by the cleaner + * kthread or relocation. + */ + spin_lock(&cache->lock); + if (!cache->removed) + ret = -EINVAL; + spin_unlock(&cache->lock); + + return ret; + } map = (struct map_lookup *)em->bdev; if (em->start != chunk_offset) @@ -3532,6 +3493,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 length; u64 chunk_offset; int ret = 0; + int ro_set; int slot; struct extent_buffer *l; struct btrfs_key key; @@ -3543,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -3617,7 +3579,21 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); scrub_pause_off(fs_info); - if (ret) { + + if (ret == 0) { + ro_set = 1; + } else if (ret == -ENOSPC) { + /* + * btrfs_inc_block_group_ro return -ENOSPC when it + * failed in creating new chunk for metadata. + * It is not a problem for scrub/replace, because + * metadata are always cowed, and our scrub paused + * commit_transactions. + */ + ro_set = 0; + } else { + btrfs_warn(fs_info, "failed setting block group ro, ret=%d\n", + ret); btrfs_put_block_group(cache); break; } @@ -3626,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, - found_key.offset, is_dev_replace); + found_key.offset, cache, is_dev_replace); /* * flush, submit all pending read and write bios, afterwards @@ -3660,7 +3636,30 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dec_block_group_ro(root, cache); + if (ro_set) + btrfs_dec_block_group_ro(root, cache); + + /* + * We might have prevented the cleaner kthread from deleting + * this block group if it was already unused because we raced + * and set it to RO mode first. So add it back to the unused + * list, otherwise it might not ever be deleted unless a manual + * balance is triggered or it becomes used and unused again. + */ + spin_lock(&cache->lock); + if (!cache->removed && !cache->ro && cache->reserved == 0 && + btrfs_block_group_used(&cache->item) == 0) { + spin_unlock(&cache->lock); + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); + } else { + spin_unlock(&cache->lock); + } btrfs_put_block_group(cache); if (ret) @@ -3734,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + btrfs_alloc_workqueue("scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + btrfs_alloc_workqueue("scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("btrfs-scrubparity", flags, + btrfs_alloc_workqueue("scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -4210,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, io_tree = &BTRFS_I(inode)->io_tree; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); if (ordered) { btrfs_put_ordered_extent(ordered); @@ -4375,8 +4374,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - printk_ratelimited(KERN_WARNING - "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + btrfs_warn_rl(dev->dev_root->fs_info, + "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a739b825bdd3..63a6152be04b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { - if (compressed != BTRFS_COMPRESS_NONE) { - /* - * Offsets given by iterate_extent_inodes() are relative - * to the start of the extent, we need to add logical - * offset from the file extent item. - * (See why at backref.c:check_extent_in_eb()) - */ - cur_clone_root->offset += btrfs_file_extent_offset(eb, - fi); - } *found = cur_clone_root; ret = 0; } else { @@ -1479,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret); + if (ret) { + /* + * An empty symlink inode. Can happen in rare error paths when + * creating a symlink (transaction committed before the inode + * eviction handler removed the symlink inode items and a crash + * happened in between or the subvol was snapshoted in between). + * Print an informative message to dmesg/syslog so that the user + * can delete the symlink. + */ + btrfs_err(root->fs_info, + "Found empty symlink inode %llu at root %llu", + ino, root->root_key.objectid); + ret = -EIO; + goto out; + } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); @@ -2353,8 +2357,14 @@ static int send_subvol_begin(struct send_ctx *sctx) } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { @@ -2564,7 +2574,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; @@ -4687,6 +4697,171 @@ tlv_put_failure: return ret; } +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4695,9 +4870,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; @@ -4725,22 +4898,15 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (clone_root && IS_ALIGNED(offset + len, bs)) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + ret = send_extent_data(sctx, offset, len); } out: return ret; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425aef05b..02e00166c4da 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,8 +22,8 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE (1024 * 64) -#define BTRFS_SEND_READ_SIZE (1024 * 48) +#define BTRFS_SEND_BUF_SIZE SZ_64K +#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11d1eab9234d..9b9eab6d048e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } -#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. @@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK const char *errstr; +#endif /* * Special case: if the error is EROFS, and we're already @@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; +#ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; @@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } +#endif /* Don't go through full error handling during mount */ save_error_info(fs_info); @@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +#ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", @@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_end(args); } - -#else - -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } -} #endif /* @@ -312,18 +295,22 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, + Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, + Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_skip_balance, Opt_check_integrity, + Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_datasum, Opt_treelog, Opt_noinode_cache, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_subvolid, "subvolid=%s"}, @@ -354,6 +341,7 @@ static match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, @@ -372,6 +360,11 @@ static match_table_t tokens = { {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif {Opt_err, NULL}, }; @@ -392,7 +385,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) bool compress_force = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); + else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); if (!options) @@ -626,15 +621,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "turning off discard"); break; case Opt_space_cache: - btrfs_set_and_info(root, SPACE_CACHE, - "enabling disk space caching"); + case Opt_space_cache_version: + if (token == Opt_space_cache || + strcmp(args[0].from, "v1") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + FREE_SPACE_TREE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + } else if (strcmp(args[0].from, "v2") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + SPACE_CACHE); + btrfs_set_and_info(root, FREE_SPACE_TREE, + "enabling free space tree"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - btrfs_clear_and_info(root, SPACE_CACHE, - "disabling disk space caching"); + if (btrfs_test_opt(root, SPACE_CACHE)) { + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + } + if (btrfs_test_opt(root, FREE_SPACE_TREE)) { + btrfs_clear_and_info(root, FREE_SPACE_TREE, + "disabling free space tree"); + } break; case Opt_inode_cache: btrfs_set_pending_and_info(info, INODE_MAP_CACHE, @@ -738,6 +753,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(root->fs_info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif case Opt_err: btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -747,8 +778,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(root, FREE_SPACE_TREE) && + !btrfs_test_opt(root, CLEAR_CACHE)) { + btrfs_err(root->fs_info, "cannot disable free space tree"); + ret = -EINVAL; + + } if (!ret && btrfs_test_opt(root, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); + if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; } @@ -1155,6 +1195,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, RESCAN_UUID_TREE)) @@ -1189,6 +1231,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(root, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(root, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -1501,9 +1549,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) error = -EBUSY; } else { - char b[BDEVNAME_SIZE]; - - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); @@ -1852,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ - skip_space = 1024 * 1024; + skip_space = SZ_1M; /* user can set the offset in fs_info->alloc_start. */ if (fs_info->alloc_start && @@ -1943,6 +1989,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * there are other factors that may change the result (like a new metadata * chunk). * + * If metadata is exhausted, f_bavail will be 0. + * * FIXME: not accurate for mixed block groups, total and free/used are ok, * available appears slightly larger. */ @@ -1954,11 +2002,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; + u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; + u64 thresh = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -1984,6 +2034,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + total_free_meta += found->disk_total - found->disk_used; total_used += found->disk_used; } @@ -2006,6 +2058,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. + */ + thresh = 4 * 1024 * 1024; + + if (total_free_meta - thresh < block_rsv->size) + buf->f_bavail = 0; + buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = BTRFS_NAME_LEN; @@ -2212,6 +2282,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_qgroups(); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 603b0cc2b9bb..e0ac85949067 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = { NULL, }; -static void btrfs_release_super_kobj(struct kobject *kobj) +static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); - memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, + .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_devices, super_kobj); + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) @@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } @@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) fs_devs->device_dir_kobj = NULL; } - if (fs_devs->super_kobj.state_initialized) { - kobject_del(&fs_devs->super_kobj); - kobject_put(&fs_devs->super_kobj); + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } @@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { btrfs_reset_fs_info_ptr(fs_info); @@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) kobject_put(fs_info->space_info_kobj); } addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); - btrfs_kobj_rm_device(fs_info->fs_devices, NULL); + sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -637,7 +637,7 @@ static void init_feature_attrs(void) /* when one_device is NULL, it removes all device links */ -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->device_dir_kobj) fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->super_kobj); + &fs_devs->fsid_kobj); if (!fs_devs->device_dir_kobj) return -ENOMEM; @@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, int error; init_completion(&fs_devs->kobj_unregister); - fs_devs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->super_kobj, + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); return error; } -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; - struct kobject *super_kobj = &fs_devs->super_kobj; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; btrfs_set_fs_info_ptr(fs_info); - error = btrfs_kobj_add_device(fs_devs, NULL); + error = btrfs_sysfs_add_device_link(fs_devs, NULL); if (error) return error; - error = sysfs_create_files(super_kobj, btrfs_attrs); + error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_kobj_rm_device(fs_devs, NULL); + btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } - error = sysfs_create_group(super_kobj, + error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; @@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - super_kobj); + fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; @@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) return 0; failure: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6392527bcc15..9c09522125a6 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9626252ee6b4..b1d920b30070 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../free-space-cache.h" +#include "../free-space-tree.h" +#include "../transaction.h" #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" @@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); + fs_info->pinned_extents = &fs_info->freed_extents[0]; return fs_info; } @@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = length; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + cache->full_stripe_len = 4096; + + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + btrfs_init_free_space_ctl(cache); + mutex_init(&cache->free_space_lock); + + return cache; +} + +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +{ + if (!cache) + return; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); +} + +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index fd3954224480..054b8c73c951 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,17 +24,23 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; +struct btrfs_trans_handle; int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); int btrfs_test_qgroups(void); +int btrfs_test_free_space_tree(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length); +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else static inline int btrfs_test_free_space_cache(void) { @@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void) { return 0; } +static inline int btrfs_test_free_space_tree(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e9f2368177d..e29fa297e053 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sizes.h> #include "btrfs-tests.h" #include "../extent_io.h" @@ -70,12 +72,14 @@ static int test_find_delalloc(void) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = 256 * 1024 * 1024; - u64 max_bytes = 128 * 1024 * 1024; + u64 total_dirty = SZ_256M; + u64 max_bytes = SZ_128M; u64 start, end, test_start; u64 found; int ret = -EINVAL; + test_msg("Running find delalloc tests\n"); + inode = btrfs_new_test_inode(); if (!inode) { test_msg("Failed to allocate test inode\n"); @@ -133,7 +137,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = 64 * 1024 * 1024; + test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_CACHE_SHIFT); if (!locked_page) { @@ -220,8 +224,8 @@ static int test_find_delalloc(void) * Now to test where we run into a page that is no longer dirty in the * range we want to find. */ - page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) - >> PAGE_CACHE_SHIFT); + page = find_get_page(inode->i_mapping, + (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; @@ -268,8 +272,139 @@ out: return ret; } +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) +{ + unsigned long i, x; + + memset(bitmap, 0, len); + memset_extent_buffer(eb, 0, 0, len); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Bitmap was not zeroed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting all bits failed\n"); + return -EINVAL; + } + + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing all bits failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } + + /* + * Generate a wonky pseudo-random bit pattern for the sake of not using + * something repetitive that could miss some hypothetical off-by-n bug. + */ + x = 0; + for (i = 0; i < len / sizeof(long); i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; + bitmap[i] = x; + } + write_extent_buffer(eb, bitmap, 0, len); + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Testing bit pattern failed\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Testing bit pattern with offset failed\n"); + return -EINVAL; + } + } + + return 0; +} + +static int test_eb_bitmaps(void) +{ + unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long *bitmap; + struct extent_buffer *eb; + int ret; + + test_msg("Running extent buffer bitmap tests\n"); + + bitmap = kmalloc(len, GFP_NOFS); + if (!bitmap) { + test_msg("Couldn't allocate test bitmap\n"); + return -ENOMEM; + } + + eb = __alloc_dummy_extent_buffer(NULL, 0, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); + if (ret) + goto out; + + /* Do it over again with an extent buffer which isn't page-aligned. */ + free_extent_buffer(eb); + eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); +out: + free_extent_buffer(eb); + kfree(bitmap); + return ret; +} + int btrfs_test_extent_io(void) { - test_msg("Running find delalloc tests\n"); - return test_find_delalloc(); + int ret; + + test_msg("Running extent I/O tests\n"); + + ret = test_find_delalloc(); + if (ret) + goto out; + + ret = test_eb_bitmaps(); +out: + test_msg("Extent I/O tests finished\n"); + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 2299bfde39ee..c9ad97b1e690 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -19,38 +19,10 @@ #include <linux/slab.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../disk-io.h" #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -static struct btrfs_block_group_cache *init_test_block_group(void) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; - - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - - btrfs_init_free_space_ctl(cache); - - return cache; -} /* * This test just does basic sanity checking, making sure we can add an exten @@ -64,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache) test_msg("Running extent only tests\n"); /* First just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding initial extents %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing extent %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Full remove left some lingering space\n"); return -1; } /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding half extent %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { test_msg("Error removing tail end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Error removing front end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { test_msg("Error removing middle piece %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Still have space at the front\n"); return -1; } - if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + if (test_check_exists(cache, SZ_2M, 4096)) { test_msg("Still have space in the middle\n"); return -1; } - if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { test_msg("Still have space at the end\n"); return -1; } @@ -134,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) test_msg("Running bitmap only tests\n"); - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't create a bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing bitmap full range %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Left some space in bitmap\n"); return -1; } - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't add to our bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove middle chunk %d\n", ret); return ret; @@ -170,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); /* Test a bit straddling two bitmaps */ - ret = test_add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, + SZ_4M, 1); if (ret) { test_msg("Couldn't add space that straddles two bitmaps %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { + if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { test_msg("Left some space when removing overlapping\n"); return -1; } @@ -209,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * bitmap, but the free space completely in the extent and then * completely in the bitmap. */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { test_msg("Couldn't create bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Couldn't remove extent entry %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Left remnants after our remove\n"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't re-add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { test_msg("Couldn't remove from bitmap %d\n", ret); return ret; } - if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_4M, SZ_1M)) { test_msg("Left remnants in the bitmap\n"); return -1; } @@ -254,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * Ok so a little more evil, extent entry and bitmap at the same offset, * removing an overlapping chunk. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { test_msg("Couldn't add to a bitmap %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { test_msg("Left over pieces after removing overlapping\n"); return -1; } @@ -274,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) __btrfs_remove_free_space_cache(cache->free_space_ctl); /* Now with the extent entry offset into the bitmap */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add space to the bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { test_msg("Couldn't add extent to the cache %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { test_msg("Problem removing overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { test_msg("Left something behind when removing space"); return -1; } @@ -308,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * [ del ] */ __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, + 5 * SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { test_msg("Failed to free our space %d\n", ret); return ret; } - if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { + if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { test_msg("Left stuff over\n"); return -1; } @@ -343,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * to return -EAGAIN back from btrfs_remove_extent, make sure this * doesn't happen. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { test_msg("Error removing bitmap and extent overlapping %d\n", ret); return ret; @@ -438,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - - bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, - struct btrfs_free_space *); + const struct btrfs_free_space_op test_free_space_ops = { + .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, + .use_bitmap = test_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); @@ -462,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that forces use of bitmaps as soon as we have at least 1 * extent entry. */ - use_bitmap_op = cache->free_space_ctl->op->use_bitmap; - cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; /* * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, + SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -495,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 512Kb, 128Mb + 768Kb[ */ ret = btrfs_remove_free_space(cache, - 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024); + SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -518,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -528,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -538,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered * by the bitmap too, isn't marked as free either. */ - if (test_check_exists(cache, 128 * 1024 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -549,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -574,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, - 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -594,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb - 256Kb, 128Mb - 128Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -630,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { test_msg("Cache free space is not 1Mb + 4Kb\n"); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + if (offset != (SZ_128M - SZ_256K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -663,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + if (offset != (SZ_128M + SZ_16M)) { test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -684,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ - ret = test_add_free_space_entry(cache, 0, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -710,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 128b, 128Mb + 256Kb[ * [128Mb - 768Kb, 128Mb - 512Kb[ */ - ret = btrfs_remove_free_space(cache, - 0, - 128 * 1024 * 1024 - 768 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -734,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 0, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -744,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb - 512Kb, 128Mb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -755,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -782,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 8192); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -793,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb + 128Kb, 128Mb + 256Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -827,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { test_msg("Cache free space is not 1Mb + 8Kb\n"); return -EINVAL; } - offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + if (offset != (SZ_128M - 768 * SZ_1K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -860,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 8192, 0, &max_extent_size); - if (offset != (32 * 1024 * 1024)) { + if (offset != SZ_32M) { test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -870,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) if (ret) return ret; - cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + cache->free_space_ctl->op = orig_free_space_ops; __btrfs_remove_free_space_cache(cache->free_space_ctl); return 0; @@ -879,16 +827,30 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); - cache = init_test_block_group(); + cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) + goto out; + + root->fs_info->extent_root = root; + cache->fs_info = root->fs_info; + ret = test_extents(cache); if (ret) goto out; @@ -901,9 +863,8 @@ int btrfs_test_free_space_cache(void) ret = test_steal_space_from_bitmap_to_extent(cache); out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c new file mode 100644 index 000000000000..d05fe1ab4808 --- /dev/null +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -0,0 +1,571 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-tree.h" +#include "../transaction.h" + +struct free_space_extent { + u64 start, length; +}; + +/* + * The test cases align their operations to this in order to hit some of the + * edge cases in the bitmap code. + */ +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) + +static int __check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + struct btrfs_key key; + int prev_bit = 0, bit; + u64 extent_start = 0, offset, end; + u32 flags, extent_count; + unsigned int i; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + if (extent_count != num_extents) { + test_msg("Extent count is wrong\n"); + ret = -EINVAL; + goto out; + } + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (path->slots[0] != 0) + goto invalid; + end = cache->key.objectid + cache->key.offset; + i = 0; + while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY) + goto invalid; + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(cache, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + if (i >= num_extents) + goto invalid; + if (i >= num_extents || + extent_start != extents[i].start || + offset - extent_start != extents[i].length) + goto invalid; + i++; + } + prev_bit = bit; + offset += cache->sectorsize; + } + } + if (prev_bit == 1) { + if (i >= num_extents || + extent_start != extents[i].start || + end - extent_start != extents[i].length) + goto invalid; + i++; + } + if (i != num_extents) + goto invalid; + } else { + if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 || + path->slots[0] != 0) + goto invalid; + for (i = 0; i < num_extents; i++) { + path->slots[0]++; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY || + key.objectid != extents[i].start || + key.offset != extents[i].length) + goto invalid; + } + } + + ret = 0; +out: + btrfs_release_path(path); + return ret; +invalid: + test_msg("Free space tree is invalid\n"); + ret = -EINVAL; + goto out; +} + +static int check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + btrfs_release_path(path); + return PTR_ERR(info); + } + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + ret = __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); + if (ret) + return ret; + + /* Flip it to the other format and check that for good measure. */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + ret = convert_free_space_to_extents(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to extents\n"); + return ret; + } + } else { + ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to bitmaps\n"); + return ret; + } + } + return __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); +} + +static int test_empty_block_group(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset}, + }; + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_all(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = {}; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_beginning(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, + cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); + +} + +static int test_remove_end(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + + cache->key.offset - BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_middle(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, + cache->key.offset - 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_left(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_right(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_both(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 3 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_none(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, + {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 4 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +typedef int (*test_func_t)(struct btrfs_trans_handle *, + struct btrfs_fs_info *, + struct btrfs_block_group_cache *, + struct btrfs_path *); + +static int run_test(test_func_t test_func, int bitmaps) +{ + struct btrfs_root *root = NULL; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_trans_handle trans; + struct btrfs_path *path = NULL; + int ret; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); + root->fs_info->free_space_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + if (!cache) { + test_msg("Couldn't allocate dummy block group cache\n"); + ret = -ENOMEM; + goto out; + } + cache->bitmap_low_thresh = 0; + cache->bitmap_high_thresh = (u32)-1; + cache->needs_free_space = 1; + + btrfs_init_dummy_trans(&trans); + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + ret = add_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not add block group free space\n"); + goto out; + } + + if (bitmaps) { + ret = convert_free_space_to_bitmaps(&trans, root->fs_info, + cache, path); + if (ret) { + test_msg("Could not convert block group to bitmaps\n"); + goto out; + } + } + + ret = test_func(&trans, root->fs_info, cache, path); + if (ret) + goto out; + + ret = remove_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not remove block group free space\n"); + goto out; + } + + if (btrfs_header_nritems(root->node) != 0) { + test_msg("Free space tree has leftover items\n"); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + btrfs_free_path(path); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + return ret; +} + +static int run_test_both_formats(test_func_t test_func) +{ + int ret; + + ret = run_test(test_func, 0); + if (ret) + return ret; + return run_test(test_func, 1); +} + +int btrfs_test_free_space_tree(void) +{ + test_func_t tests[] = { + test_empty_block_group, + test_remove_all, + test_remove_beginning, + test_remove_end, + test_remove_middle, + test_merge_left, + test_merge_right, + test_merge_both, + test_merge_none, + }; + int i; + + test_msg("Running free space tree tests\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + int ret = run_test_both_formats(tests[i]); + if (ret) { + test_msg("%pf failed\n", tests[i]); + return ret; + } + } + + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 054fc0d97131..5de55fdd28bc 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root) static void setup_file_extents(struct btrfs_root *root) { int slot = 0; - u64 disk_bytenr = 1 * 1024 * 1024; + u64 disk_bytenr = SZ_1M; u64 offset = 0; /* First we want a hole */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 846d277b1901..8ea5d34bc5a2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -23,14 +23,6 @@ #include "../qgroup.h" #include "../backref.h" -static void init_dummy_trans(struct btrfs_trans_handle *trans) -{ - memset(trans, 0, sizeof(*trans)); - trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); - trans->type = __TRANS_DUMMY; -} - static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) { @@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); ret = btrfs_create_qgroup(NULL, fs_info, 5); @@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup multiple refs test\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a5b06442f0bf..b6031ce474f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) list_del_init(&em->list); free_extent_map(em); } + /* + * If any block groups are found in ->deleted_bgs then it's + * because the transaction was aborted and a commit did not + * happen (things failed before writing the new superblock + * and calling btrfs_finish_extent_commit()), so we can not + * discard the physical locations of the block groups. + */ + while (!list_empty(&transaction->deleted_bgs)) { + struct btrfs_block_group_cache *cache; + + cache = list_first_entry(&transaction->deleted_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&cache->bg_list); + btrfs_put_block_group_trimming(cache); + btrfs_put_block_group(cache); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -82,6 +99,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) static void clear_btree_io_tree(struct extent_io_tree *tree) { spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once clear_btree_io_tree is + * called. + */ + smp_mb(); while (!RB_EMPTY_ROOT(&tree->state)) { struct rb_node *node; struct extent_state *state; @@ -226,25 +249,22 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); + init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->have_free_bgs = 0; + atomic_set(&cur_trans->pending_ordered, 0); + cur_trans->flags = 0; cur_trans->start_time = get_seconds(); - cur_trans->dirty_bg_run = 0; + + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.pending_csums = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -264,7 +284,6 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); - INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); @@ -272,7 +291,6 @@ loop: cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); - spin_lock_init(&cur_trans->deleted_bgs_lock); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -447,8 +465,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, - enum btrfs_reserve_flush_enum flush) +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -478,13 +496,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { - if (root->fs_info->quota_enabled && - is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->nodesize; - ret = btrfs_qgroup_reserve(root, qgroup_reserved); - if (ret) - return ERR_PTR(ret); - } + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); num_bytes = btrfs_calc_trans_metadata_size(root, num_items); /* @@ -502,7 +517,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, goto reserve_fail; } again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; @@ -543,26 +558,13 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->chunk_bytes_reserved = 0; h->root = root; - h->delayed_ref_updates = 0; h->use_count = 1; - h->adding_csums = 0; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - h->qgroup_reserved = 0; - h->delayed_ref_elem.seq = 0; + h->type = type; - h->allocating_chunk = false; h->can_flush_pending_bgs = true; - h->reloc_reserved = false; - h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); - INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -579,7 +581,6 @@ again: h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } - h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -597,20 +598,52 @@ alloc_fail: btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, num_bytes); reserve_fail: - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); + btrfs_qgroup_free_meta(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL); } +struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( + struct btrfs_root *root, + unsigned int num_items, + int min_factor) +{ + struct btrfs_trans_handle *trans; + u64 num_bytes; + int ret; + + trans = btrfs_start_transaction(root, num_items); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, + min_factor); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); + } + + trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->bytes_reserved = num_bytes; + + return trans; +} struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items) + struct btrfs_root *root, + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_LIMIT); @@ -618,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN, 0); + return start_transaction(root, 0, TRANS_JOIN, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE, 0); + return start_transaction(root, 0, TRANS_USERSPACE, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -646,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_ATTACH, 0); + return start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -661,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - trans = start_transaction(root, 0, TRANS_ATTACH, 0); + trans = start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) btrfs_wait_for_commit(root, 0); @@ -794,12 +832,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - if (!list_empty(&trans->ordered)) { - spin_lock(&info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); - spin_unlock(&info->trans_lock); - } - trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -815,15 +847,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - if (trans->qgroup_reserved) { - /* - * the same root has to be passed here between start_transaction - * and end_transaction. Subvolume quota depends on this. - */ - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -856,6 +879,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); @@ -1238,6 +1264,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; + btrfs_qgroup_free_meta_all(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1314,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - path = btrfs_alloc_path(); - if (!path) { - pending->error = -ENOMEM; - return 0; - } + ASSERT(pending->path); + path = pending->path; - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - pending->error = -ENOMEM; - goto root_item_alloc_fail; - } + ASSERT(pending->root_item); + new_root_item = pending->root_item; pending->error = btrfs_find_free_objectid(tree_root, &objectid); if (pending->error) @@ -1557,8 +1578,10 @@ clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); -root_item_alloc_fail: + pending->root_item = NULL; btrfs_free_path(path); + pending->path = NULL; + return ret; } @@ -1795,25 +1818,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) } static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) { - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &ordered->flags)); - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1842,10 +1850,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } cur_trans = trans->transaction; @@ -1865,7 +1869,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - if (!cur_trans->dirty_bg_run) { + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set @@ -1874,18 +1878,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it - * finds dirty_bg_run = 1 + * finds BTRFS_TRANS_DIRTY_BG_RUN set. * - * The dirty_bg_run flag is also used to make sure only - * one process starts all the block group IO. It wouldn't + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (!cur_trans->dirty_bg_run) { + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) run_it = 1; - cur_trans->dirty_bg_run = 1; - } mutex_unlock(&root->fs_info->ro_block_group_mutex); if (run_it) @@ -1897,7 +1900,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1956,7 +1958,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); - btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_wait_pending_ordered(cur_trans); btrfs_scrub_pause(root); /* @@ -2136,7 +2138,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; @@ -2156,7 +2158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - if (cur_trans->have_free_bgs) + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(root->fs_info); root->fs_info->last_trans_committed = cur_trans->transid; @@ -2198,10 +2200,6 @@ cleanup_transaction: btrfs_trans_release_metadata(trans, root); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index a994bb097ee5..72be51f7ca2f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -32,6 +32,10 @@ enum btrfs_trans_state { TRANS_STATE_MAX = 6, }; +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 + struct btrfs_transaction { u64 transid; /* @@ -46,11 +50,9 @@ struct btrfs_transaction { */ atomic_t num_writers; atomic_t use_count; + atomic_t pending_ordered; - /* - * true if there is free bgs operations in this transaction - */ - int have_free_bgs; + unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; @@ -59,9 +61,9 @@ struct btrfs_transaction { unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; + wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; - struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; @@ -75,12 +77,11 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; - spinlock_t deleted_bgs_lock; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; - int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -107,7 +108,6 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; @@ -129,7 +129,6 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; - struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -138,8 +137,10 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; + struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; @@ -185,9 +186,14 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); + unsigned int num_items); +struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( + struct btrfs_root *root, + unsigned int num_items, + int min_factor); struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items); + struct btrfs_root *root, + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f31db4325339..cb65089127cc 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = 0; goto out; } - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, WARN_ON(ret == -EAGAIN); goto out; } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + min_trans); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bbaace73383..323e12cc9d2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { @@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } @@ -2950,6 +2954,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2961,6 +2968,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5328,7 +5338,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5346,7 +5356,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5361,7 +5371,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc735869c18..c32abbca9d77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,82 @@ #include "dev-replace.h" #include "sysfs.h" +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, + [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, + [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, + [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, + [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -49,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -156,8 +233,8 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); return dev; } @@ -198,7 +275,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -211,8 +287,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } invalidate_bdev(*bdev); *bh = btrfs_read_dev_super(*bdev); - if (!*bh) { - ret = -EINVAL; + if (IS_ERR(*bh)) { + ret = PTR_ERR(*bh); blkdev_put(*bdev, flags); goto error; } @@ -345,6 +421,9 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -765,36 +844,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); @@ -1053,7 +1103,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = device->devid; key.offset = start; @@ -1208,6 +1258,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; + u64 min_search_start; + + /* + * We don't want to overwrite the superblock on the drive nor any area + * used by the boot loader (grub for example), so we make sure to start + * at an offset of at least 1MB. + */ + min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + search_start = max(search_start, min_search_start); path = btrfs_alloc_path(); if (!path) @@ -1222,7 +1281,7 @@ again: goto out; } - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -1348,18 +1407,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { - struct btrfs_root *root = device->dev_root; - u64 search_start; - /* FIXME use last free of some kind */ - - /* - * we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); return find_free_dev_extent_start(trans->transaction, device, - num_bytes, search_start, start, len); + num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1402,7 +1452,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); + btrfs_std_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1410,10 +1460,10 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { - trans->transaction->have_free_bgs = 1; + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } out: btrfs_free_path(path); @@ -1593,7 +1643,6 @@ static void update_dev_time(char *path_name) return; file_update_time(filp); filp_close(filp, NULL); - return; } static int btrfs_rm_dev_item(struct btrfs_root *root, @@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1924,7 +1973,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->writeable) { fs_devices->rw_devices--; /* zero out the old super if it is writable */ - btrfs_scratch_superblock(srcdev); + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); } if (srcdev->bdev) @@ -1971,10 +2020,10 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); - btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { - btrfs_scratch_superblock(tgtdev); + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; @@ -2041,10 +2090,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } - if (!*device) { - btrfs_err(root->fs_info, "no missing device found"); - return -ENOENT; - } + if (!*device) + return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; return 0; } else { @@ -2309,7 +2356,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2350,9 +2397,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj, fsid_buf)) - pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); + btrfs_warn(root->fs_info, + "sysfs: failed to create fsid for sprout"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2368,7 +2416,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2388,7 +2436,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2613,7 +2661,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, + btrfs_std_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2621,7 +2669,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2803,10 +2851,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) if (ret) return ret; - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_trans_remove_block_group(root->fs_info, + chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } @@ -3009,16 +3058,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl) * (albeit full) chunks. */ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; @@ -3070,17 +3122,50 @@ static int chunk_profiles_filter(u64 chunk_type, return 1; } -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group_cache *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = div_factor_fine(cache->key.offset, + bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->key.offset; + else + user_thresh_max = div_factor_fine(cache->key.offset, + bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - if (bargs->usage == 0) + if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; @@ -3170,6 +3255,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { @@ -3216,6 +3314,9 @@ static int should_balance_chunk(struct btrfs_root *root, if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; } /* devid filter */ @@ -3236,6 +3337,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { @@ -3250,6 +3357,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; else bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global informatoin + * about the count of all chunks that satisfy the filters. + */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; } return 1; @@ -3264,6 +3381,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; @@ -3274,16 +3392,21 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + /* The single value limit and min/max limits use the same bytes in the */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; + int chunk_reserved = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); - size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + size_to_free = min_t(u64, size_to_free, SZ_1M); if (!device->writeable || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || @@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); again: if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the + */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; @@ -3364,6 +3491,7 @@ again: } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); @@ -3373,6 +3501,7 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); + btrfs_release_path(path); if (!ret) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3384,9 +3513,49 @@ again: spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply limit_min filter, no need to check if the LIMITS + * filter is used, limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta < bctl->meta.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && + count_sys < bctl->sys.limit_min)) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; } + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + trans = btrfs_start_transaction(chunk_root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + ret = PTR_ERR(trans); + goto error; + } + + ret = btrfs_force_chunk_alloc(trans, chunk_root, + BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans, chunk_root); + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + goto error; + } + chunk_reserved = 1; + } + ret = btrfs_relocate_chunk(chunk_root, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3461,11 +3630,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret); + btrfs_std_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } +/* Non-zero return value signifies invalidity */ +static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, + u64 allowed) +{ + return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl_arg->target, 1) || + (bctl_arg->target & ~allowed))); +} + /* * Should be called with both balance and volume mutexes held */ @@ -3523,27 +3701,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (num_devices > 3) allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "unable to start balance with target " "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - (bctl->meta.target & ~allowed))) { + if (validate_convert_profile(&bctl->meta, allowed)) { btrfs_err(fs_info, "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { + if (validate_convert_profile(&bctl->sys, allowed)) { btrfs_err(fs_info, "unable to start balance with target system profile %llu", bctl->sys.target); @@ -3551,14 +3723,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - btrfs_err(fs_info, "dup for data is not allowed"); - ret = -EINVAL; - goto out; - } - /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | @@ -3584,6 +3748,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); + if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < + btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + btrfs_warn(fs_info, + "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + bctl->meta.target, bctl->data.target); + } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = min( btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), @@ -4096,7 +4267,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; lock_chunks(root); @@ -4285,69 +4456,10 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = { - .sub_stripes = 2, - .dev_stripes = 1, - .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_RAID1] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 2, - .devs_min = 2, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_DUP] = { - .sub_stripes = 1, - .dev_stripes = 2, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID0] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_SINGLE] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_RAID5] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID6] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 3, - .devs_increment = 1, - .ncopies = 3, - }, -}; - static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ - return 64 * 1024; + return SZ_64K; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) @@ -4415,21 +4527,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; + max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; + if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) + max_stripe_size = SZ_1G; else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; @@ -4680,7 +4792,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 dev_offset; u64 stripe_size; int i = 0; - int ret; + int ret = 0; em_tree = &extent_root->fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); @@ -4711,20 +4823,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; ret = btrfs_update_device(trans, device); if (ret) - goto out; + break; ret = btrfs_alloc_dev_extent(trans, device, chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, chunk_offset, dev_offset, stripe_size); if (ret) - goto out; + break; + } + if (ret) { + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); + goto out; } stripe = &chunk->stripe; @@ -4737,6 +4861,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); btrfs_set_stack_chunk_length(chunk, chunk_size); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); @@ -6352,11 +6477,11 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; - btrfs_set_buffer_uptodate(sb); + set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artifical and just used to read the system array. - * btrfs_set_buffer_uptodate() call does not properly mark all it's + * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all * the page's extents up-to-date (the hole beyond sb), @@ -6399,6 +6524,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) goto out_short_read; num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + printk(KERN_ERR + "BTRFS: invalid number of stripes %u in sys_array at offset %u\n", + num_stripes, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6407,6 +6540,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (ret) break; } else { + printk(KERN_ERR + "BTRFS: unexpected item type %u in sys_array at offset %u\n", + (u32)key.type, cur_offset); ret = -EIO; break; } @@ -6594,8 +6730,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "error %d while searching for dev_stats item for device %s!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "error %d while searching for dev_stats item for device %s", ret, rcu_str_deref(device->name)); goto out; } @@ -6605,8 +6741,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "delete too small dev_stats item for device %s failed %d!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "delete too small dev_stats item for device %s failed %d", rcu_str_deref(device->name), ret); goto out; } @@ -6619,9 +6755,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "insert dev_stats item for device %s failed %d!\n", - rcu_str_deref(device->name), ret); + btrfs_warn_in_rcu(dev_root->fs_info, + "insert dev_stats item for device %s failed %d", + rcu_str_deref(device->name), ret); goto out; } } @@ -6675,8 +6811,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_err_rl_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6695,8 +6831,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_info_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6740,22 +6876,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root, return 0; } -int btrfs_scratch_superblock(struct btrfs_device *device) +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; + int copy_num; - bh = btrfs_read_dev_super(device->bdev); - if (!bh) - return -EINVAL; - disk_super = (struct btrfs_super_block *)bh->b_data; + if (!bdev) + return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; + copy_num++) { - return 0; + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); } /* @@ -6823,3 +6971,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 595279a8b99f..1939ebde63df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,7 +26,7 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN (64 * 1024) +#define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; struct btrfs_pending_bios { @@ -256,7 +256,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ - struct kobject super_kobj; + struct kobject fsid_kobj; struct kobject *device_dir_kobj; struct completion kobj_unregister; }; @@ -334,10 +334,15 @@ struct btrfs_raid_attr { int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ int devs_min; /* min devs needed */ + int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ }; +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; + struct map_lookup { u64 type; int io_align; @@ -375,6 +380,9 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ @@ -382,7 +390,10 @@ struct map_lookup { BTRFS_BALANCE_ARGS_DEVID | \ BTRFS_BALANCE_ARGS_DRANGE | \ BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT) + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) /* * Profile changing flags. When SOFT is set we won't relocate chunk if @@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -int btrfs_scratch_superblock(struct btrfs_device *device); +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 6f518c90e1c1..fd953c361a43 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; /* search for our xattrs */ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; - if (found_key.type != BTRFS_XATTR_ITEM_KEY) + if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; + if (found_key.type < BTRFS_XATTR_ITEM_KEY) + goto next; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(root, leaf, di)) @@ -349,137 +351,89 @@ err: return ret; } -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. - */ -const struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL, -}; - -/* - * Check if the attribute is in a supported namespace. - * - * This is applied after the check for the synthetic attributes in the system - * namespace. - */ -static int btrfs_is_valid_xattr(const char *name) +static int btrfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - int len = strlen(name); - int prefixlen = 0; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - prefixlen = XATTR_SECURITY_PREFIX_LEN; - else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - prefixlen = XATTR_SYSTEM_PREFIX_LEN; - else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - prefixlen = XATTR_TRUSTED_PREFIX_LEN; - else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - prefixlen = XATTR_USER_PREFIX_LEN; - else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - prefixlen = XATTR_BTRFS_PREFIX_LEN; - else - return -EOPNOTSUPP; - - /* - * The name cannot consist of just prefix - */ - if (len <= prefixlen) - return -EINVAL; + struct inode *inode = d_inode(dentry); - return 0; + name = xattr_full_name(handler, name); + return __btrfs_getxattr(inode, name, buffer, size); } -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, + int flags) { - int ret; + struct inode *inode = d_inode(dentry); - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); + name = xattr_full_name(handler, name); + return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); +} - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - return __btrfs_getxattr(d_inode(dentry), name, buffer, size); +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct dentry *dentry, + const char *name, const void *value, + size_t size, int flags) +{ + name = xattr_full_name(handler, name); + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); } +static const struct xattr_handler btrfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_btrfs_xattr_handler = { + .prefix = XATTR_BTRFS_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set_prop, +}; + +const struct xattr_handler *btrfs_xattr_handlers[] = { + &btrfs_security_xattr_handler, +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &btrfs_trusted_xattr_handler, + &btrfs_user_xattr_handler, + &btrfs_btrfs_xattr_handler, + NULL, +}; + int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - value, size, flags); - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, - flags); + return generic_setxattr(dentry, name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - NULL, 0, XATTR_REPLACE); - - return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, - XATTR_REPLACE); + return generic_removexattr(dentry, name); } static int btrfs_initxattrs(struct inode *inode, @@ -492,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_NOFS); + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { err = -ENOMEM; break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5049608d1388..96807b3d22f5 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); diff --git a/fs/buffer.c b/fs/buffer.c index 82283abb2795..e1632abb4ca9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -134,13 +134,10 @@ __clear_page_buffers(struct page *page) static void buffer_io_error(struct buffer_head *bh, char *msg) { - char b[BDEVNAME_SIZE]; - if (!test_bit(BH_Quiet, &bh->b_state)) printk_ratelimited(KERN_ERR - "Buffer I/O error on dev %s, logical block %llu%s\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr, msg); + "Buffer I/O error on dev %pg, logical block %llu%s\n", + bh->b_bdev, (unsigned long long)bh->b_blocknr, msg); } /* @@ -237,15 +234,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { - char b[BDEVNAME_SIZE]; - printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device %s blocksize: %d\n", bdevname(bdev, b), + printk("device %pg blocksize: %d\n", bdev, 1 << bd_inode->i_blkbits); } out_unlock: @@ -531,10 +526,8 @@ repeat: static void do_thaw_one(struct super_block *sb, void *unused) { - char b[BDEVNAME_SIZE]; while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) - printk(KERN_WARNING "Emergency Thaw on %s\n", - bdevname(sb->s_bdev, b)); + printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } static void do_thaw_all(struct work_struct *work) @@ -999,7 +992,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; /* * XXX: __getblk_slow() can not really deal with failure and @@ -1074,12 +1067,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) * pagecache index. (this comparison is done using sector_t types). */ if (unlikely(index != block >> sizebits)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "%s: requested out-of-range block %llu for " - "device %s\n", + "device %pg\n", __func__, (unsigned long long)block, - bdevname(bdev, b)); + bdev); return -EIO; } @@ -2420,9 +2411,9 @@ EXPORT_SYMBOL(block_commit_write); * unlock the page. * * Direct callers of this function should protect against filesystem freezing - * using sb_start_write() - sb_end_write() functions. + * using sb_start_pagefault() - sb_end_pagefault() functions. */ -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { struct page *page = vmf->page; @@ -2459,26 +2450,6 @@ out_unlock: unlock_page(page); return ret; } -EXPORT_SYMBOL(__block_page_mkwrite); - -int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - int ret; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; - - sb_start_pagefault(sb); - - /* - * Update file times before taking page lock. We may end up failing the - * fault so this update may be superfluous but who really cares... - */ - file_update_time(vma->vm_file); - - ret = __block_page_mkwrite(vma, vmf, get_block); - sb_end_pagefault(sb); - return block_page_mkwrite_return(ret); -} EXPORT_SYMBOL(block_page_mkwrite); /* diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index f601def05bdf..452e98dd7560 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -226,15 +226,9 @@ static ssize_t cachefiles_daemon_write(struct file *file, return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ - data = kmalloc(datalen + 1, GFP_KERNEL); - if (!data) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(data, _data, datalen) != 0) - goto error; - - data[datalen] = '\0'; + data = memdup_user_nul(_data, datalen); + if (IS_ERR(data)) + return PTR_ERR(data); ret = -EINVAL; if (memchr(data, '\0', datalen)) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index aecd0859eacb..9c4b737a54df 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -30,7 +30,7 @@ extern unsigned cachefiles_debug; #define CACHEFILES_DEBUG_KLEAVE 2 #define CACHEFILES_DEBUG_KDEBUG 4 -#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) /* * node records diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index fc1056f5c96a..c4b893453e0e 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -655,6 +655,8 @@ lookup_again: aops = d_backing_inode(object->dentry)->i_mapping->a_ops; if (!aops->bmap) goto check_error; + if (object->dentry->d_sb->s_blocksize > PAGE_SIZE) + goto check_error; object->backer = object->dentry; } else { diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3cbb0e834694..c0f3da3926a0 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ - if (inode->i_sb->s_blocksize > PAGE_SIZE) - goto enobufs; - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; op->op.flags &= FSCACHE_OP_KEEP_FLAGS; @@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ - if (inode->i_sb->s_blocksize > PAGE_SIZE) - goto all_enobufs; - shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; pagevec_init(&pagevec, 0); @@ -885,7 +879,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) loff_t pos, eof; size_t len; void *data; - int ret; + int ret = -ENOBUFS; ASSERT(op != NULL); ASSERT(page != NULL); @@ -905,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); + pos = (loff_t)page->index << PAGE_SHIFT; + + /* We mustn't write more data than we have, so we have to beware of a + * partial page at EOF. + */ + eof = object->fscache.store_limit_l; + if (pos >= eof) + goto error; + /* write the page to the backing filesystem and let it store it in its * own time */ path.mnt = cache->mnt; @@ -912,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred); if (IS_ERR(file)) { ret = PTR_ERR(file); - } else { - pos = (loff_t) page->index << PAGE_SHIFT; - - /* we mustn't write more data than we have, so we have - * to beware of a partial page at EOF */ - eof = object->fscache.store_limit_l; - len = PAGE_SIZE; - if (eof & ~PAGE_MASK) { - ASSERTCMP(pos, <, eof); - if (eof - pos < PAGE_SIZE) { - _debug("cut short %llx to %llx", - pos, eof); - len = eof - pos; - ASSERTCMP(pos + len, ==, eof); - } - } - - data = kmap(page); - ret = __kernel_write(file, data, len, &pos); - kunmap(page); - if (ret != len) - ret = -EIO; - fput(file); + goto error_2; } - if (ret < 0) { - if (ret == -EIO) - cachefiles_io_error_obj( - object, "Write page to backing file failed"); - ret = -ENOBUFS; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); + } } - _leave(" = %d", ret); - return ret; + data = kmap(page); + ret = __kernel_write(file, data, len, &pos); + kunmap(page); + fput(file); + if (ret != len) + goto error_eio; + + _leave(" = 0"); + return 0; + +error_eio: + ret = -EIO; +error_2: + if (ret == -EIO) + cachefiles_io_error_obj(object, + "Write page to backing file failed"); +error: + _leave(" = -ENOBUFS [%d]", ret); + return -ENOBUFS; } /* diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646f10e9..f19708487e2f 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &new_mode); if (ret < 0) @@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = acl ? -EINVAL : 0; goto out; } - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: ret = -EINVAL; @@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1); if (acl) { - size_t len = strlen(POSIX_ACL_XATTR_ACCESS); + size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; - ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS, + ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, len); err = posix_acl_to_xattr(&init_user_ns, acl, tmp_buf, val_size1); @@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_append(pagelist, tmp_buf, val_size1); } if (default_acl) { - size_t len = strlen(POSIX_ACL_XATTR_DEFAULT); + size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; err = ceph_pagelist_encode_string(pagelist, - POSIX_ACL_XATTR_DEFAULT, len); + XATTR_NAME_POSIX_ACL_DEFAULT, len); err = posix_acl_to_xattr(&init_user_ns, default_acl, tmp_buf, val_size2); if (err < 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9d23e788d1df..b7d218a168fb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int ret1; struct address_space *mapping = inode->i_mapping; struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_mask(mapping) & - ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out; @@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, if (i_size_read(inode) == 0) return; page = find_or_create_page(mapping, 0, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, + ~__GFP_FS)); if (!page) return; if (PageUptodate(page)) { diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 834f9f3723fb..a4766ded1ba7 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, const struct ceph_inode_info* ci = cookie_netfs_data; uint16_t klen; - /* use ceph virtual inode (id + snaphot) */ + /* use ceph virtual inode (id + snapshot) */ klen = sizeof(ci->i_vino); if (klen > maxbuf) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 27b566874bc1..c69e1253b47b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1655,9 +1655,8 @@ retry_locked: !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ - (revoking & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { @@ -1971,49 +1970,46 @@ out: } /* - * wait for any uncommitted directory operations to commit. + * wait for any unsafe requests to complete. */ -static int unsafe_dirop_wait(struct inode *inode) +static int unsafe_request_wait(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - if (!S_ISDIR(inode->i_mode)) - return 0; + struct ceph_mds_request *req1 = NULL, *req2 = NULL; + int ret, err = 0; spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_last_entry(head, struct ceph_mds_request, - r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); + if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { + req1 = list_last_entry(&ci->i_unsafe_dirops, + struct ceph_mds_request, + r_unsafe_dir_item); + ceph_mdsc_get_request(req1); + } + if (!list_empty(&ci->i_unsafe_iops)) { + req2 = list_last_entry(&ci->i_unsafe_iops, + struct ceph_mds_request, + r_unsafe_target_item); + ceph_mdsc_get_request(req2); + } + spin_unlock(&ci->i_unsafe_lock); - dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - ret = !wait_for_completion_timeout(&req->r_safe_completion, - ceph_timeout_jiffies(req->r_timeout)); + dout("unsafe_requeset_wait %p wait on tid %llu %llu\n", + inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); + if (req1) { + ret = !wait_for_completion_timeout(&req1->r_safe_completion, + ceph_timeout_jiffies(req1->r_timeout)); if (ret) - ret = -EIO; /* timed out */ - - ceph_mdsc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_first_entry(head, struct ceph_mds_request, - r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - return ret; + err = -EIO; + ceph_mdsc_put_request(req1); + } + if (req2) { + ret = !wait_for_completion_timeout(&req2->r_safe_completion, + ceph_timeout_jiffies(req2->r_timeout)); + if (ret) + err = -EIO; + ceph_mdsc_put_request(req2); + } + return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_dirop_wait(inode); + ret = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0c62868b5c56..3c68e6aee2f0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -34,6 +34,74 @@ * need to wait for MDS acknowledgement. */ +/* + * Calculate the length sum of direct io vectors that can + * be combined into one page vector. + */ +static size_t dio_get_pagev_size(const struct iov_iter *it) +{ + const struct iovec *iov = it->iov; + const struct iovec *iovend = iov + it->nr_segs; + size_t size; + + size = iov->iov_len - it->iov_offset; + /* + * An iov can be page vectored when both the current tail + * and the next base are page aligned. + */ + while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && + (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { + size += iov->iov_len; + } + dout("dio_get_pagevlen len = %zu\n", size); + return size; +} + +/* + * Allocate a page vector based on (@it, @nbytes). + * The return value is the tuple describing a page vector, + * that is (@pages, @page_align, @num_pages). + */ +static struct page ** +dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, + size_t *page_align, int *num_pages) +{ + struct iov_iter tmp_it = *it; + size_t align; + struct page **pages; + int ret = 0, idx, npages; + + align = (unsigned long)(it->iov->iov_base + it->iov_offset) & + (PAGE_SIZE - 1); + npages = calc_pages_for(align, nbytes); + pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL); + if (!pages) { + pages = vmalloc(sizeof(*pages) * npages); + if (!pages) + return ERR_PTR(-ENOMEM); + } + + for (idx = 0; idx < npages; ) { + size_t start; + ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, + npages - idx, &start); + if (ret < 0) + goto fail; + + iov_iter_advance(&tmp_it, ret); + nbytes -= ret; + idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; + } + + BUG_ON(nbytes != 0); + *num_pages = npages; + *page_align = align; + dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); + return pages; +fail: + ceph_put_page_vector(pages, idx, false); + return ERR_PTR(ret); +} /* * Prepare an open request. Preallocate ceph_cap to avoid an @@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, size_t start; ssize_t n; - n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); - if (n < 0) - return n; - - num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + n = dio_get_pagev_size(i); + pages = dio_get_pages_alloc(i, n, &start, &num_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); ret = striped_read(inode, off, n, pages, num_pages, checkeof, @@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, CEPH_OSD_FLAG_WRITE; while (iov_iter_count(from) > 0) { - u64 len = iov_iter_single_seg_count(from); + u64 len = dio_get_pagev_size(from); size_t start; ssize_t n; @@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - n = iov_iter_get_pages_alloc(from, &pages, len, &start); - if (unlikely(n < 0)) { - ret = n; + n = len; + pages = dio_get_pages_alloc(from, len, &start, &num_pages); + if (IS_ERR(pages)) { ceph_osdc_put_request(req); + ret = PTR_ERR(pages); break; } - num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; /* * throw out any page cache pages in this range. this * may block. diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 96d2bd829902..da55eb8bcffa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_unsafe_writes); INIT_LIST_HEAD(&ci->i_unsafe_dirops); + INIT_LIST_HEAD(&ci->i_unsafe_iops); spin_lock_init(&ci->i_unsafe_lock); ci->i_snap_realm = NULL; @@ -1755,7 +1756,7 @@ retry: */ static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, .setxattr = ceph_setxattr, diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 6706bde9ad1b..a2cb0c254060 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, lock_cmd, wait, fl); if (!err) { - err = flock_lock_file_wait(file, fl); + err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on flock_lock_file_wait, undid lock", err); + dout("got %d on locks_lock_file_wait, undid lock", err); } } return err; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 51cb02da75d9..e7b130a637f9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc, mdsc->oldest_tid = req->r_tid; if (dir) { - struct ceph_inode_info *ci = ceph_inode(dir); - ihold(dir); - spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; - list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); - spin_unlock(&ci->i_unsafe_lock); } } @@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc, rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); - if (req->r_unsafe_dir) { + if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); - spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + } + if (req->r_target_inode && req->r_got_unsafe) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_target_item); + spin_unlock(&ci->i_unsafe_lock); + } + if (req->r_unsafe_dir) { iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } @@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; } + /* The inode has cached pages, but it's no longer used. + * we can safely drop it */ + if (wanted == 0 && used == CEPH_CAP_FILE_CACHE && + !(oissued & CEPH_CAP_FILE_CACHE)) { + used = 0; + oissued = 0; + } if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ @@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); } else { - /* try to drop referring dentries */ + /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); d_prune_aliases(inode); dout("trim_caps_cb %p cap %p pruned, count now %d\n", @@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); + INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); INIT_LIST_HEAD(&req->r_wait); @@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, len = sizeof(*head) + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + - sizeof(struct timespec); + sizeof(struct ceph_timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } else { req->r_got_unsafe = true; list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = + ceph_inode(req->r_unsafe_dir); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_dir_item, + &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } } dout("handle_reply tid %lld result %d\n", tid, result); @@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) up_read(&mdsc->snap_rwsem); if (realm) ceph_put_snap_realm(mdsc, realm); + + if (err == 0 && req->r_got_unsafe && req->r_target_inode) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } out_err: mutex_lock(&mdsc->mutex); if (!req->r_aborted) { @@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, return msg; } -static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +static int mds_sign_message(struct ceph_msg *msg) { - struct ceph_mds_session *s = con->private; + struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_sign_message(auth, msg); } -static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +static int mds_check_message_signature(struct ceph_msg *msg) { - struct ceph_mds_session *s = con->private; + struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_check_message_signature(auth, msg); } @@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = { .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, .alloc_msg = mds_alloc_msg, - .sign_message = sign_message, - .check_message_signature = check_message_signature, + .sign_message = mds_sign_message, + .check_message_signature = mds_check_message_signature, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f575eafe2261..ccf11ef0ca87 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -236,6 +236,9 @@ struct ceph_mds_request { struct inode *r_unsafe_dir; struct list_head r_unsafe_dir_item; + /* unsafe requests that modify the target inode */ + struct list_head r_unsafe_target_item; + struct ceph_mds_session *r_session; int r_attempts; /* resend attempts */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada328..ca4d5e8457f1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2f2460d23a06..75b7d125ce66 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -342,6 +342,7 @@ struct ceph_inode_info { struct list_head i_unsafe_writes; /* uncommitted sync writes */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index f4cf200b3c76..6908080e9b6d 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) goto error; /* attach the data */ - key->payload.data = payload; + key->payload.data[0] = payload; ret = 0; error: @@ -52,7 +52,7 @@ error: static void cifs_spnego_key_destroy(struct key *key) { - kfree(key->payload.data); + kfree(key->payload.data[0]); } @@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { - struct cifs_spnego_msg *msg = spnego_key->payload.data; + struct cifs_spnego_msg *msg = spnego_key->payload.data[0]; cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, msg->secblob_len + msg->sesskey_len)); } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1ea780bc6376..3f93125916bf 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) * dereference payload.data! */ if (prep->datalen <= sizeof(key->payload)) { - key->payload.value = 0; - memcpy(&key->payload.value, prep->data, prep->datalen); - key->datalen = prep->datalen; - return 0; + key->payload.data[0] = NULL; + memcpy(&key->payload, prep->data, prep->datalen); + } else { + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + key->payload.data[0] = payload; } - payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); - if (!payload) - return -ENOMEM; - key->payload.data = payload; key->datalen = prep->datalen; return 0; } @@ -76,7 +75,7 @@ static inline void cifs_idmap_key_destroy(struct key *key) { if (key->datalen > sizeof(key->payload)) - kfree(key->payload.data); + kfree(key->payload.data[0]); } static struct key_type cifs_idmap_key_type = { @@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) * it could be. */ ksid = sidkey->datalen <= sizeof(sidkey->payload) ? - (struct cifs_sid *)&sidkey->payload.value : - (struct cifs_sid *)sidkey->payload.data; + (struct cifs_sid *)&sidkey->payload : + (struct cifs_sid *)sidkey->payload.data[0]; ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { @@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; - memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); + memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t)); uid = make_kuid(&init_user_ns, id); if (uid_valid(uid)) fuid = uid; } else { kgid_t gid; gid_t id; - memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); + memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t)); gid = make_kgid(&init_user_ns, id); if (gid_valid(gid)) fgid = gid; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e739950ca084..c4c1169814b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -454,6 +454,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nocase"); if (tcon->retry) seq_puts(s, ",hard"); + if (tcon->use_persistent) + seq_puts(s, ",persistenthandles"); + else if (tcon->use_resilient) + seq_puts(s, ",resilienthandles"); if (tcon->unix_ext) seq_puts(s, ",unix"); else @@ -896,8 +900,7 @@ const struct inode_operations cifs_file_inode_ops = { const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = cifs_follow_link, - .put_link = kfree_put_link, + .get_link = cifs_get_link, .permission = cifs_permission, /* BB add the following two eventually */ /* revalidate: cifs_revalidate, @@ -910,6 +913,59 @@ const struct inode_operations cifs_symlink_inode_ops = { #endif }; +static int cifs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *target_inode = file_inode(dst_file); + struct cifsFileInfo *smb_file_src = src_file->private_data; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink); + unsigned int xid; + int rc; + + cifs_dbg(FYI, "clone range\n"); + + xid = get_xid(); + + if (!src_file->private_data || !dst_file->private_data) { + rc = -EBADF; + cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); + goto out; + } + + /* + * Note: cifs case is easier than btrfs since server responsible for + * checks for proper open modes and file type and if it wants + * server could even support copy of range where source = target + */ + lock_two_nondirectories(target_inode, src_inode); + + if (len == 0) + len = src_inode->i_size - off; + + cifs_dbg(FYI, "about to flush pages\n"); + /* should we flush first and last page first */ + truncate_inode_pages_range(&target_inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len)-1); + + if (target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; + + /* force revalidate of size and timestamps of target file now + that target is updated on the server */ + CIFS_I(target_inode)->time = 0; + /* although unlocking in the reverse order from locking is not + strictly necessary here it is a little cleaner to be consistent */ + unlock_two_nondirectories(src_inode, target_inode); +out: + free_xid(xid); + return rc; +} + const struct file_operations cifs_file_ops = { .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, @@ -921,9 +977,8 @@ const struct file_operations cifs_file_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .llseek = cifs_llseek, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -939,9 +994,9 @@ const struct file_operations cifs_file_strict_ops = { .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, .llseek = cifs_llseek, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -957,9 +1012,8 @@ const struct file_operations cifs_file_direct_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -975,9 +1029,8 @@ const struct file_operations cifs_file_nobrl_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .llseek = cifs_llseek, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -992,9 +1045,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, .llseek = cifs_llseek, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1009,9 +1061,8 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, -#ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -#endif /* CONFIG_CIFS_POSIX */ + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1022,6 +1073,7 @@ const struct file_operations cifs_dir_ops = { .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, }; @@ -1040,7 +1092,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c3cc1609025f..68c4547528c4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,9 +120,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); -extern int cifs_readlink(struct dentry *direntry, char __user *buffer, - int buflen); +extern const char *cifs_get_link(struct dentry *, struct inode *, + struct delayed_call *); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname); extern int cifs_removexattr(struct dentry *, const char *); @@ -131,7 +130,6 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *, extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b406a32deb1f..2b510c537a0d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -493,7 +493,10 @@ struct smb_vol { bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; bool rwpidforward:1; /* pid forward for read/write operations */ - bool nosharesock; + bool nosharesock:1; + bool persistent:1; + bool nopersistent:1; + bool resilient:1; /* noresilient not required since not fored for CA */ unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -895,6 +898,8 @@ struct cifs_tcon { bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool broken_sparse_sup; /* if server or share does not support sparse */ bool need_reconnect:1; /* connection reset, tid now invalid */ + bool use_resilient:1; /* use resilient instead of durable handles */ + bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ @@ -1015,6 +1020,7 @@ struct cifs_fid { __u64 persistent_fid; /* persist file id for smb2 */ __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ + __u8 create_guid[16]; #endif struct cifs_pending_open *pending_open; unsigned int epoch; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 773f4dc77630..ecb0803bdb0e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -87,6 +87,8 @@ enum { Opt_sign, Opt_seal, Opt_noac, Opt_fsc, Opt_mfsymlinks, Opt_multiuser, Opt_sloppy, Opt_nosharesock, + Opt_persistent, Opt_nopersistent, + Opt_resilient, Opt_noresilient, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -169,6 +171,10 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_multiuser, "multiuser" }, { Opt_sloppy, "sloppy" }, { Opt_nosharesock, "nosharesock" }, + { Opt_persistent, "persistenthandles"}, + { Opt_nopersistent, "nopersistenthandles"}, + { Opt_resilient, "resilienthandles"}, + { Opt_noresilient, "noresilienthandles"}, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -1497,6 +1503,33 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nosharesock: vol->nosharesock = true; break; + case Opt_nopersistent: + vol->nopersistent = true; + if (vol->persistent) { + cifs_dbg(VFS, + "persistenthandles mount options conflict\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_persistent: + vol->persistent = true; + if ((vol->nopersistent) || (vol->resilient)) { + cifs_dbg(VFS, + "persistenthandles mount options conflict\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_resilient: + vol->resilient = true; + if (vol->persistent) { + cifs_dbg(VFS, + "persistenthandles mount options conflict\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_noresilient: + vol->resilient = false; /* already the default */ + break; /* Numeric Values */ case Opt_backupuid: @@ -2325,13 +2358,14 @@ static int cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) { int rc = 0; - char *desc, *delim, *payload; + const char *delim, *payload; + char *desc; ssize_t len; struct key *key; struct TCP_Server_Info *server = ses->server; struct sockaddr_in *sa; struct sockaddr_in6 *sa6; - struct user_key_payload *upayload; + const struct user_key_payload *upayload; desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); if (!desc) @@ -2374,14 +2408,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } down_read(&key->sem); - upayload = key->payload.data; + upayload = user_key_payload(key); if (IS_ERR_OR_NULL(upayload)) { rc = upayload ? PTR_ERR(upayload) : -EINVAL; goto out_key_put; } /* find first : in payload */ - payload = (char *)upayload->data; + payload = upayload->data; delim = strnchr(payload, upayload->datalen, ':'); cifs_dbg(FYI, "payload=%s\n", payload); if (!delim) { @@ -2654,6 +2688,42 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); } tcon->seal = volume_info->seal; + tcon->use_persistent = false; + /* check if SMB2 or later, CIFS does not support persistent handles */ + if (volume_info->persistent) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB3 or later required for persistent handles\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#ifdef CONFIG_CIFS_SMB2 + } else if (ses->server->capabilities & + SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) + tcon->use_persistent = true; + else /* persistent handles requested but not supported */ { + cifs_dbg(VFS, + "Persistent handles not supported on share\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#endif /* CONFIG_CIFS_SMB2 */ + } +#ifdef CONFIG_CIFS_SMB2 + } else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY) + && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) + && (volume_info->nopersistent == false)) { + cifs_dbg(FYI, "enabling persistent handles\n"); + tcon->use_persistent = true; +#endif /* CONFIG_CIFS_SMB2 */ + } else if (volume_info->resilient) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB2.1 or later required for resilient handles\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } + tcon->use_resilient = true; + } + /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last @@ -3502,6 +3572,15 @@ try_mount_again: goto mount_fail_check; } +#ifdef CONFIG_CIFS_SMB2 + if ((volume_info->persistent == true) && ((ses->server->capabilities & + SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) { + cifs_dbg(VFS, "persistent handles not supported by server\n"); + rc = -EOPNOTSUPP; + goto mount_fail_check; + } +#endif /* CONFIG_CIFS_SMB2*/ + /* search for existing tcon to this server share */ tcon = cifs_get_tcon(ses, volume_info); if (IS_ERR(tcon)) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 62203c387db4..0a2752b79e72 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, out: if (flock->fl_flags & FL_POSIX && !rc) - rc = posix_lock_file_wait(file, flock); + rc = locks_lock_file_wait(file, flock); return rc; } @@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); INIT_LIST_HEAD(tmplist); @@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, * should have access to this page, we're safe to simply set * PG_locked without checking it first. */ - __set_page_locked(page); + __SetPageLocked(page); rc = add_to_page_cache_locked(page, mapping, page->index, gfp); /* give up if we can't stick it in the cache */ if (rc) { - __clear_page_locked(page); + __ClearPageLocked(page); return rc; } @@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, if (*bytes + PAGE_CACHE_SIZE > rsize) break; - __set_page_locked(page); + __SetPageLocked(page); if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { - __clear_page_locked(page); + __ClearPageLocked(page); break; } list_move_tail(&page->lru, tmplist); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6b66dd5d1540..a329f5ba35aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(struct wait_bit_key *key) +cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 28a77bf1d559..7a3b84e300f8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -34,68 +34,36 @@ #include "cifs_ioctl.h" #include <linux/btrfs.h> -static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff, - bool dup_extents) +static int cifs_file_clone_range(unsigned int xid, struct file *src_file, + struct file *dst_file) { - int rc; - struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); - struct cifs_tcon *target_tcon; - struct fd src_file; struct cifsFileInfo *smb_file_src; - struct inode *src_inode; + struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; + struct cifs_tcon *target_tcon; + int rc; cifs_dbg(FYI, "ioctl clone range\n"); - /* the destination must be opened for writing */ - if (!(dst_file->f_mode & FMODE_WRITE)) { - cifs_dbg(FYI, "file target not open for write\n"); - return -EINVAL; - } - - /* check if target volume is readonly and take reference */ - rc = mnt_want_write_file(dst_file); - if (rc) { - cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); - return rc; - } - src_file = fdget(srcfd); - if (!src_file.file) { - rc = -EBADF; - goto out_drop_write; - } - - if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { - rc = -EBADF; - cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); - goto out_fput; - } - - if ((!src_file.file->private_data) || (!dst_file->private_data)) { + if (!src_file->private_data || !dst_file->private_data) { rc = -EBADF; cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); - goto out_fput; + goto out; } rc = -EXDEV; smb_file_target = dst_file->private_data; - smb_file_src = src_file.file->private_data; + smb_file_src = src_file->private_data; src_tcon = tlink_tcon(smb_file_src->tlink); target_tcon = tlink_tcon(smb_file_target->tlink); - /* check if source and target are on same tree connection */ - if (src_tcon != target_tcon) { - cifs_dbg(VFS, "file copy src and target on different volume\n"); - goto out_fput; + if (src_tcon->ses != target_tcon->ses) { + cifs_dbg(VFS, "source and target of copy not on same server\n"); + goto out; } - src_inode = file_inode(src_file.file); - rc = -EINVAL; - if (S_ISDIR(src_inode->i_mode)) - goto out_fput; - /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants @@ -103,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, */ lock_two_nondirectories(target_inode, src_inode); - /* determine range to clone */ - rc = -EINVAL; - if (off + len > src_inode->i_size || off + len < off) - goto out_unlock; - if (len == 0) - len = src_inode->i_size - off; - cifs_dbg(FYI, "about to flush pages\n"); /* should we flush first and last page first */ - truncate_inode_pages_range(&target_inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len)-1); + truncate_inode_pages(&target_inode->i_data, 0); - if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) - rc = target_tcon->ses->server->ops->duplicate_extents(xid, - smb_file_src, smb_file_target, off, len, destoff); - else if (!dup_extents && target_tcon->ses->server->ops->clone_range) + if (target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, - smb_file_src, smb_file_target, off, len, destoff); + smb_file_src, smb_file_target, 0, src_inode->i_size, 0); else rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ CIFS_I(target_inode)->time = 0; -out_unlock: /* although unlocking in the reverse order from locking is not strictly necessary here it is a little cleaner to be consistent */ unlock_two_nondirectories(src_inode, target_inode); +out: + return rc; +} + +static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, + unsigned long srcfd) +{ + int rc; + struct fd src_file; + struct inode *src_inode; + + cifs_dbg(FYI, "ioctl clone range\n"); + /* the destination must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) { + cifs_dbg(FYI, "file target not open for write\n"); + return -EINVAL; + } + + /* check if target volume is readonly and take reference */ + rc = mnt_want_write_file(dst_file); + if (rc) { + cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); + return rc; + } + + src_file = fdget(srcfd); + if (!src_file.file) { + rc = -EBADF; + goto out_drop_write; + } + + if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { + rc = -EBADF; + cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); + goto out_fput; + } + + src_inode = file_inode(src_file.file); + rc = -EINVAL; + if (S_ISDIR(src_inode->i_mode)) + goto out_fput; + + rc = cifs_file_clone_range(xid, src_file.file, dst_file); + out_fput: fdput(src_file); out_drop_write: @@ -251,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); - break; - case BTRFS_IOC_CLONE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + rc = cifs_ioctl_clone(xid, filep, arg); break; case CIFS_IOC_SET_INTEGRITY: if (pSMBFile == NULL) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e3548f73bdea..062c2375549a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -627,9 +627,9 @@ cifs_hl_exit: } const char * -cifs_follow_link(struct dentry *direntry, void **cookie) +cifs_get_link(struct dentry *direntry, struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; @@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie) struct cifs_tcon *tcon; struct TCP_Server_Info *server; + if (!direntry) + return ERR_PTR(-ECHILD); + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); @@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie) kfree(target_path); return ERR_PTR(rc); } - return *cookie = target_path; + set_delayed_call(done, kfree_link, target_path); + return target_path; } int diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b1eede3678a9..0557c45e9c33 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); dentry = d_hash_and_lookup(parent, name); - if (unlikely(IS_ERR(dentry))) + if (IS_ERR(dentry)) return; if (dentry) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index bce6fdcd5d48..59727e32ed0f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data) goto out; } - msg = spnego_key->payload.data; + msg = spnego_key->payload.data[0]; /* * check version field to make sure that cifs.upcall is * sending us a response in an expected form diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 2ab297dae5a7..f9e766f464be 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, struct smb2_file_all_info *smb2_data = NULL; __u8 smb2_oplock[17]; struct cifs_fid *fid = oparms->fid; + struct network_resiliency_req nr_ioctl_req; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); if (smb2_path == NULL) { @@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (rc) goto out; + + if (oparms->tcon->use_resilient) { + nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */ + nr_ioctl_req.Reserved = 0; + rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, + fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true, + (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), + NULL, NULL /* no return info */); + if (rc == -EOPNOTSUPP) { + cifs_dbg(VFS, + "resiliency not supported by server, disabling\n"); + oparms->tcon->use_resilient = false; + } else if (rc) + cifs_dbg(FYI, "error %d setting resiliency\n", rc); + + rc = 0; + } + if (buf) { /* open response does not have IndexNumber field - get it */ rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 18da19f4f811..53ccdde6ff18 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -810,7 +810,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, &eof, false); } -#ifdef CONFIG_CIFS_SMB311 static int smb2_duplicate_extents(const unsigned int xid, struct cifsFileInfo *srcfile, @@ -854,8 +853,6 @@ smb2_duplicate_extents(const unsigned int xid, duplicate_extents_out: return rc; } -#endif /* CONFIG_CIFS_SMB311 */ - static int smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, @@ -1703,6 +1700,7 @@ struct smb_version_operations smb30_operations = { .create_lease_buf = smb3_create_lease_buf, .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, + .duplicate_extents = smb2_duplicate_extents, .validate_negotiate = smb3_validate_negotiate, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, @@ -1840,7 +1838,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1860,7 +1858,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1881,7 +1879,7 @@ struct smb_version_values smb302_values = { struct smb_version_values smb311_values = { .version_string = SMB311_VERSION_STRING, .protocol_id = SMB311_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 597a417ba94d..767555518d40 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -660,7 +660,7 @@ ssetup_ntlmssp_authenticate: goto ssetup_exit; } - msg = spnego_key->payload.data; + msg = spnego_key->payload.data[0]; /* * check version field to make sure that cifs.upcall is * sending us a response in an expected form @@ -1151,13 +1151,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, return 0; } +static struct create_durable_v2 * +create_durable_v2_buf(struct cifs_fid *pfid) +{ + struct create_durable_v2 *buf; + + buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_durable_v2, dcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_durable_v2, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + + buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ + buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); + get_random_bytes(buf->dcontext.CreateGuid, 16); + memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); + + /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = '2'; + buf->Name[3] = 'Q'; + return buf; +} + +static struct create_durable_handle_reconnect_v2 * +create_reconnect_durable_v2_buf(struct cifs_fid *fid) +{ + struct create_durable_handle_reconnect_v2 *buf; + + buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2), + GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = + cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2, + dcontext)); + buf->ccontext.DataLength = + cpu_to_le32(sizeof(struct durable_reconnect_context_v2)); + buf->ccontext.NameOffset = + cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2, + Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + + buf->dcontext.Fid.PersistentFileId = fid->persistent_fid; + buf->dcontext.Fid.VolatileFileId = fid->volatile_fid; + buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); + memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16); + + /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = '2'; + buf->Name[3] = 'C'; + return buf; +} + static int -add_durable_context(struct kvec *iov, unsigned int *num_iovec, +add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms) { struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; + iov[num].iov_base = create_durable_v2_buf(oparms->fid); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct create_durable_v2); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = + cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2)); + inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2)); + *num_iovec = num + 1; + return 0; +} + +static int +add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, + struct cifs_open_parms *oparms) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + /* indicate that we don't need to relock the file */ + oparms->reconnect = false; + + iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = + cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, + sizeof(struct create_durable_handle_reconnect_v2)); + inc_rfc1001_len(&req->hdr, + sizeof(struct create_durable_handle_reconnect_v2)); + *num_iovec = num + 1; + return 0; +} + +static int +add_durable_context(struct kvec *iov, unsigned int *num_iovec, + struct cifs_open_parms *oparms, bool use_persistent) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + if (use_persistent) { + if (oparms->reconnect) + return add_durable_reconnect_v2_context(iov, num_iovec, + oparms); + else + return add_durable_v2_context(iov, num_iovec, oparms); + } + if (oparms->reconnect) { iov[num].iov_base = create_reconnect_durable_buf(oparms->fid); /* indicate that we don't need to relock the file */ @@ -1275,7 +1392,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, ccontext->Next = cpu_to_le32(server->vals->create_lease_size); } - rc = add_durable_context(iov, &num_iovecs, oparms); + + rc = add_durable_context(iov, &num_iovecs, oparms, + tcon->use_persistent); if (rc) { cifs_small_buf_release(req); kfree(copy_path); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 451108284a2f..4af52780ec35 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -590,6 +590,44 @@ struct create_durable { } Data; } __packed; +/* See MS-SMB2 2.2.13.2.11 */ +/* Flags */ +#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002 +struct durable_context_v2 { + __le32 Timeout; + __le32 Flags; + __u64 Reserved; + __u8 CreateGuid[16]; +} __packed; + +struct create_durable_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct durable_context_v2 dcontext; +} __packed; + +/* See MS-SMB2 2.2.13.2.12 */ +struct durable_reconnect_context_v2 { + struct { + __u64 PersistentFileId; + __u64 VolatileFileId; + } Fid; + __u8 CreateGuid[16]; + __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */ +} __packed; + +/* See MS-SMB2 2.2.14.2.12 */ +struct durable_reconnect_context_v2_rsp { + __le32 Timeout; + __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */ +} __packed; + +struct create_durable_handle_reconnect_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct durable_reconnect_context_v2 dcontext; +} __packed; + #define COPY_CHUNK_RES_KEY_SIZE 24 struct resume_key_req { char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; @@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp { /* Integrity flags for above */ #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 +/* See MS-SMB2 2.2.31.3 */ +struct network_resiliency_req { + __le32 Timeout; + __le32 Reserved; +} __packed; +/* There is no buffer for the response ie no struct network_resiliency_rsp */ + struct validate_negotiate_info_req { __le32 Capabilities; diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index a639d0dab453..f996daeea271 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -90,7 +90,7 @@ #define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064 /* Retrieve an opaque file reference for server-side data movement ie copy */ #define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078 -#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ +#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ #define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index ff9e1f8b16a4..f5dc2f0df4ad 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #endif /* CONFIG_CIFS_ACL */ } else { int temp; - temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); + temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); if (temp == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) @@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "set POSIX ACL not supported\n"); #endif - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, @@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, @@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 7740b1c871c1..1bfb7ba4e85e 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -8,6 +8,7 @@ #include <linux/coda.h> #include <linux/coda_psdev.h> +#include <linux/pagemap.h> #include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) @@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) static const struct inode_operations coda_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = coda_setattr, }; @@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) inode->i_fop = &coda_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &coda_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &coda_symlink_aops; inode->i_mapping = &inode->i_data; } else diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b87a3..57e81cbba0fa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index ab94ef63caef..03736e20d720 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = kmap(page); + char *p = page_address(page); cii = ITOC(inode); @@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page) if (error) goto fail; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return error; } diff --git a/fs/compat.c b/fs/compat.c index 6fd272d455e4..a71936a3f4cb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, const void __user *, data) { char *kernel_type; - unsigned long data_page; + void *options; char *kernel_dev; int retval; @@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, if (IS_ERR(kernel_dev)) goto out1; - retval = copy_mount_options(data, &data_page); - if (retval < 0) + options = copy_mount_options(data); + retval = PTR_ERR(options); + if (IS_ERR(options)) goto out2; - retval = -EINVAL; - - if (kernel_type && data_page) { + if (kernel_type && options) { if (!strcmp(kernel_type, NCPFS_NAME)) { - do_ncp_super_data_conv((void *)data_page); + do_ncp_super_data_conv(options); } else if (!strcmp(kernel_type, NFS4_NAME)) { - if (do_nfs4_super_data_conv((void *) data_page)) + retval = -EINVAL; + if (do_nfs4_super_data_conv(options)) goto out3; } } - retval = do_mount(kernel_dev, dir_name, kernel_type, - flags, (void*)data_page); + retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options); out3: - free_page(data_page); + kfree(options); out2: kfree(kernel_dev); out1: diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 48851f6ea6ec..a5b8eb69a8f4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,6 +58,8 @@ #include <linux/atalk.h> #include <linux/gfp.h> +#include "internal.h" + #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> @@ -115,19 +117,38 @@ #include <asm/fbio.h> #endif -static int w_long(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) +#define convert_in_user(srcptr, dstptr) \ +({ \ + typeof(*srcptr) val; \ + \ + get_user(val, srcptr) || put_user(val, dstptr); \ +}) + +static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - mm_segment_t old_fs = get_fs(); int err; - unsigned long val; - set_fs (KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&val); - set_fs (old_fs); - if (!err && put_user(val, argp)) + err = security_file_ioctl(file, cmd, arg); + if (err) + return err; + + return vfs_ioctl(file, cmd, arg); +} + +static int w_long(struct file *file, + unsigned int cmd, compat_ulong_t __user *argp) +{ + int err; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); + + if (valp == NULL) return -EFAULT; - return err; + err = do_ioctl(file, cmd, (unsigned long)valp); + if (err) + return err; + if (convert_in_user(valp, argp)) + return -EFAULT; + return 0; } struct compat_video_event { @@ -139,23 +160,23 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, - struct compat_video_event __user *up) +static int do_video_get_event(struct file *file, + unsigned int cmd, struct compat_video_event __user *up) { - struct video_event kevent; - mm_segment_t old_fs = get_fs(); + struct video_event __user *kevent = + compat_alloc_user_space(sizeof(*kevent)); int err; - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long) &kevent); - set_fs(old_fs); + if (kevent == NULL) + return -EFAULT; + err = do_ioctl(file, cmd, (unsigned long)kevent); if (!err) { - err = put_user(kevent.type, &up->type); - err |= put_user(kevent.timestamp, &up->timestamp); - err |= put_user(kevent.u.size.w, &up->u.size.w); - err |= put_user(kevent.u.size.h, &up->u.size.h); - err |= put_user(kevent.u.size.aspect_ratio, + err = convert_in_user(&kevent->type, &up->type); + err |= convert_in_user(&kevent->timestamp, &up->timestamp); + err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); + err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); + err |= convert_in_user(&kevent->u.size.aspect_ratio, &up->u.size.aspect_ratio); if (err) err = -EFAULT; @@ -169,8 +190,8 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, - struct compat_video_still_picture __user *up) +static int do_video_stillpicture(struct file *file, + unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; compat_uptr_t fp; @@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -200,8 +221,8 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, - struct compat_video_spu_palette __user *up) +static int do_video_set_spu_palette(struct file *file, + unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; compat_uptr_t palp; @@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; if (interface_id != 'S') - return sys_ioctl(fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct - compat_sg_req_info __user *o) +static int sg_grt_trans(struct file *file, + unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = sys_ioctl(fd,cmd,(unsigned long)r); + err = do_ioctl(file, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -412,8 +433,8 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, - struct sock_fprog32 __user *u_fprog32) +static int ppp_sock_fprog_ioctl_trans(struct file *file, + unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; @@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, else cmd = PPPIOCSACTIVE; - return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -451,7 +472,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -512,12 +533,13 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +static int mt_ioctl_trans(struct file *file, + unsigned int cmd, void __user *argp) { - mm_segment_t old_fs = get_fs(); - struct mtget get; + /* NULL initialization to make gcc shut up */ + struct mtget __user *get = NULL; struct mtget32 __user *umget32; - struct mtpos pos; + struct mtpos __user *pos = NULL; struct mtpos32 __user *upos32; unsigned long kcmd; void *karg; @@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) switch(cmd) { case MTIOCPOS32: kcmd = MTIOCPOS; - karg = &pos; + pos = compat_alloc_user_space(sizeof(*pos)); + karg = pos; break; default: /* MTIOCGET32 */ kcmd = MTIOCGET; - karg = &get; + get = compat_alloc_user_space(sizeof(*get)); + karg = get; break; } - set_fs (KERNEL_DS); - err = sys_ioctl (fd, kcmd, (unsigned long)karg); - set_fs (old_fs); + if (karg == NULL) + return -EFAULT; + err = do_ioctl(file, kcmd, (unsigned long)karg); if (err) return err; switch (cmd) { case MTIOCPOS32: upos32 = argp; - err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: umget32 = argp; - err = __put_user(get.mt_type, &umget32->mt_type); - err |= __put_user(get.mt_resid, &umget32->mt_resid); - err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); - err |= __put_user(get.mt_gstat, &umget32->mt_gstat); - err |= __put_user(get.mt_erreg, &umget32->mt_erreg); - err |= __put_user(get.mt_fileno, &umget32->mt_fileno); - err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + err = convert_in_user(&get->mt_type, &umget32->mt_type); + err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); + err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); + err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat); + err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); + err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); + err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); break; } return err ? -EFAULT: 0; @@ -605,42 +629,41 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, - struct serial_struct32 __user *ss32) +static int serial_struct_ioctl(struct file *file, + unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; int err; - struct serial_struct ss; - mm_segment_t oldseg = get_fs(); + struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss)); __u32 udata; unsigned int base; + unsigned char *iomem_base; + if (ss == NULL) + return -EFAULT; if (cmd == TIOCSSERIAL) { - if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) - return -EFAULT; - if (__get_user(udata, &ss32->iomem_base)) + if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) || + get_user(udata, &ss32->iomem_base)) return -EFAULT; - ss.iomem_base = compat_ptr(udata); - if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __get_user(ss.port_high, &ss32->port_high)) + iomem_base = compat_ptr(udata); + if (put_user(iomem_base, &ss->iomem_base) || + convert_in_user(&ss32->iomem_reg_shift, + &ss->iomem_reg_shift) || + convert_in_user(&ss32->port_high, &ss->port_high) || + put_user(0UL, &ss->iomap_base)) return -EFAULT; - ss.iomap_base = 0UL; } - set_fs(KERNEL_DS); - err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); - set_fs(oldseg); + err = do_ioctl(file, cmd, (unsigned long)ss); if (cmd == TIOCGSERIAL && err >= 0) { - if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) || + get_user(iomem_base, &ss->iomem_base)) return -EFAULT; - base = (unsigned long)ss.iomem_base >> 32 ? - 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; - if (__put_user(base, &ss32->iomem_base) || - __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __put_user(ss.port_high, &ss32->port_high)) + base = (unsigned long)iomem_base >> 32 ? + 0xffffffff : (unsigned)(unsigned long)iomem_base; + if (put_user(base, &ss32->iomem_base) || + convert_in_user(&ss->iomem_reg_shift, + &ss32->iomem_reg_shift) || + convert_in_user(&ss->port_high, &ss32->port_high)) return -EFAULT; } return err; @@ -674,8 +697,8 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_rdwr_ioctl_data32 __user *udata) +static int do_i2c_rdwr_ioctl(struct file *file, + unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; @@ -686,7 +709,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; - if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS) + if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) return -EINVAL; if (get_user(datap, &udata->msgs)) @@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_smbus_ioctl_data32 __user *udata) +static int do_i2c_smbus_ioctl(struct file *file, + unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; compat_caddr_t datap; @@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +static int rtc_ioctl(struct file *file, + unsigned cmd, void __user *argp) { - mm_segment_t oldfs = get_fs(); - compat_ulong_t val32; - unsigned long kval; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); int ret; + if (valp == NULL) + return -EFAULT; switch (cmd) { case RTC_IRQP_READ32: case RTC_EPOCH_READ32: - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)&kval); - set_fs(oldfs); + (unsigned long)valp); if (ret) return ret; - val32 = kval; - return put_user(val32, (unsigned int __user *)argp); + return convert_in_user(valp, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1284,12 +1305,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* NBD */ -COMPATIBLE_IOCTL(NBD_DO_IT) -COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) -COMPATIBLE_IOCTL(NBD_CLEAR_QUE) -COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) -COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) @@ -1436,53 +1451,53 @@ IGNORE_IOCTL(FBIOGCURSOR32) * a compat_ioctl operation in the place that handleѕ the * ioctl for the native case. */ -static long do_ioctl_trans(int fd, unsigned int cmd, +static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { void __user *argp = compat_ptr(arg); switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(fd, cmd, argp); + return ppp_gidle(file, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(fd, cmd, argp); + return ppp_scompress(file, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(fd, cmd, argp); + return sg_ioctl_trans(file, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(fd, cmd, argp); + return sg_grt_trans(file, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(fd, cmd, argp); + return mt_ioctl_trans(file, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(fd, cmd, argp); + return serial_struct_ioctl(file, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(fd, cmd, argp); + return w_long(file, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(fd, cmd, argp); + return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(fd, cmd, argp); + return rtc_ioctl(file, cmd, argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(fd, cmd, argp); + return do_video_get_event(file, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(fd, cmd, argp); + return do_video_stillpicture(file, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(fd, cmd, argp); + return do_video_set_spu_palette(file, cmd, argp); } /* @@ -1508,12 +1523,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* NBD */ - case NBD_SET_SOCK: - case NBD_SET_BLKSIZE: - case NBD_SET_SIZE: - case NBD_SET_SIZE_BLOCKS: - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } return -ENOIOCTLCMD; @@ -1580,6 +1590,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, goto out_fput; #endif + case FICLONE: + case FICLONERANGE: + case FIDEDUPERANGE: + goto do_ioctl; + case FIBMAP: case FIGETBSZ: case FIONREAD: @@ -1602,7 +1617,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; - error = do_ioctl_trans(fd, cmd, arg, f.file); + error = do_ioctl_trans(cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b65d1ef532d5..ccc31fa6f1a7 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -53,13 +53,14 @@ struct configfs_dirent { #define CONFIGFS_ROOT 0x0001 #define CONFIGFS_DIR 0x0002 #define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_BIN_ATTR 0x0008 #define CONFIGFS_ITEM_LINK 0x0020 #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 -#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; @@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_create_bin_file(struct config_item *, + const struct configfs_bin_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int); extern int configfs_dirent_is_ready(struct configfs_dirent *); @@ -88,7 +91,7 @@ extern void configfs_release_fs(void); extern struct rw_semaphore configfs_rename_sem; extern const struct file_operations configfs_dir_operations; extern const struct file_operations configfs_file_operations; -extern const struct file_operations bin_fops; +extern const struct file_operations configfs_bin_file_operations; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; @@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry) return ((struct configfs_attribute *) sd->s_element); } +static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry) +{ + struct configfs_attribute *attr = to_attr(dentry); + + return container_of(attr, struct configfs_bin_attribute, cb_attr); +} + static inline struct config_item *configfs_get_config_item(struct dentry *dentry) { struct config_item * item = NULL; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index c81ce7f200a6..cab612b2ae76 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode) inode->i_fop = &configfs_file_operations; } +static void configfs_init_bin_file(struct inode *inode) +{ + inode->i_size = 0; + inode->i_fop = &configfs_bin_file_operations; +} + static void init_symlink(struct inode * inode) { inode->i_op = &configfs_symlink_inode_operations; @@ -423,7 +429,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den spin_unlock(&configfs_dirent_lock); error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, - configfs_init_file); + (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? + configfs_init_bin_file : + configfs_init_file); if (error) { configfs_put(sd); return error; @@ -583,6 +591,7 @@ static int populate_attrs(struct config_item *item) { struct config_item_type *t = item->ci_type; struct configfs_attribute *attr; + struct configfs_bin_attribute *bin_attr; int error = 0; int i; @@ -594,6 +603,13 @@ static int populate_attrs(struct config_item *item) break; } } + if (t->ct_bin_attrs) { + for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) { + error = configfs_create_bin_file(item, bin_attr); + if (error) + break; + } + } if (error) detach_attrs(item); @@ -1054,11 +1070,55 @@ out: return ret; } +static int configfs_do_depend_item(struct dentry *subsys_dentry, + struct config_item *target) +{ + struct configfs_dirent *p; + int ret; + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +static inline struct configfs_dirent * +configfs_find_subsys_dentry(struct configfs_dirent *root_sd, + struct config_item *subsys_item) +{ + struct configfs_dirent *p; + struct configfs_dirent *ret = NULL; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR && + p->s_element == subsys_item) { + ret = p; + break; + } + } + + return ret; +} + + int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target) { int ret; - struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct configfs_dirent *subsys_sd; struct config_item *s_item = &subsys->su_group.cg_item; struct dentry *root; @@ -1077,39 +1137,15 @@ int configfs_depend_item(struct configfs_subsystem *subsys, */ mutex_lock(&d_inode(root)->i_mutex); - root_sd = root->d_fsdata; - - list_for_each_entry(p, &root_sd->s_children, s_sibling) { - if (p->s_type & CONFIGFS_DIR) { - if (p->s_element == s_item) { - subsys_sd = p; - break; - } - } - } - + subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { ret = -ENOENT; goto out_unlock_fs; } /* Ok, now we can trust subsys/s_item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); - spin_lock(&configfs_dirent_lock); - /* Scan the tree, return 0 if found */ - ret = configfs_depend_prep(subsys_sd->s_dentry, target); - if (ret) - goto out_unlock_dirent_lock; - - /* - * We are sure that the item is not about to be removed by rmdir(), and - * not in the middle of attachment by mkdir(). - */ - p = target->ci_dentry->d_fsdata; - p->s_dependent_count += 1; - -out_unlock_dirent_lock: - spin_unlock(&configfs_dirent_lock); out_unlock_fs: mutex_unlock(&d_inode(root)->i_mutex); @@ -1128,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item); * configfs_depend_item() because we know that that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ -void configfs_undepend_item(struct configfs_subsystem *subsys, - struct config_item *target) +void configfs_undepend_item(struct config_item *target) { struct configfs_dirent *sd; @@ -1152,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys, } EXPORT_SYMBOL(configfs_undepend_item); +/* + * caller_subsys is a caller's subsystem not target's. This is used to + * determine if we should lock root and check subsys or not. When we are + * in the same subsystem as our target there is no need to do locking as + * we know that subsys is valid and is not unregistered during this function + * as we are called from callback of one of his children and VFS holds a lock + * on some inode. Otherwise we have to lock our root to ensure that target's + * subsystem it is not unregistered during this function. + */ +int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, + struct config_item *target) +{ + struct configfs_subsystem *target_subsys; + struct config_group *root, *parent; + struct configfs_dirent *subsys_sd; + int ret = -ENOENT; + + /* Disallow this function for configfs root */ + if (configfs_is_root(target)) + return -EINVAL; + + parent = target->ci_group; + /* + * This may happen when someone is trying to depend root + * directory of some subsystem + */ + if (configfs_is_root(&parent->cg_item)) { + target_subsys = to_configfs_subsystem(to_config_group(target)); + root = parent; + } else { + target_subsys = parent->cg_subsys; + /* Find a cofnigfs root as we may need it for locking */ + for (root = parent; !configfs_is_root(&root->cg_item); + root = root->cg_item.ci_group) + ; + } + + if (target_subsys != caller_subsys) { + /* + * We are in other configfs subsystem, so we have to do + * additional locking to prevent other subsystem from being + * unregistered + */ + mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + + /* + * As we are trying to depend item from other subsystem + * we have to check if this subsystem is still registered + */ + subsys_sd = configfs_find_subsys_dentry( + root->cg_item.ci_dentry->d_fsdata, + &target_subsys->su_group.cg_item); + if (!subsys_sd) + goto out_root_unlock; + } else { + subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata; + } + + /* Now we can execute core of depend item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); + + if (target_subsys != caller_subsys) +out_root_unlock: + /* + * We were called from subsystem other than our target so we + * took some locks so now it's time to release them + */ + mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item_unlocked); + static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; @@ -1636,6 +1744,116 @@ const struct file_operations configfs_dir_operations = { .iterate = configfs_readdir, }; +/** + * configfs_register_group - creates a parent-child relation between two groups + * @parent_group: parent group + * @group: child group + * + * link groups, creates dentry for the child and attaches it to the + * parent dentry. + * + * Return: 0 on success, negative errno code on error + */ +int configfs_register_group(struct config_group *parent_group, + struct config_group *group) +{ + struct configfs_subsystem *subsys = parent_group->cg_subsys; + struct dentry *parent; + int ret; + + mutex_lock(&subsys->su_mutex); + link_group(parent_group, group); + mutex_unlock(&subsys->su_mutex); + + parent = parent_group->cg_item.ci_dentry; + + mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + ret = create_default_group(parent_group, group); + if (!ret) { + spin_lock(&configfs_dirent_lock); + configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + } + mutex_unlock(&d_inode(parent)->i_mutex); + return ret; +} +EXPORT_SYMBOL(configfs_register_group); + +/** + * configfs_unregister_group() - unregisters a child group from its parent + * @group: parent group to be unregistered + * + * Undoes configfs_register_group() + */ +void configfs_unregister_group(struct config_group *group) +{ + struct configfs_subsystem *subsys = group->cg_subsys; + struct dentry *dentry = group->cg_item.ci_dentry; + struct dentry *parent = group->cg_item.ci_parent->ci_dentry; + + mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + spin_lock(&configfs_dirent_lock); + configfs_detach_prep(dentry, NULL); + spin_unlock(&configfs_dirent_lock); + + configfs_detach_group(&group->cg_item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); + d_delete(dentry); + mutex_unlock(&d_inode(parent)->i_mutex); + + dput(dentry); + + mutex_lock(&subsys->su_mutex); + unlink_group(group); + mutex_unlock(&subsys->su_mutex); +} +EXPORT_SYMBOL(configfs_unregister_group); + +/** + * configfs_register_default_group() - allocates and registers a child group + * @parent_group: parent group + * @name: child group name + * @item_type: child item type description + * + * boilerplate to allocate and register a child group with its parent. We need + * kzalloc'ed memory because child's default_group is initially empty. + * + * Return: allocated config group or ERR_PTR() on error + */ +struct config_group * +configfs_register_default_group(struct config_group *parent_group, + const char *name, + struct config_item_type *item_type) +{ + int ret; + struct config_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + config_group_init_type_name(group, name, item_type); + + ret = configfs_register_group(parent_group, group); + if (ret) { + kfree(group); + return ERR_PTR(ret); + } + return group; +} +EXPORT_SYMBOL(configfs_register_default_group); + +/** + * configfs_unregister_default_group() - unregisters and frees a child group + * @group: the group to act on + */ +void configfs_unregister_default_group(struct config_group *group) +{ + configfs_unregister_group(group); + kfree(group); +} +EXPORT_SYMBOL(configfs_unregister_default_group); + int configfs_register_subsystem(struct configfs_subsystem *subsys) { int err; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 403269ffcdf3..3687187c8ea5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/mutex.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <linux/configfs.h> @@ -48,6 +49,10 @@ struct configfs_buffer { struct configfs_item_operations * ops; struct mutex mutex; int needs_read_fill; + bool read_in_progress; + bool write_in_progress; + char *bin_buffer; + int bin_buffer_size; }; @@ -65,7 +70,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf { struct configfs_attribute * attr = to_attr(dentry); struct config_item * item = to_item(dentry->d_parent); - struct configfs_item_operations * ops = buffer->ops; int ret = 0; ssize_t count; @@ -74,7 +78,8 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf if (!buffer->page) return -ENOMEM; - count = ops->show_attribute(item,attr,buffer->page); + count = attr->show(item, buffer->page); + buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); if (count >= 0) @@ -123,6 +128,87 @@ out: return retval; } +/** + * configfs_read_bin_file - read a binary attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read a binary attribute file. The attribute + * descriptor is in the file's ->d_fsdata. The target item is in the + * directory's ->d_fsdata. + * + * We check whether we need to refill the buffer. If so we will + * call the attributes' attr->read() twice. The first time we + * will pass a NULL as a buffer pointer, which the attributes' method + * will use to return the size of the buffer required. If no error + * occurs we will allocate the buffer using vmalloc and call + * attr->read() again passing that buffer as an argument. + * Then we just copy to user-space using simple_read_from_buffer. + */ + +static ssize_t +configfs_read_bin_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct configfs_buffer *buffer = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct config_item *item = to_item(dentry->d_parent); + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + ssize_t retval = 0; + ssize_t len = min_t(size_t, count, PAGE_SIZE); + + mutex_lock(&buffer->mutex); + + /* we don't support switching read/write modes */ + if (buffer->write_in_progress) { + retval = -ETXTBSY; + goto out; + } + buffer->read_in_progress = 1; + + if (buffer->needs_read_fill) { + /* perform first read with buf == NULL to get extent */ + len = bin_attr->read(item, NULL, 0); + if (len <= 0) { + retval = len; + goto out; + } + + /* do not exceed the maximum value */ + if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { + retval = -EFBIG; + goto out; + } + + buffer->bin_buffer = vmalloc(len); + if (buffer->bin_buffer == NULL) { + retval = -ENOMEM; + goto out; + } + buffer->bin_buffer_size = len; + + /* perform second read to fill buffer */ + len = bin_attr->read(item, buffer->bin_buffer, len); + if (len < 0) { + retval = len; + vfree(buffer->bin_buffer); + buffer->bin_buffer_size = 0; + buffer->bin_buffer = NULL; + goto out; + } + + buffer->needs_read_fill = 0; + } + + retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer, + buffer->bin_buffer_size); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + /** * fill_write_buffer - copy buffer from userspace. @@ -171,9 +257,8 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size { struct configfs_attribute * attr = to_attr(dentry); struct config_item * item = to_item(dentry->d_parent); - struct configfs_item_operations * ops = buffer->ops; - return ops->store_attribute(item,attr,buffer->page,count); + return attr->store(item, buffer->page, count); } @@ -210,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof return len; } -static int check_perm(struct inode * inode, struct file * file) +/** + * configfs_write_bin_file - write a binary attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Writing to a binary attribute file is similar to a normal read. + * We buffer the consecutive writes (binary attribute files do not + * support lseek) in a continuously growing buffer, but we don't + * commit until the close of the file. + */ + +static ssize_t +configfs_write_bin_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct configfs_buffer *buffer = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + void *tbuf = NULL; + ssize_t len; + + mutex_lock(&buffer->mutex); + + /* we don't support switching read/write modes */ + if (buffer->read_in_progress) { + len = -ETXTBSY; + goto out; + } + buffer->write_in_progress = 1; + + /* buffer grows? */ + if (*ppos + count > buffer->bin_buffer_size) { + + if (bin_attr->cb_max_size && + *ppos + count > bin_attr->cb_max_size) { + len = -EFBIG; + } + + tbuf = vmalloc(*ppos + count); + if (tbuf == NULL) { + len = -ENOMEM; + goto out; + } + + /* copy old contents */ + if (buffer->bin_buffer) { + memcpy(tbuf, buffer->bin_buffer, + buffer->bin_buffer_size); + vfree(buffer->bin_buffer); + } + + /* clear the new area */ + memset(tbuf + buffer->bin_buffer_size, 0, + *ppos + count - buffer->bin_buffer_size); + buffer->bin_buffer = tbuf; + buffer->bin_buffer_size = *ppos + count; + } + + len = simple_write_to_buffer(buffer->bin_buffer, + buffer->bin_buffer_size, ppos, buf, count); + if (len > 0) + *ppos += len; +out: + mutex_unlock(&buffer->mutex); + return len; +} + +static int check_perm(struct inode * inode, struct file * file, int type) { struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); struct configfs_attribute * attr = to_attr(file->f_path.dentry); + struct configfs_bin_attribute *bin_attr = NULL; struct configfs_buffer * buffer; struct configfs_item_operations * ops = NULL; int error = 0; @@ -221,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file) if (!item || !attr) goto Einval; + if (type & CONFIGFS_ITEM_BIN_ATTR) + bin_attr = to_bin_attr(file->f_path.dentry); + /* Grab the module reference for this attribute if we have one */ if (!try_module_get(attr->ca_owner)) { error = -ENODEV; @@ -237,10 +395,14 @@ static int check_perm(struct inode * inode, struct file * file) * and we must have a store method. */ if (file->f_mode & FMODE_WRITE) { + if (!(inode->i_mode & S_IWUGO)) + goto Eaccess; - if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) goto Eaccess; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) + goto Eaccess; } /* File needs read support. @@ -248,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file) * must be a show method for it. */ if (file->f_mode & FMODE_READ) { - if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + if (!(inode->i_mode & S_IRUGO)) + goto Eaccess; + + if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) + goto Eaccess; + + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) goto Eaccess; } @@ -262,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file) } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; + buffer->read_in_progress = 0; + buffer->write_in_progress = 0; buffer->ops = ops; file->private_data = buffer; goto Done; @@ -279,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file) return error; } -static int configfs_open_file(struct inode * inode, struct file * filp) -{ - return check_perm(inode,filp); -} - -static int configfs_release(struct inode * inode, struct file * filp) +static int configfs_release(struct inode *inode, struct file *filp) { struct config_item * item = to_item(filp->f_path.dentry->d_parent); struct configfs_attribute * attr = to_attr(filp->f_path.dentry); @@ -305,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp) return 0; } +static int configfs_open_file(struct inode *inode, struct file *filp) +{ + return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); +} + +static int configfs_open_bin_file(struct inode *inode, struct file *filp) +{ + return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); +} + +static int configfs_release_bin_file(struct inode *inode, struct file *filp) +{ + struct configfs_buffer *buffer = filp->private_data; + struct dentry *dentry = filp->f_path.dentry; + struct config_item *item = to_item(dentry->d_parent); + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + ssize_t len = 0; + int ret; + + buffer->read_in_progress = 0; + + if (buffer->write_in_progress) { + buffer->write_in_progress = 0; + + len = bin_attr->write(item, buffer->bin_buffer, + buffer->bin_buffer_size); + + /* vfree on NULL is safe */ + vfree(buffer->bin_buffer); + buffer->bin_buffer = NULL; + buffer->bin_buffer_size = 0; + buffer->needs_read_fill = 1; + } + + ret = configfs_release(inode, filp); + if (len < 0) + return len; + return ret; +} + + const struct file_operations configfs_file_operations = { .read = configfs_read_file, .write = configfs_write_file, @@ -313,6 +519,14 @@ const struct file_operations configfs_file_operations = { .release = configfs_release, }; +const struct file_operations configfs_bin_file_operations = { + .read = configfs_read_bin_file, + .write = configfs_write_bin_file, + .llseek = NULL, /* bin file is not seekable */ + .open = configfs_open_bin_file, + .release = configfs_release_bin_file, +}; + /** * configfs_create_file - create an attribute file for an item. * @item: item we're creating for. @@ -334,3 +548,24 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib return error; } +/** + * configfs_create_bin_file - create a binary attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_bin_file(struct config_item *item, + const struct configfs_bin_attribute *bin_attr) +{ + struct dentry *dir = item->ci_dentry; + struct configfs_dirent *parent_sd = dir->d_fsdata; + umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, + CONFIGFS_ITEM_BIN_ATTR); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; +} diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index eae87575e681..0cc810e9dccc 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -218,7 +218,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) return sd->s_dentry->d_name.name; - if (sd->s_type & CONFIGFS_ITEM_ATTR) { + if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) { attr = sd->s_element; return attr->ca_name; } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index ec5c8325b503..db6d69289608 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static const char *configfs_follow_link(struct dentry *dentry, void **cookie) +static const char *configfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + char *body; int error; - if (!page) + if (!dentry) + return ERR_PTR(-ECHILD); + + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = configfs_getlink(dentry, (char *)page); + error = configfs_getlink(dentry, body); if (!error) { - return *cookie = (void *)page; + set_delayed_call(done, kfree_link, body); + return body; } - free_page(page); + kfree(body); return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { - .follow_link = configfs_follow_link, + .get_link = configfs_get_link, .readlink = generic_readlink, - .put_link = free_page_put_link, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index a8f75640ac86..b3c153ca435d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/timekeeping.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -232,9 +233,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* UNIX time of coredump */ case 't': { - struct timeval tv; - do_gettimeofday(&tv); - err = cn_printf(cn, "%lu", tv.tv_sec); + time64_t time; + + time = ktime_get_real_seconds(); + err = cn_printf(cn, "%lld", time); break; } /* hostname */ @@ -280,23 +282,24 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code) +static int zap_process(struct task_struct *start, int exit_code, int flags) { struct task_struct *t; int nr = 0; + /* ignore all signals except SIGKILL, see prepare_signal() */ + start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; - t = start; - do { + for_each_thread(start, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); nr++; } - } while_each_thread(start, t); + } return nr; } @@ -311,10 +314,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - nr = zap_process(tsk, exit_code); tsk->signal->group_exit_task = tsk; - /* ignore all signals except SIGKILL, see prepare_signal() */ - tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + nr = zap_process(tsk, exit_code, 0); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); @@ -360,18 +361,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, continue; if (g->flags & PF_KTHREAD) continue; - p = g; - do { - if (p->mm) { - if (unlikely(p->mm == mm)) { - lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code); - p->signal->flags = SIGNAL_GROUP_EXIT; - unlock_task_sighand(p, &flags); - } - break; + + for_each_thread(g, p) { + if (unlikely(!p->mm)) + continue; + if (unlikely(p->mm == mm)) { + lock_task_sighand(p, &flags); + nr += zap_process(p, exit_code, + SIGNAL_GROUP_EXIT); + unlock_task_sighand(p, &flags); } - } while_each_thread(g, p); + break; + } } rcu_read_unlock(); done: diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 355c522f3585..b862bc219cd7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; default: @@ -28,49 +28,68 @@ #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> +#include <linux/pfn_t.h> +#include <linux/sizes.h> -int dax_clear_blocks(struct inode *inode, sector_t block, long size) +static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) +{ + struct request_queue *q = bdev->bd_queue; + long rc = -EIO; + + dax->addr = (void __pmem *) ERR_PTR(-EIO); + if (blk_queue_enter(q, true) != 0) + return rc; + + rc = bdev_direct_access(bdev, dax); + if (rc < 0) { + dax->addr = (void __pmem *) ERR_PTR(rc); + blk_queue_exit(q); + return rc; + } + return rc; +} + +static void dax_unmap_atomic(struct block_device *bdev, + const struct blk_dax_ctl *dax) +{ + if (IS_ERR(dax->addr)) + return; + blk_queue_exit(bdev->bd_queue); +} + +/* + * dax_clear_blocks() is called from within transaction context from XFS, + * and hence this means the stack from this point must follow GFP_NOFS + * semantics for all operations. + */ +int dax_clear_blocks(struct inode *inode, sector_t block, long _size) { struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block << (inode->i_blkbits - 9); + struct blk_dax_ctl dax = { + .sector = block << (inode->i_blkbits - 9), + .size = _size, + }; might_sleep(); do { - void __pmem *addr; - unsigned long pfn; - long count; + long count, sz; - count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + count = dax_map_atomic(bdev, &dax); if (count < 0) return count; - BUG_ON(size < count); - while (count > 0) { - unsigned pgsz = PAGE_SIZE - offset_in_page(addr); - if (pgsz > count) - pgsz = count; - clear_pmem(addr, pgsz); - addr += pgsz; - size -= pgsz; - count -= pgsz; - BUG_ON(pgsz & 511); - sector += pgsz / 512; - cond_resched(); - } - } while (size); + sz = min_t(long, count, SZ_128K); + clear_pmem(dax.addr, sz); + dax.size -= sz; + dax.sector += sz / 512; + dax_unmap_atomic(bdev, &dax); + cond_resched(); + } while (dax.size); wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, - unsigned blkbits) -{ - unsigned long pfn; - sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); -} - /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, loff_t pos, loff_t end) @@ -100,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh) return bh->b_state != 0; } + +static sector_t to_sector(const struct buffer_head *bh, + const struct inode *inode) +{ + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + + return sector; +} + static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t start, loff_t end, get_block_t get_block, struct buffer_head *bh) { - ssize_t retval = 0; - loff_t pos = start; - loff_t max = start; - loff_t bh_max = start; - void __pmem *addr; - bool hole = false; - bool need_wmb = false; - - if (iov_iter_rw(iter) != WRITE) + loff_t pos = start, max = start, bh_max = start; + bool hole = false, need_wmb = false; + struct block_device *bdev = NULL; + int rw = iov_iter_rw(iter), rc; + long map_len = 0; + struct blk_dax_ctl dax = { + .addr = (void __pmem *) ERR_PTR(-EIO), + }; + + if (rw == READ) end = min(end, i_size_read(inode)); while (pos < end) { @@ -127,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; + bdev = bh->b_bdev; } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -141,45 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + hole = rw == READ && !buffer_written(bh); if (hole) { - addr = NULL; size = bh->b_size - first; } else { - retval = dax_get_addr(bh, &addr, blkbits); - if (retval < 0) + dax_unmap_atomic(bdev, &dax); + dax.sector = to_sector(bh, inode); + dax.size = bh->b_size; + map_len = dax_map_atomic(bdev, &dax); + if (map_len < 0) { + rc = map_len; break; + } if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(addr, retval, first, pos, - end); + dax_new_buf(dax.addr, map_len, first, + pos, end); need_wmb = true; } - addr += first; - size = retval - first; + dax.addr += first; + size = map_len - first; } max = min(pos + size, end); } if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(addr, max - pos, iter); + len = copy_from_iter_pmem(dax.addr, max - pos, iter); need_wmb = true; } else if (!hole) - len = copy_to_iter((void __force *)addr, max - pos, + len = copy_to_iter((void __force *) dax.addr, max - pos, iter); else len = iov_iter_zero(max - pos, iter); - if (!len) + if (!len) { + rc = -EFAULT; break; + } pos += len; - addr += len; + if (!IS_ERR(dax.addr)) + dax.addr += len; } if (need_wmb) wmb_pmem(); + dax_unmap_atomic(bdev, &dax); - return (pos == start) ? retval : pos - start; + return (pos == start) ? rc : pos - start; } /** @@ -268,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct buffer_head *bh, - unsigned blkbits, unsigned long vaddr) +static int copy_user_bh(struct page *to, struct inode *inode, + struct buffer_head *bh, unsigned long vaddr) { - void __pmem *vfrom; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; + struct block_device *bdev = bh->b_bdev; void *vto; - if (dax_get_addr(bh, &vfrom, blkbits) < 0) - return -EIO; + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)vfrom, vaddr, to); + copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); + dax_unmap_atomic(bdev, &dax); return 0; } static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void __pmem *addr; - unsigned long pfn; + struct address_space *mapping = inode->i_mapping; + struct block_device *bdev = bh->b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; pgoff_t size; int error; @@ -308,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); - if (error < 0) - goto out; - if (error < PAGE_SIZE) { - error = -EIO; + if (dax_map_atomic(bdev, &dax) < 0) { + error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(addr, PAGE_SIZE); + clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } + dax_unmap_atomic(bdev, &dax); - error = vm_insert_mixed(vma, vaddr, pfn); + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); @@ -415,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, &bh, blkbits, vaddr); + error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -516,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +static void __dax_dbg(struct buffer_head *bh, unsigned long address, + const char *reason, const char *fn) +{ + if (bh) { + char bname[BDEVNAME_SIZE]; + bdevname(bh->b_bdev, bname); + pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " + "length %zd fallback: %s\n", fn, current->comm, + address, bname, bh->b_state, (u64)bh->b_blocknr, + bh->b_size, reason); + } else { + pr_debug("%s: %s addr: %lx fallback: %s\n", fn, + current->comm, address, reason); + } +} + +#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") + int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) @@ -527,37 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; - long length; - void __pmem *kaddr; + struct block_device *bdev; pgoff_t size, pgoff; - sector_t block, sector; - unsigned long pfn; + sector_t block; int result = 0; + /* dax pmd mappings require pfn_t_devmap() */ + if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) + return VM_FAULT_FALLBACK; + /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) + if (write && !(vma->vm_flags & VM_SHARED)) { + split_huge_pmd(vma, pmd, address); + dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; + } /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) + if (pmd_addr < vma->vm_start) { + dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) + } + if ((pmd_addr + PMD_SIZE) > vma->vm_end) { + dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; + } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(NULL, address, + "offset + huge page size > file size"); return VM_FAULT_FALLBACK; + } memset(&bh, 0, sizeof(bh)); block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - length = get_block(inode, block, &bh, write); - if (length) + if (get_block(inode, block, &bh, write) != 0) return VM_FAULT_SIGBUS; + bdev = bh.b_bdev; i_mmap_lock_read(mapping); /* @@ -565,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "allocated block too small"); goto fallback; + } /* * If we allocated new storage, make sure no process has any @@ -589,50 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_SIGBUS; goto out; } - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(&bh, address, "pgoff unaligned"); goto fallback; + } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); - if (unlikely(!zero_page)) + if (unlikely(!zero_page)) { + dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; + } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); + dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: <zero> sect: %llx\n", + __func__, current->comm, address, + (unsigned long long) to_sector(&bh, inode)); + entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { - sector = bh.b_blocknr << (blkbits - 9); - length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, - bh.b_size); + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } - if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + if (length < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "dax-length too small"); + dax_unmap_atomic(bdev, &dax); + goto fallback; + } + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { + dax_pmd_dbg(&bh, address, "pfn unaligned"); + dax_unmap_atomic(bdev, &dax); + goto fallback; + } + + if (!pfn_t_devmap(dax.pfn)) { + dax_unmap_atomic(bdev, &dax); + dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; + } if (buffer_unwritten(&bh) || buffer_new(&bh)) { - int i; - for (i = 0; i < PTRS_PER_PMD; i++) - clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } - - result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + dax_unmap_atomic(bdev, &dax); + + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: %lx sect: %llx\n", + __func__, current->comm, address, + pfn_t_to_pfn(dax.pfn), + (unsigned long long) dax.sector); + result |= vmf_insert_pfn_pmd(vma, address, pmd, + dax.pfn, write); } out: @@ -734,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void __pmem *addr; - err = dax_get_addr(&bh, &addr, inode->i_blkbits); - if (err < 0) - return err; - clear_pmem(addr + offset, length); + struct block_device *bdev = bh.b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PAGE_CACHE_SIZE, + }; + + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); wmb_pmem(); + dax_unmap_atomic(bdev, &dax); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 5c33aeb0f68f..b4539e84e577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -1734,7 +1735,7 @@ static unsigned d_flags_for_inode(struct inode *inode) } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) { + if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } @@ -3303,18 +3304,18 @@ out: * @new_dentry: new dentry * @old_dentry: old dentry * - * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). - * Returns 0 otherwise. + * Returns true if new_dentry is a subdirectory of the parent (at any depth). + * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ -int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - int result; + bool result; unsigned seq; if (new_dentry == old_dentry) - return 1; + return true; do { /* for restarting inner loop in case of seq retry */ @@ -3325,9 +3326,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) */ rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) - result = 1; + result = true; else - result = 0; + result = false; rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); @@ -3415,7 +3416,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6c55ade071c3..d2ba12e23ed9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = { .llseek = noop_llseek, }; +static struct dentry *debugfs_create_mode(const char *name, umode_t mode, + struct dentry *parent, void *value, + const struct file_operations *fops, + const struct file_operations *fops_ro, + const struct file_operations *fops_wo) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, fops_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, fops_wo); + + return debugfs_create_file(name, mode, parent, value, fops); +} + static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; @@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u8); + return debugfs_create_mode(name, mode, parent, value, &fops_u8, + &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u16); + return debugfs_create_mode(name, mode, parent, value, &fops_u16, + &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u32); + return debugfs_create_mode(name, mode, parent, value, &fops_u32, + &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -240,17 +238,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u64); + return debugfs_create_mode(name, mode, parent, value, &fops_u64, + &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); +static int debugfs_ulong_set(void *data, u64 val) +{ + *(unsigned long *)data = val; + return 0; +} + +static int debugfs_ulong_get(void *data, u64 *val) +{ + *val = *(unsigned long *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); + +/** + * debugfs_create_ulong - create a debugfs file that is used to read and write + * an unsigned long value. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_ulong(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value) +{ + return debugfs_create_mode(name, mode, parent, value, &fops_ulong, + &fops_ulong_ro, &fops_ulong_wo); +} +EXPORT_SYMBOL_GPL(debugfs_create_ulong); + DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); @@ -264,6 +304,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value @@ -286,14 +328,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x8); + return debugfs_create_mode(name, mode, parent, value, &fops_x8, + &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -310,14 +346,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x16); + return debugfs_create_mode(name, mode, parent, value, &fops_x16, + &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -334,14 +364,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x32); + return debugfs_create_mode(name, mode, parent, value, &fops_x32, + &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -358,7 +382,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_file(name, mode, parent, value, &fops_x64); + return debugfs_create_mode(name, mode, parent, value, &fops_x64, + &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -375,6 +400,8 @@ static int debugfs_size_t_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value @@ -389,7 +416,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { - return debugfs_create_file(name, mode, parent, value, &fops_size_t); + return debugfs_create_mode(name, mode, parent, value, &fops_size_t, + &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -422,16 +450,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); + return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t, + &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); @@ -439,7 +459,7 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; - u32 *val = file->private_data; + bool *val = file->private_data; if (*val) buf[0] = 'Y'; @@ -457,7 +477,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; - u32 *val = file->private_data; + bool *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -478,6 +498,18 @@ static const struct file_operations fops_bool = { .llseek = default_llseek, }; +static const struct file_operations fops_bool_ro = { + .read = debugfs_read_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_bool_wo = { + .write = debugfs_write_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. @@ -503,9 +535,10 @@ static const struct file_operations fops_bool = { * code. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, - struct dentry *parent, u32 *value) + struct dentry *parent, bool *value) { - return debugfs_create_file(name, mode, parent, value, &fops_bool); + return debugfs_create_mode(name, mode, parent, value, &fops_bool, + &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c711be8d6a3c..b7fcc0de0b2f 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -271,8 +271,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) dput(dentry); dentry = ERR_PTR(-EEXIST); } - if (IS_ERR(dentry)) + + if (IS_ERR(dentry)) { mutex_unlock(&d_inode(parent)->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + } + return dentry; } @@ -533,7 +537,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) /** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be - * removed. + * removed. If this parameter is NULL or an error value, nothing + * will be done. * * This function removes a file or directory in debugfs that was previously * created with a call to another debugfs function (like @@ -565,7 +570,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_remove_recursive - recursively removes a directory - * @dentry: a pointer to a the dentry of the directory to be removed. + * @dentry: a pointer to a the dentry of the directory to be removed. If this + * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function diff --git a/fs/direct-io.c b/fs/direct-io.c index 11256291642e..602e8441bc0f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -109,6 +109,8 @@ struct dio_submit { struct dio { int flags; /* doesn't change */ int rw; + blk_qc_t bio_cookie; + struct block_device *bio_bdev; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ @@ -120,6 +122,7 @@ struct dio { int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ bool defer_completion; /* defer AIO completion to workqueue? */ + bool should_dirty; /* if pages should be dirtied */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -360,7 +363,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, /* * bio_alloc() is guaranteed to return a bio when called with - * __GFP_WAIT and we request a valid number of vectors. + * __GFP_RECLAIM and we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); @@ -393,14 +396,17 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ) + if (dio->is_async && dio->rw == READ && dio->should_dirty) bio_set_pages_dirty(bio); - if (sdio->submit_io) + dio->bio_bdev = bio->bi_bdev; + + if (sdio->submit_io) { sdio->submit_io(dio->rw, bio, dio->inode, sdio->logical_offset_in_bio); - else - submit_bio(dio->rw, bio); + dio->bio_cookie = BLK_QC_T_NONE; + } else + dio->bio_cookie = submit_bio(dio->rw, bio); sdio->bio = NULL; sdio->boundary = 0; @@ -439,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - io_schedule(); + if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -464,14 +471,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (bio->bi_error) dio->io_error = -EIO; - if (dio->is_async && dio->rw == READ) { + if (dio->is_async && dio->rw == READ && dio->should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (dio->rw == READ && !PageCompound(page)) + if (dio->rw == READ && !PageCompound(page) && + dio->should_dirty) set_page_dirty_lock(page); page_cache_release(page); } @@ -1161,6 +1169,16 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } } + /* Once we sampled i_size check for reads beyond EOF */ + dio->i_size = i_size_read(inode); + if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { + if (dio->flags & DIO_LOCKING) + mutex_unlock(&inode->i_mutex); + kmem_cache_free(dio_cache, dio); + retval = 0; + goto out; + } + /* * For file extending writes updating i_size before data writeouts * complete can expose uninitialized blocks in dumb filesystems. @@ -1214,11 +1232,11 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, sdio.next_block_for_io = -1; dio->iocb = iocb; - dio->i_size = i_size_read(inode); spin_lock_init(&dio->bio_lock); dio->refcount = 1; + dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; sdio.final_block_in_request = (offset + iov_iter_count(iter)) >> blkbits; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index d521bddf876d..8e294fbbac39 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -61,35 +61,8 @@ static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); -static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, - char *buf); -static ssize_t store_cluster(struct config_item *i, - struct configfs_attribute *a, - const char *buf, size_t len); -static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, - char *buf); -static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, - const char *buf, size_t len); -static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, - char *buf); -static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, - const char *buf, size_t len); - -static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); -static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, - size_t len); -static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); -static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, - size_t len); -static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, - size_t len); -static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf); -static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); -static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, - size_t len); -static ssize_t node_weight_read(struct dlm_node *nd, char *buf); -static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, - size_t len); +static struct configfs_attribute *comm_attrs[]; +static struct configfs_attribute *node_attrs[]; struct dlm_cluster { struct config_group group; @@ -108,6 +81,12 @@ struct dlm_cluster { char cl_cluster_name[DLM_LOCKSPACE_LEN]; }; +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct dlm_cluster, group) : + NULL; +} + enum { CLUSTER_ATTR_TCP_PORT = 0, CLUSTER_ATTR_BUFFER_SIZE, @@ -124,33 +103,24 @@ enum { CLUSTER_ATTR_CLUSTER_NAME, }; -struct cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct dlm_cluster *, char *); - ssize_t (*store)(struct dlm_cluster *, const char *, size_t); -}; - -static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) +static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf) { + struct dlm_cluster *cl = config_item_to_cluster(item); return sprintf(buf, "%s\n", cl->cl_cluster_name); } -static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, +static ssize_t cluster_cluster_name_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_cluster *cl = config_item_to_cluster(item); + strlcpy(dlm_config.ci_cluster_name, buf, sizeof(dlm_config.ci_cluster_name)); strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); return len; } -static struct cluster_attribute cluster_attr_cluster_name = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "cluster_name", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = cluster_cluster_name_read, - .store = cluster_cluster_name_write, -}; +CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, int *info_field, int check_zero, @@ -175,17 +145,19 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, } #define CLUSTER_ATTR(name, check_zero) \ -static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ +static ssize_t cluster_##name##_store(struct config_item *item, \ + const char *buf, size_t len) \ { \ + struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ check_zero, buf, len); \ } \ -static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ +static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ + struct dlm_cluster *cl = config_item_to_cluster(item); \ return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ } \ -static struct cluster_attribute cluster_attr_##name = \ -__CONFIGFS_ATTR(name, 0644, name##_read, name##_write) +CONFIGFS_ATTR(cluster_, name); CLUSTER_ATTR(tcp_port, 1); CLUSTER_ATTR(buffer_size, 1); @@ -201,19 +173,19 @@ CLUSTER_ATTR(new_rsb_count, 0); CLUSTER_ATTR(recover_callbacks, 0); static struct configfs_attribute *cluster_attrs[] = { - [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, - [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr, - [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr, - [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr, - [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr, - [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr, - [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, - [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, - [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, - [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, - [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr, - [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr, + [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, + [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size, + [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size, + [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer, + [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, + [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, + [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, + [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, + [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, + [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, NULL, }; @@ -224,83 +196,11 @@ enum { COMM_ATTR_ADDR_LIST, }; -struct comm_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct dlm_comm *, char *); - ssize_t (*store)(struct dlm_comm *, const char *, size_t); -}; - -static struct comm_attribute comm_attr_nodeid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "nodeid", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = comm_nodeid_read, - .store = comm_nodeid_write, -}; - -static struct comm_attribute comm_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = comm_local_read, - .store = comm_local_write, -}; - -static struct comm_attribute comm_attr_addr = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "addr", - .ca_mode = S_IWUSR }, - .store = comm_addr_write, -}; - -static struct comm_attribute comm_attr_addr_list = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "addr_list", - .ca_mode = S_IRUGO }, - .show = comm_addr_list_read, -}; - -static struct configfs_attribute *comm_attrs[] = { - [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, - [COMM_ATTR_LOCAL] = &comm_attr_local.attr, - [COMM_ATTR_ADDR] = &comm_attr_addr.attr, - [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr, - NULL, -}; - enum { NODE_ATTR_NODEID = 0, NODE_ATTR_WEIGHT, }; -struct node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct dlm_node *, char *); - ssize_t (*store)(struct dlm_node *, const char *, size_t); -}; - -static struct node_attribute node_attr_nodeid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "nodeid", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = node_nodeid_read, - .store = node_nodeid_write, -}; - -static struct node_attribute node_attr_weight = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "weight", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = node_weight_read, - .store = node_weight_write, -}; - -static struct configfs_attribute *node_attrs[] = { - [NODE_ATTR_NODEID] = &node_attr_nodeid.attr, - [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, - NULL, -}; - struct dlm_clusters { struct configfs_subsystem subsys; }; @@ -349,8 +249,6 @@ static struct configfs_group_operations clusters_ops = { static struct configfs_item_operations cluster_ops = { .release = release_cluster, - .show_attribute = show_cluster, - .store_attribute = store_cluster, }; static struct configfs_group_operations spaces_ops = { @@ -369,8 +267,6 @@ static struct configfs_group_operations comms_ops = { static struct configfs_item_operations comm_ops = { .release = release_comm, - .show_attribute = show_comm, - .store_attribute = store_comm, }; static struct configfs_group_operations nodes_ops = { @@ -380,8 +276,6 @@ static struct configfs_group_operations nodes_ops = { static struct configfs_item_operations node_ops = { .release = release_node, - .show_attribute = show_node, - .store_attribute = store_node, }; static struct config_item_type clusters_type = { @@ -427,12 +321,6 @@ static struct config_item_type node_type = { .ct_owner = THIS_MODULE, }; -static struct dlm_cluster *config_item_to_cluster(struct config_item *i) -{ - return i ? container_of(to_config_group(i), struct dlm_cluster, group) : - NULL; -} - static struct dlm_space *config_item_to_space(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_space, group) : @@ -687,66 +575,30 @@ void dlm_config_exit(void) * Functions for user space to read/write attributes */ -static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, - char *buf) -{ - struct dlm_cluster *cl = config_item_to_cluster(i); - struct cluster_attribute *cla = - container_of(a, struct cluster_attribute, attr); - return cla->show ? cla->show(cl, buf) : 0; -} - -static ssize_t store_cluster(struct config_item *i, - struct configfs_attribute *a, - const char *buf, size_t len) +static ssize_t comm_nodeid_show(struct config_item *item, char *buf) { - struct dlm_cluster *cl = config_item_to_cluster(i); - struct cluster_attribute *cla = - container_of(a, struct cluster_attribute, attr); - return cla->store ? cla->store(cl, buf, len) : -EINVAL; -} - -static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, - char *buf) -{ - struct dlm_comm *cm = config_item_to_comm(i); - struct comm_attribute *cma = - container_of(a, struct comm_attribute, attr); - return cma->show ? cma->show(cm, buf) : 0; -} - -static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, - const char *buf, size_t len) -{ - struct dlm_comm *cm = config_item_to_comm(i); - struct comm_attribute *cma = - container_of(a, struct comm_attribute, attr); - return cma->store ? cma->store(cm, buf, len) : -EINVAL; + return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid); } -static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) -{ - return sprintf(buf, "%d\n", cm->nodeid); -} - -static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, +static ssize_t comm_nodeid_store(struct config_item *item, const char *buf, size_t len) { - int rc = kstrtoint(buf, 0, &cm->nodeid); + int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid); if (rc) return rc; return len; } -static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) +static ssize_t comm_local_show(struct config_item *item, char *buf) { - return sprintf(buf, "%d\n", cm->local); + return sprintf(buf, "%d\n", config_item_to_comm(item)->local); } -static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, +static ssize_t comm_local_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_comm *cm = config_item_to_comm(item); int rc = kstrtoint(buf, 0, &cm->local); if (rc) @@ -756,8 +608,10 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, return len; } -static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) +static ssize_t comm_addr_store(struct config_item *item, const char *buf, + size_t len) { + struct dlm_comm *cm = config_item_to_comm(item); struct sockaddr_storage *addr; int rv; @@ -783,8 +637,9 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) return len; } -static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf) +static ssize_t comm_addr_list_show(struct config_item *item, char *buf) { + struct dlm_comm *cm = config_item_to_comm(item); ssize_t s; ssize_t allowance; int i; @@ -827,32 +682,28 @@ static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf) return 4096 - allowance; } -static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, - char *buf) -{ - struct dlm_node *nd = config_item_to_node(i); - struct node_attribute *nda = - container_of(a, struct node_attribute, attr); - return nda->show ? nda->show(nd, buf) : 0; -} +CONFIGFS_ATTR(comm_, nodeid); +CONFIGFS_ATTR(comm_, local); +CONFIGFS_ATTR_WO(comm_, addr); +CONFIGFS_ATTR_RO(comm_, addr_list); -static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, - const char *buf, size_t len) -{ - struct dlm_node *nd = config_item_to_node(i); - struct node_attribute *nda = - container_of(a, struct node_attribute, attr); - return nda->store ? nda->store(nd, buf, len) : -EINVAL; -} +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid, + [COMM_ATTR_LOCAL] = &comm_attr_local, + [COMM_ATTR_ADDR] = &comm_attr_addr, + [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list, + NULL, +}; -static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) +static ssize_t node_nodeid_show(struct config_item *item, char *buf) { - return sprintf(buf, "%d\n", nd->nodeid); + return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid); } -static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, +static ssize_t node_nodeid_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_node *nd = config_item_to_node(item); uint32_t seq = 0; int rc = kstrtoint(buf, 0, &nd->nodeid); @@ -863,21 +714,30 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, return len; } -static ssize_t node_weight_read(struct dlm_node *nd, char *buf) +static ssize_t node_weight_show(struct config_item *item, char *buf) { - return sprintf(buf, "%d\n", nd->weight); + return sprintf(buf, "%d\n", config_item_to_node(item)->weight); } -static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, +static ssize_t node_weight_store(struct config_item *item, const char *buf, size_t len) { - int rc = kstrtoint(buf, 0, &nd->weight); + int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight); if (rc) return rc; return len; } +CONFIGFS_ATTR(node_, nodeid); +CONFIGFS_ATTR(node_, weight); + +static struct configfs_attribute *node_attrs[] = { + [NODE_ATTR_NODEID] = &node_attr_nodeid, + [NODE_ATTR_WEIGHT] = &node_attr_weight, + NULL, +}; + /* * Functions for the dlm to get the info that's been configured */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 87e9d796cf7d..3a37bd3f9637 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -421,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk) if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) @@ -1448,7 +1448,7 @@ static void send_to_sock(struct connection *con) msg_flags); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && - test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { /* Notify TCP that we're limited by the * application window size. diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 5532f097f6da..d401425f602a 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); if (xop->callback == NULL) { - rv = wait_event_killable(recv_wq, (op->done != 0)); + rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { log_debug(ls, "dlm_posix_lock: wait killed %llx", (unsigned long long)number); @@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = op->info.rv; if (!rv) { - if (posix_lock_file_wait(file, fl) < 0) + if (locks_lock_file_wait(file, fl) < 0) log_error(ls, "dlm_posix_lock: vfs lock error %llx", (unsigned long long)number); } @@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, /* cause the vfs unlock to return ENOENT if lock is not found */ fl->fl_flags |= FL_EXISTS; - rv = posix_lock_file_wait(file, fl); + rv = locks_lock_file_wait(file, fl); if (rv == -ENOENT) { rv = 0; goto out_free; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 173b3873a4f4..1925d6d222b8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -515,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf, if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) return -EINVAL; - kbuf = kzalloc(count + 1, GFP_NOFS); - if (!kbuf) - return -ENOMEM; - - if (copy_from_user(kbuf, buf, count)) { - error = -EFAULT; - goto out_free; - } + kbuf = memdup_user_nul(buf, count); + if (!IS_ERR(kbuf)) + return PTR_ERR(kbuf); if (check_version(kbuf)) { error = -EBADE; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5ba029e627cc..7b39260c7bba 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key) { if (key->type == &key_type_encrypted) return (struct ecryptfs_auth_tok *) - (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data); else return NULL; } @@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key) auth_tok = ecryptfs_get_encrypted_key_payload_data(key); if (!auth_tok) - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload *)key->payload.data)->data); + return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; else return auth_tok; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3c4db1172d22..040aa879d634 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(IS_ERR(ecryptfs_inode))) { + if (IS_ERR(ecryptfs_inode)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); rc = PTR_ERR(ecryptfs_inode); @@ -282,9 +282,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, if (rc) { ecryptfs_do_unlink(directory_inode, ecryptfs_dentry, ecryptfs_inode); - make_bad_inode(ecryptfs_inode); - unlock_new_inode(ecryptfs_inode); - iput(ecryptfs_inode); + iget_failed(ecryptfs_inode); goto out; } unlock_new_inode(ecryptfs_inode); @@ -674,16 +672,24 @@ out: return rc ? ERR_PTR(rc) : buf; } -static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) +static const char *ecryptfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { size_t len; - char *buf = ecryptfs_readlink_lower(dentry, &len); + char *buf; + + if (!dentry) + return ERR_PTR(-ECHILD); + + buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) return buf; fsstack_copy_attr_atime(d_inode(dentry), d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; - return *cookie = buf; + set_delayed_call(done, kfree_link, buf); + return buf; } /** @@ -1095,8 +1101,7 @@ out: const struct inode_operations ecryptfs_symlink_iops = { .readlink = generic_readlink, - .follow_link = ecryptfs_follow_link, - .put_link = kfree_put_link, + .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr_link, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474bee9..e25b6b06bacf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 079d20306ee1..cdf0872382af 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &efs_symlink_aops; break; case S_IFCHR: diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f7da..cb68dac4f9d3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 75117d0dac2b..4870cc82deb0 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -13,7 +13,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct buffer_head * bh; struct inode * inode = page->mapping->host; efs_block_t size = inode->i_size; @@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page) } link[size] = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 8d0c0df01854..ed70cf9fdc7b 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -45,10 +45,10 @@ struct eventfd_ctx { * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returining a POLLERR + * value, and we signal this as overflow condition by returning a POLLERR * to poll(2). * - * Returns the amount by which the counter was incrememnted. This will be less + * Returns the amount by which the counter was incremented. This will be less * than @n if the counter has overflowed. */ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) diff --git a/fs/exec.c b/fs/exec.c index b06623a9347f..828ec5f07de0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -119,7 +119,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, - .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; @@ -763,7 +763,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, - .acc_mode = MAY_EXEC | MAY_OPEN, + .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 73c64daa0f55..9eaf595aeaf8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { @@ -1227,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_link = (char *)oi->i_data; } else { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; } } else { diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 09a6bb1ad63c..c20d77df2679 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode; int err; - if (!new_valid_dev(rdev)) - return -EINVAL; - inode = exofs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -114,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, if (l > sizeof(oi->i_data)) { /* slow symlink */ inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; memset(oi->i_data, 0, sizeof(oi->i_data)); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..6658a50530a0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 8d15febd0aa3..4c69c94cafd8 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -684,6 +684,9 @@ struct ext2_inode_info { struct rw_semaphore xattr_sem; #endif rwlock_t i_meta_lock; +#ifdef CONFIG_FS_DAX + struct rw_semaphore dax_sem; +#endif /* * truncate_mutex is for serialising ext2_truncate() against @@ -699,6 +702,14 @@ struct ext2_inode_info { #endif }; +#ifdef CONFIG_FS_DAX +#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem) +#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem) +#else +#define dax_sem_down_write(ext2_inode) +#define dax_sem_up_write(ext2_inode) +#endif + /* * Inode dynamic state flags */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 1982c3f11aec..11a42c5a09ae 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -27,27 +27,103 @@ #include "acl.h" #ifdef CONFIG_FS_DAX +/* + * The lock ordering for ext2 DAX fault paths is: + * + * mmap_sem (MM) + * sb_start_pagefault (vfs, freeze) + * ext2_inode_info->dax_sem + * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) + * ext2_inode_info->truncate_mutex + * + * The default page_lock and i_size verification done by non-DAX fault paths + * is sufficient because ext2 doesn't support hole punching. + */ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block, NULL); + struct inode *inode = file_inode(vma->vm_file); + struct ext2_inode_info *ei = EXT2_I(inode); + int ret; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + } + down_read(&ei->dax_sem); + + ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + + up_read(&ei->dax_sem); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(inode->i_sb); + return ret; } static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { - return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + struct inode *inode = file_inode(vma->vm_file); + struct ext2_inode_info *ei = EXT2_I(inode); + int ret; + + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + } + down_read(&ei->dax_sem); + + ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + + up_read(&ei->dax_sem); + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(inode->i_sb); + return ret; } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block, NULL); + struct inode *inode = file_inode(vma->vm_file); + struct ext2_inode_info *ei = EXT2_I(inode); + int ret; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&ei->dax_sem); + + ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL); + + up_read(&ei->dax_sem); + sb_end_pagefault(inode->i_sb); + return ret; +} + +static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ext2_inode_info *ei = EXT2_I(inode); + int ret = VM_FAULT_NOPAGE; + loff_t size; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&ei->dax_sem); + + /* check that the faulting page hasn't raced with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + + up_read(&ei->dax_sem); + sb_end_pagefault(inode->i_sb); + return ret; } static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .pmd_fault = ext2_dax_pmd_fault, .page_mkwrite = ext2_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c60a248c640c..338eefda70c6 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1085,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } +/* dax_sem must be held when calling this function */ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; @@ -1100,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) blocksize = inode->i_sb->s_blocksize; iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); +#ifdef CONFIG_FS_DAX + WARN_ON(!rwsem_is_locked(&ei->dax_sem)); +#endif + n = ext2_block_to_path(inode, iblock, offsets, NULL); if (n == 0) return; @@ -1185,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; + + dax_sem_down_write(EXT2_I(inode)); __ext2_truncate_blocks(inode, offset); + dax_sem_up_write(EXT2_I(inode)); } static int ext2_setsize(struct inode *inode, loff_t newsize) @@ -1213,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) if (error) return error; + dax_sem_down_write(EXT2_I(inode)); truncate_setsize(inode, newsize); __ext2_truncate_blocks(inode, newsize); + dax_sem_up_write(EXT2_I(inode)); inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { @@ -1410,6 +1420,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) sizeof(ei->i_data) - 1); } else { inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index b4841e3066a5..7a2be8f7f3c3 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -143,9 +143,6 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, struct inode * inode; int err; - if (!new_valid_dev(rdev)) - return -EINVAL; - err = dquot_initialize(dir); if (err) return err; @@ -186,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sizeof (EXT2_I(inode)->i_data)) { /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 900e19cf9ef6..2a188413a2b0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -192,6 +192,9 @@ static void init_once(void *foo) init_rwsem(&ei->xattr_sem); #endif mutex_init(&ei->truncate_mutex); +#ifdef CONFIG_FS_DAX + init_rwsem(&ei->dax_sem); +#endif inode_init_once(&ei->vfs_inode); } @@ -200,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; @@ -566,6 +569,8 @@ static int parse_options(char *options, struct super_block *sb) /* Fall through */ case Opt_dax: #ifdef CONFIG_FS_DAX + ext2_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); set_opt(sbi->s_mount_opt, DAX); #else ext2_msg(sb, KERN_INFO, "dax option not supported"); diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index ae17179f3810..3495d8ae4b33 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -22,8 +22,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, @@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 0b6bfd3a398b..f57a7aba32eb 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -77,10 +77,8 @@ printk("\n"); \ } while (0) # define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ + printk(KERN_DEBUG "block %pg:%lu: ", \ + bh->b_bdev, (unsigned long) bh->b_blocknr); \ printk(f); \ printk("\n"); \ } while (0) @@ -292,17 +290,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(dentry, buffer, rest, - entry->e_name, - entry->e_name_len, - handler->flags); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) { error = -ERANGE; goto cleanup; } - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 702fc6840246..ba97f243b050 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -7,37 +7,20 @@ #include <linux/security.h> #include "xattr.h" -static size_t -ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const int prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int -ext2_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +ext2_xattr_security_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } static int -ext2_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext2_xattr_security_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -68,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, .set = ext2_xattr_security_set, }; diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 42b6e9874bcc..2c94d1930626 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -8,40 +8,26 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) +static bool +ext2_xattr_trusted_list(struct dentry *dentry) { - const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int -ext2_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +ext2_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } static int -ext2_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext2_xattr_trusted_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index ecdc4605192c..72a2a96d677f 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -10,30 +10,17 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) +static bool +ext2_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int -ext2_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +ext2_xattr_user_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, @@ -41,11 +28,10 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name, } static int -ext2_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext2_xattr_user_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 75285ea9aa05..f52cf54f0cbc 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o readpage.o + xattr_trusted.o inline.o readpage.o sysfs.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index cd6ea29be645..ec0668a60678 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -203,7 +204,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return -EIO; + return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -213,7 +214,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, start = ext4_group_first_block_no(sb, block_group); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) flex_bg = 1; /* Set bits for block and inode bitmaps, and inode table */ @@ -322,7 +323,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, ext4_fsblk_t blk; ext4_fsblk_t group_first_block; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups @@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, return 0; } -static void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh) +static int ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - return; + if (buffer_verified(bh)) + return 0; + if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; ext4_lock_group(sb, block_group); - blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); - if (unlikely(blk != 0)) { + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { ext4_unlock_group(sb, block_group); - ext4_error(sb, "bg %u: block %llu: invalid block bitmap", - block_group, blk); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EFSBADCRC; } - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); - ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EFSCORRUPTED; } set_buffer_verified(bh); ext4_unlock_group(sb, block_group); + return 0; } /** @@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh; ext4_fsblk_t bitmap_blk; + int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return NULL; + return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) @@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - int err; err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); @@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) - ext4_error(sb, "Checksum bad for grp %u", block_group); + goto out; goto verify; } ext4_unlock_group(sb, block_group); @@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: - ext4_validate_block_bitmap(sb, desc, block_group, bh); - if (buffer_verified(bh)) - return bh; + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: put_bh(bh); - return NULL; + return ERR_PTR(err); } /* Returns 0 on success, 1 on error */ @@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return 1; + return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); - return 1; + return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ - ext4_validate_block_bitmap(sb, desc, block_group, bh); - /* ...but check for error just in case errors=continue. */ - return !buffer_verified(bh); + return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; + int err; bh = ext4_read_block_bitmap_nowait(sb, block_group); - if (!bh) - return NULL; - if (ext4_wait_block_bitmap(sb, block_group, bh)) { + if (IS_ERR(bh)) + return bh; + err = ext4_wait_block_bitmap(sb, block_group, bh); + if (err) { put_bh(bh); - return NULL; + return ERR_PTR(err); } return bh; } @@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); - if (bitmap_bh == NULL) + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; continue; + } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); @@ -740,14 +748,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) if (group == 0) return 1; - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } - if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; @@ -776,7 +783,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; @@ -797,8 +804,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || - metagroup < first_meta_bg) + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); @@ -818,7 +824,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3522340c7a99..02ddec6d8a7d 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line, es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, function, line, blk, "invalid block"); - return -EIO; + return -EFSCORRUPTED; } } return 0; diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 45731558138c..1a0835073663 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -253,8 +253,7 @@ typedef enum { EXT4_ENCRYPT, } ext4_direction_t; -static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, - struct inode *inode, +static int ext4_page_crypto(struct inode *inode, ext4_direction_t rw, pgoff_t index, struct page *src_page, @@ -296,7 +295,6 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, else res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -353,7 +351,7 @@ struct page *ext4_encrypt(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { ciphertext_page = ERR_PTR(err); @@ -378,40 +376,29 @@ struct page *ext4_encrypt(struct inode *inode, * * Return: Zero on success, non-zero otherwise. */ -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +int ext4_decrypt(struct page *page) { BUG_ON(!PageLocked(page)); - return ext4_page_crypto(ctx, page->mapping->host, + return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, page->index, page, page); } -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int ext4_decrypt_one(struct inode *inode, struct page *page) -{ - int ret; - - struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = ext4_decrypt(ctx, page); - ext4_release_crypto_ctx(ctx); - return ret; -} - int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = ex->ee_block; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); ext4_fsblk_t pblk = ext4_ext_pblock(ex); unsigned int len = ext4_ext_get_actual_len(ex); - int err = 0; + int ret, err = 0; + +#if 0 + ext4_msg(inode->i_sb, KERN_CRIT, + "ext4_encrypted_zeroout ino %lu lblk %u len %u", + (unsigned long) inode->i_ino, lblk, len); +#endif BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); @@ -426,7 +413,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) } while (len--) { - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page); if (err) goto errout; @@ -437,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = pblk; - err = bio_add_page(bio, ciphertext_page, + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); - if (err) { + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + ext4_msg(inode->i_sb, KERN_ERR, + "bio_add_page failed: %d", ret); + WARN_ON(1); bio_put(bio); + err = -EIO; goto errout; } err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; bio_put(bio); if (err) goto errout; + lblk++; pblk++; } err = 0; errout: diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 847f919c84d9..2fbef8a14760 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 1d510c11b100..c5882b36e558 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], EXT4_AES_256_XTS_KEY_SIZE, NULL); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -121,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode) struct key *keyring_key = NULL; struct ext4_encryption_key *master_key; struct ext4_encryption_context ctx; - struct user_key_payload *ukp; + const struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct crypto_ablkcipher *ctfm; const char *cipher_str; @@ -208,8 +207,13 @@ retry: goto out; } crypt_info->ci_keyring_key = keyring_key; - BUG_ON(keyring_key->type != &key_type_logon); - ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "ext4: key type must be logon\n"); + res = -ENOKEY; + goto out; + } + ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; goto out; @@ -217,7 +221,13 @@ retry: master_key = (struct ext4_encryption_key *)ukp->data; BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "ext4: key size incorrect: %d\n", + master_key->size); + res = -ENOKEY; + goto out; + } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); if (res) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a640ec2c4b13..ad050698143f 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, if ((parent == NULL) || (child == NULL)) { pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); + WARN_ON(1); /* Should never happen */ + return 0; } /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f9e14911918c..1d1bca74f844 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && + if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) @@ -621,14 +620,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) - return -EIO; + return -EFSCORRUPTED; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) - return -EIO; + return -EFSCORRUPTED; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fd1f28be5296..cc7ca4e87144 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> +#include <linux/version.h> #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> @@ -374,6 +375,7 @@ struct flex_groups { #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -431,6 +433,7 @@ enum { EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -475,6 +478,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(RESERVED); } @@ -692,6 +696,7 @@ struct ext4_inode { __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ }; struct move_extent { @@ -723,19 +728,55 @@ struct move_extent { <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. + */ + static inline __le32 ext4_encode_extra_time(struct timespec *time) { - return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | - ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); + u32 extra = sizeof(time->tv_sec) > 4 ? + ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); } static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { - if (sizeof(time->tv_sec) > 4) - time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) - << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + if (unlikely(sizeof(time->tv_sec) > 4 && + (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + /* Handle legacy encoding of pre-1970 dates with epoch + * bits 1,1. We assume that by kernel version 4.20, + * everyone will have run fsck over the affected + * filesystems to correct the problem. (This + * backwards compatibility may be removed before this + * time, at the discretion of the ext4 developers.) + */ + u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; + if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) + extra_bits = 0; + time->tv_sec += extra_bits << 32; +#else + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; +#endif + } + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ @@ -1019,6 +1060,9 @@ struct ext4_inode_info { #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -1179,7 +1223,9 @@ struct ext4_super_block { __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ - __le32 s_reserved[100]; /* Padding to the end of the block */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __le32 s_reserved[98]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1522,6 +1568,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) * Feature set definitions */ +/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ @@ -1566,6 +1613,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1578,11 +1626,99 @@ static inline int ext4_encrypted_inode(struct inode *inode) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) + #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) @@ -1598,7 +1734,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) -#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ @@ -1607,7 +1743,8 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ - EXT4_FEATURE_INCOMPAT_ENCRYPT) + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1619,6 +1756,40 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA) +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + /* * Default values for user and/or group using reserved blocks */ @@ -1769,8 +1940,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) * (c) Daniel Phillips, 2001 */ -#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ - EXT4_FEATURE_COMPAT_DIR_INDEX) && \ +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) @@ -2063,8 +2233,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); -int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -2072,7 +2241,7 @@ int ext4_init_crypto(void); void ext4_exit_crypto(void); static inline int ext4_sb_has_crypto(struct super_block *sb) { - return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + return ext4_has_feature_encrypt(sb); } #else static inline int ext4_init_crypto(void) { return 0; } @@ -2193,8 +2362,7 @@ int ext4_insert_dentry(struct inode *dir, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) + if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } static unsigned char ext4_filetype_table[] = { @@ -2203,8 +2371,7 @@ static unsigned char ext4_filetype_table[] = { static inline unsigned char get_dtype(struct super_block *sb, int filetype) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; @@ -2245,6 +2412,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ +extern const struct file_operations ext4_seq_mb_groups_fops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_mb_init(struct super_block *); @@ -2372,6 +2540,7 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); @@ -2534,15 +2703,13 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || - (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_gdt_csum(sb) || + EXT4_SB(sb)->s_chksum_driver != NULL; } static inline int ext4_has_metadata_csum(struct super_block *sb) { - WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); return (EXT4_SB(sb)->s_chksum_driver != NULL); @@ -2889,7 +3056,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } @@ -2903,6 +3070,12 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); @@ -3049,4 +3222,7 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d41843181818..e770c1ee4613 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) return 0; } + err = handle->h_err; if (!handle->h_transaction) { - err = jbd2_journal_stop(handle); - return handle->h_err ? handle->h_err : err; + rc = jbd2_journal_stop(handle); + return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; rc = jbd2_journal_stop(handle); if (!err) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9c5b49fb281e..5f5846211095 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -34,8 +34,7 @@ */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 20U : 8U) + (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -84,17 +83,16 @@ /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ - 1 : 0) + ext4_has_feature_quota(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + ext4_has_feature_quota(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + ext4_has_feature_quota(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2553aa8b608d..551353b1b17a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, int depth, ext4_fsblk_t pblk) { const char *error_msg; - int max = 0; + int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; @@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; + err = -EFSBADCRC; goto corrupted; } return 0; @@ -485,7 +486,7 @@ corrupted: le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); - return -EIO; + return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ @@ -899,7 +900,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); - if (unlikely(IS_ERR(bh))) { + if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } @@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); - ret = -EIO; + ret = -EFSCORRUPTED; goto err; } path[ppos].p_bh = bh; @@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; + return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) @@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; + return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { @@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } ix->ei_block = cpu_to_le32(logical); @@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); @@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); - return -EIO; + return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; @@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); @@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ @@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } if (k) @@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ @@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); - return -EIO; + return -EFSCORRUPTED; } } return 0; @@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; @@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode, if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } } goto found_extent; @@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { @@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); - return -EIO; + return -EFSCORRUPTED; } if (depth == 0) { @@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EIO; + return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* try to insert block into found extent and return */ @@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (unlikely(path[depth].p_hdr == NULL)) { up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; break; } ex = path[depth].p_ext; @@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (unlikely(es.es_len == 0)) { EXT4_ERROR_INODE(inode, "es.es_len == 0"); - err = -EIO; + err = -EFSCORRUPTED; break; } @@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, "next extent == %u, next " "delalloc extent = %u", next, next_del); - err = -EIO; + err = -EFSCORRUPTED; break; } } @@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) @@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; @@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); - err = -EIO; + err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ @@ -2841,7 +2842,7 @@ again: EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; } goto out; } @@ -2920,7 +2921,7 @@ again: i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { - err = -EIO; + err = -EFSCORRUPTED; goto out; } } @@ -2978,7 +2979,7 @@ again: * Should be a no-op if we did IO above. */ cond_resched(); if (WARN_ON(i + 1 > depth)) { - err = -EIO; + err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; @@ -3054,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST @@ -3081,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS @@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle, if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EIO; + return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; @@ -3558,6 +3559,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); + if (ext4_encrypted_inode(inode)) + max_zeroout = 0; + /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); @@ -3970,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EIO; + return -EFSCORRUPTED; } } @@ -4308,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); - err = -EIO; + err = -EFSCORRUPTED; goto out2; } @@ -5271,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) - return -EIO; + return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); @@ -5411,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); - return -EIO; + return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { @@ -5792,7 +5796,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int split = 0; path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); - if (unlikely(IS_ERR(path1))) { + if (IS_ERR(path1)) { *erp = PTR_ERR(path1); path1 = NULL; finish: @@ -5800,7 +5804,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, goto repeat; } path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); - if (unlikely(IS_ERR(path2))) { + if (IS_ERR(path2)) { *erp = PTR_ERR(path2); path2 = NULL; goto finish; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 26724aeece73..ac748b3af1c1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, return nr_shrunk; } -static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) +int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) { - return *pos ? NULL : SEQ_START_TOKEN; -} - -static void * -ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) -{ - return NULL; -} - -static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) -{ - struct ext4_sb_info *sbi = seq->private; + struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); struct ext4_es_stats *es_stats = &sbi->s_es_stats; struct ext4_inode_info *ei, *max = NULL; unsigned int inode_cnt = 0; @@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; } -static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations ext4_es_seq_shrinker_info_ops = { - .start = ext4_es_seq_shrinker_info_start, - .next = ext4_es_seq_shrinker_info_next, - .stop = ext4_es_seq_shrinker_info_stop, - .show = ext4_es_seq_shrinker_info_show, -}; - -static int -ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = PDE_DATA(inode); - } - - return ret; -} - -static int -ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - -static const struct file_operations ext4_es_seq_shrinker_info_fops = { - .owner = THIS_MODULE, - .open = ext4_es_seq_shrinker_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = ext4_es_seq_shrinker_info_release, -}; - int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; @@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err2; - if (sbi->s_proc) - proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, - &ext4_es_seq_shrinker_info_fops, sbi); - return 0; err2: @@ -1225,8 +1171,6 @@ err1: void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { - if (sbi->s_proc) - remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 691b52613ce4..f7aa24f4642d 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 619bfc1fda8c..1b8024d26f65 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) } /* Initializes an uninitialized inode bitmap */ -static unsigned ext4_init_inode_bitmap(struct super_block *sb, +static int ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) @@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return 0; + return -EFSBADCRC; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); @@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); - return EXT4_INODES_PER_GROUP(sb); + return 0; } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) @@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) put_bh(bh); } +static int ext4_validate_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return 0; + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + blk = ext4_inode_bitmap(sb, desc); + if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return -EFSBADCRC; + } + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + return 0; +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return NULL; + return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); @@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-EIO); } if (bitmap_uptodate(bh)) goto verify; @@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - ext4_init_inode_bitmap(sb, bh, block_group, desc); + err = ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); + if (err) + goto out; return bh; } ext4_unlock_group(sb, block_group); @@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-EIO); } verify: - ext4_lock_group(sb, block_group); - if (!buffer_verified(bh) && - !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { - ext4_unlock_group(sb, block_group); - put_bh(bh); - ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " - "inode_bitmap = %llu", block_group, bitmap_blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return NULL; - } - ext4_unlock_group(sb, block_group); - set_buffer_verified(bh); + err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); + if (err) + goto out; return bh; +out: + put_bh(bh); + return ERR_PTR(err); } /* @@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ grp = ext4_get_group_info(sb, block_group); - if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) + if (IS_ERR(bitmap_bh)) { + fatal = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + goto error_return; + } + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; goto error_return; + } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bitmap_bh); @@ -826,7 +856,9 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || + IS_ERR(inode_bitmap_bh)) { + inode_bitmap_bh = NULL; if (++group == ngroups) group = 0; continue; @@ -902,8 +934,8 @@ got: struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); - if (!block_bitmap_bh) { - err = -EIO; + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); @@ -1045,7 +1077,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + if (ext4_has_feature_inline_data(sb)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1060,7 +1092,7 @@ got: if (err) goto fail_free_drop; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); @@ -1116,14 +1148,17 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + err = -EFSCORRUPTED; goto error; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) { - ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_warning(sb, "inode bitmap error %ld for orphan %lu", + ino, err); goto error; } @@ -1198,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); - if (!bitmap_bh) + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; continue; + } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 2468261748b2..355ef9c36c87 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -562,11 +562,10 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -EUCLEAN; + return -EFSCORRUPTED; } /* Set up for the direct block allocation */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cd944a7a99cd..d884989cc83d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); - if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 612fbcf76b5c..b3bd912df6bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -378,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func, "lblock %lu mapped to illegal pblock " "(length %d)", (unsigned long) map->m_lblk, map->m_len); - return -EIO; + return -EFSCORRUPTED; } return 0; } @@ -480,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) - return -EIO; + return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { @@ -965,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = ext4_decrypt_one(inode, page); + err = ext4_decrypt(page); return err; } #endif @@ -1181,6 +1181,38 @@ errout: return ret ? ret : copied; } +/* + * This is a private version of page_zero_new_buffers() which doesn't + * set the buffer to be dirty, since in data=journalled mode we need + * to call ext4_handle_dirty_metadata() instead. + */ +static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start = 0, block_end; + struct buffer_head *head, *bh; + + bh = head = page_buffers(page); + do { + block_end = block_start + bh->b_size; + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + clear_buffer_new(bh); + } + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -1207,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file, if (copied < len) { if (!PageUptodate(page)) copied = 0; - page_zero_new_buffers(page, from+copied, to); + zero_new_buffers(page, from+copied, to); } ret = ext4_walk_page_buffers(handle, page_buffers(page), from, @@ -1815,11 +1847,22 @@ static int ext4_writepage(struct page *page, * the page. But we may reach here when we do a journal commit via * journal_submit_inode_data_buffers() and in that case we must write * allocated buffers to achieve data=ordered mode guarantees. + * + * Also, if there is only one buffer per page (the fs block + * size == the page size), if one buffer needs block + * allocation or needs to modify the extent tree to clear the + * unwritten flag, we know that the page can't be written at + * all, so we might as well refuse the write immediately. + * Unfortunately if the block size != page size, we can't as + * easily detect this case using ext4_walk_page_buffers(), but + * for the extremely common case, this is an optimization that + * skips a useless round trip through ext4_bio_write_page(). */ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); - if (current->flags & PF_MEMALLOC) { + if ((current->flags & PF_MEMALLOC) || + (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) { /* * For memory cleaning there's no point in writing only * some buffers. So just bail out. Warn if we came here @@ -2599,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb) /* We always reserve for an inode update; the superblock could be there too */ static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) { - if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + if (likely(ext4_has_feature_large_file(inode->i_sb))) return 1; if (pos + len <= 0x7fffffffULL) @@ -3344,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, int err = 0; page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return -ENOMEM; @@ -3393,7 +3435,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!ext4_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_CACHE_SIZE); - WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + WARN_ON_ONCE(ext4_decrypt(page)); } } if (ext4_should_journal_data(inode)) { @@ -3820,7 +3862,7 @@ static int __ext4_get_inode_loc(struct inode *inode, iloc->bh = NULL; if (!ext4_valid_inum(sb, inode->i_ino)) - return -EIO; + return -EFSCORRUPTED; iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); @@ -4006,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); @@ -4068,7 +4109,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); - ret = -EIO; + ret = -EFSCORRUPTED; goto bad_inode; } } else @@ -4088,7 +4129,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { EXT4_ERROR_INODE(inode, "checksum invalid"); - ret = -EIO; + ret = -EFSBADCRC; goto bad_inode; } @@ -4130,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); @@ -4203,7 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", ei->i_file_acl); - ret = -EIO; + ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { @@ -4242,6 +4283,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } + inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; @@ -4254,7 +4296,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { - ret = -EIO; + ret = -EFSCORRUPTED; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); goto bad_inode; } @@ -4272,7 +4314,7 @@ bad_inode: struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) { if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); return ext4_iget(sb, ino); } @@ -4294,7 +4336,7 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + if (!ext4_has_feature_huge_file(sb)) return -EFBIG; if (i_blocks <= 0xffffffffffffULL) { @@ -4455,8 +4497,7 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; @@ -4505,8 +4546,7 @@ static int ext4_do_update_inode(handle_t *handle, if (err) goto out_brelse; ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); } @@ -5244,7 +5284,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = __block_page_mkwrite(vma, vmf, + ret = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); @@ -5291,7 +5331,7 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - ret = __block_page_mkwrite(vma, vmf, get_block); + ret = block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1346cfa355d0..5e872fd40e5e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode_bl->i_version = 1; i_size_write(inode_bl, 0); inode_bl->i_mode = S_IFREG; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else @@ -383,8 +382,7 @@ setversion_out: goto group_extend_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; @@ -432,8 +430,7 @@ group_extend_out: goto mext_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; @@ -470,8 +467,7 @@ mext_out: goto group_add_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; @@ -553,8 +549,7 @@ group_add_out: int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not (yet) supported with bigalloc"); return -EOPNOTSUPP; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 34b610ea5030..61eaf74dca37 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { - err = -ENOMEM; + bh[i] = ext4_read_block_bitmap_nowait(sb, group); + if (IS_ERR(bh[i])) { + err = PTR_ERR(bh[i]); + bh[i] = NULL; goto out; } mb_debug(1, "read bitmap for group %u\n", group); @@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) - err = -EIO; + int err2; + + if (!bh[i]) + continue; + err2 = ext4_wait_block_bitmap(sb, group, bh[i]); + if (!err) + err = err2; } first_block = page->index * blocks_per_page; @@ -2333,7 +2340,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } -static const struct file_operations ext4_mb_seq_groups_fops = { +const struct file_operations ext4_seq_mb_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); - BUG_ON(bh == NULL); + BUG_ON(IS_ERR_OR_NULL(bh)); memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); @@ -2661,10 +2668,6 @@ int ext4_mb_init(struct super_block *sb) if (ret != 0) goto out_free_locality_groups; - if (sbi->s_proc) - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - return 0; out_free_locality_groups: @@ -2705,9 +2708,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2896,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (!bitmap_bh) + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto out_err; + } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); @@ -3843,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, return 0; bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error(sb, "Error %d reading block bitmap for %u", + err, group); return 0; } @@ -4015,9 +4019,10 @@ repeat: } bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", - group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error(sb, "Error %d reading block bitmap for %u", + err, group); ext4_mb_unload_buddy(&e4b); continue; } @@ -4682,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); - if (flags & EXT4_FREE_BLOCKS_FORGET) { - struct buffer_head *tbh = bh; - int i; - - BUG_ON(bh && (count > 1)); + if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { + BUG_ON(count > 1); - for (i = 0; i < count; i++) { - cond_resched(); - if (!bh) - tbh = sb_find_get_block(inode->i_sb, - block + i); - if (!tbh) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, tbh, block + i); - } + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, bh, block); } /* @@ -4742,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += sbi->s_cluster_ratio - overflow; } + if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { + int i; + + for (i = 0; i < count; i++) { + cond_resched(); + bh = sb_find_get_block(inode->i_sb, block + i); + if (!bh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, bh, block + i); + } + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4761,8 +4768,9 @@ do_more: } count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto error_return; } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); @@ -4931,8 +4939,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto error_return; } diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6163ad21cb0e..a4651894cc33 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode) * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || + if (!ext4_has_feature_extents(inode->i_sb) || (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; @@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode) handle_t *handle; int ret; - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || + if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 6eb1a619890c..0a512aa81bf7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -98,10 +98,12 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC && - ext4_mmp_csum_verify(sb, mmp)) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + ret = -EFSCORRUPTED; + else if (!ext4_mmp_csum_verify(sb, mmp)) + ret = -EFSBADCRC; + else return 0; - ret = -EINVAL; warn_exit: ext4_warning(sb, "Error %d while reading MMP block %llu", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9f61e7679a6d..f27e0c2598c5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!bh) { ext4_error_inode(inode, func, line, block, "Directory hole found"); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ @@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) @@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { @@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSBADCRC); } } return bh; @@ -1429,7 +1429,7 @@ restart: } num++; bh = ext4_getblk(NULL, dir, b++, 0); - if (unlikely(IS_ERR(bh))) { + if (IS_ERR(bh)) { if (ra_max == 0) { ret = bh; goto cleanup_and_exit; @@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget_normal(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || @@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); @@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) { - res = -EIO; + res = -EFSCORRUPTED; goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ @@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if ((char *) de >= (((char *) root) + blocksize)) { EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); brelse(bh); - return -EIO; + return -EFSCORRUPTED; } len = ((char *) root) + (blocksize - csum_size) - (char *) de; @@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, goto out; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dentry, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ @@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle, while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, i)) - return -EIO; + return -EFSCORRUPTED; if (de == de_del) { if (pde) pde->rec_len = ext4_rec_len_to_disk( @@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { set_nlink(inode, 1); - EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + ext4_set_feature_dir_nlink(inode->i_sb); } } } @@ -2469,9 +2468,6 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err, credits, retries = 0; - if (!new_valid_dev(rdev)) - return -EINVAL; - err = dquot_initialize(dir); if (err) return err; @@ -2934,7 +2930,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) inode = d_inode(dentry); - retval = -EIO; + retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; @@ -3008,7 +3004,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) inode = d_inode(dentry); - retval = -EIO; + retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; @@ -3136,6 +3132,7 @@ static int ext4_symlink(struct inode *dir, if ((disk_link.len > EXT4_N_BLOCKS * 4)) { if (!encryption_required) inode->i_op = &ext4_symlink_inode_operations; + inode_nohighmem(inode); ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started @@ -3310,7 +3307,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) - return -EIO; + return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir_bh); } @@ -3352,8 +3349,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (retval) return retval; ent->de->inode = cpu_to_le32(ino); - if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) + if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 84ba4d2b3a35..090b3498638e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -52,9 +52,8 @@ void ext4_exit_pageio(void) */ static void buffer_io_error(struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), + printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", + bh->b_bdev, (unsigned long long)bh->b_blocknr); } @@ -425,6 +424,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; + int nr_to_submit = 0; blocksize = 1 << inode->i_blkbits; @@ -477,11 +477,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } set_buffer_async_write(bh); + nr_to_submit++; } while ((bh = bh->b_this_page) != head); bh = head = page_buffers(page); - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && + nr_to_submit) { data_page = ext4_encrypt(inode, page); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 560af0437704..5dc5e95063de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = ext4_decrypt(ctx, page); + int ret = ext4_decrypt(page); if (ret) { WARN_ON_ONCE(1); SetPageError(page); @@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) + mapping_gfp_constraint(mapping, GFP_KERNEL))) goto next_page; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index cf0c472047e3..ad62d7acc315 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); @@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, int mult = 3; unsigned ret; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + if (!ext4_has_feature_sparse_super(sb)) { ret = *min; *min += 1; return ret; @@ -1040,7 +1039,7 @@ exit_free: * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, int blk_off, char *data, +static void update_backups(struct super_block *sb, sector_t blk_off, char *data, int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1065,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, group = ext4_list_backups(sb, &three, &five, &seven); last = sbi->s_groups_count; } else { - group = ext4_meta_bg_first_group(sb, group) + 1; + group = ext4_get_group_number(sb, blk_off) + 1; last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); } @@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, int i, gdb_off, gdb_num, err = 0; int meta_bg; - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; @@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb, ext4_debug("free blocks count %llu", percpu_counter_read(&sbi->s_freeclusters_counter)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { + if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), @@ -1476,8 +1473,7 @@ exit_journal: int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); - int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG); + int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, @@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); - if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) { ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } @@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE) - || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + if (ext4_has_feature_resize_inode(sb) || + !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; @@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) if (err) goto errout; - EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + ext4_clear_feature_resize_inode(sb); + ext4_set_feature_meta_bg(sb); sbi->s_es->s_first_meta_bg = cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); @@ -1918,9 +1912,9 @@ retry: n_desc_blocks = num_desc_blocks(sb, n_group + 1); o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (ext4_has_feature_resize_inode(sb)) { if (meta_bg) { ext4_error(sb, "resize_inode and meta_bg enabled " "simultaneously"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a63c7b0a10cf..f1b56ff01208 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -34,7 +34,6 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> -#include <linux/proc_fs.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> @@ -54,11 +53,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> -static struct proc_dir_entry *ext4_proc_root; -static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; -static struct ext4_features *ext4_feat; static int ext4_mballoc_ready; static struct ratelimit_state ext4_mount_msg_ratelimit; @@ -83,7 +79,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); -static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { @@ -115,8 +110,7 @@ MODULE_ALIAS("ext3"); static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; @@ -394,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb) smp_wmb(); sb->s_flags |= MS_RDONLY; } - if (test_opt(sb, ERRORS_PANIC)) + if (test_opt(sb, ERRORS_PANIC)) { + if (EXT4_SB(sb)->s_journal && + !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) + return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); + } } #define ext4_error_ratelimit(sb) \ @@ -495,6 +493,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno, char *errstr = NULL; switch (errno) { + case -EFSCORRUPTED: + errstr = "Corrupt filesystem"; + break; + case -EFSBADCRC: + errstr = "Filesystem failed CRC"; + break; case -EIO: errstr = "IO failure"; break; @@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); } - if (test_opt(sb, ERRORS_PANIC)) + if (test_opt(sb, ERRORS_PANIC)) { + if (EXT4_SB(sb)->s_journal && + !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) + return; panic("EXT4-fs panic from previous error\n"); + } } void __ext4_msg(struct super_block *sb, @@ -800,6 +808,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -808,18 +817,12 @@ static void ext4_put_super(struct super_block *sb) ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!(sb->s_flags & MS_RDONLY)) ext4_commit_super(sb, 1); - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } - kobject_del(&sbi->s_kobj); - for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kvfree(sbi->s_group_desc); @@ -963,7 +966,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; @@ -1058,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); + wait & ~__GFP_DIRECT_RECLAIM); return try_to_free_buffers(page); } @@ -1288,7 +1291,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quota options when quota turned on"); return -1; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + if (ext4_has_feature_quota(sb)) { ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " "when QUOTA feature is enabled"); return -1; @@ -1381,10 +1384,10 @@ static const struct mount_opts { {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, - MOPT_EXT4_ONLY | MOPT_SET}, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), - MOPT_EXT4_ONLY | MOPT_SET}, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, @@ -1513,8 +1516,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) return -1; - if (m->flags & MOPT_EXPLICIT) - set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_EXPLICIT) { + if (m->mount_opt & EXT4_MOUNT_DELALLOC) { + set_opt2(sb, EXPLICIT_DELALLOC); + } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { + set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM); + } else + return -1; + } if (m->flags & MOPT_CLEAR_ERR) clear_opt(sb, ERRORS_MASK); if (token == Opt_noquota && sb_any_quota_loaded(sb)) { @@ -1647,8 +1656,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "quota options when quota turned on"); return -1; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_QUOTA)) { + if (ext4_has_feature_quota(sb)) { ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " "when QUOTA feature is enabled"); @@ -1656,8 +1664,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif -#ifndef CONFIG_FS_DAX } else if (token == Opt_dax) { +#ifdef CONFIG_FS_DAX + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= m->mount_opt; +#else ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; #endif @@ -1707,7 +1719,7 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + if (ext4_has_feature_quota(sb) && (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " "feature is enabled"); @@ -1880,7 +1892,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) return _ext4_show_options(seq, root->d_sb, 0); } -static int options_seq_show(struct seq_file *seq, void *offset) +int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; @@ -1891,19 +1903,6 @@ static int options_seq_show(struct seq_file *seq, void *offset) return rc; } -static int options_open_fs(struct inode *inode, struct file *file) -{ - return single_open(file, options_seq_show, PDE_DATA(inode)); -} - -static const struct file_operations ext4_seq_options_fops = { - .owner = THIS_MODULE, - .open = options_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -1944,7 +1943,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); if (sbi->s_journal) - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_set_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); done: @@ -2027,12 +2026,13 @@ failed: return 0; } -static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, +static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset; __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ @@ -2052,8 +2052,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, } /* old crc16 code */ - if (!(sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + if (!ext4_has_feature_gdt_csum(sb)) return 0; offset = offsetof(struct ext4_group_desc, bg_checksum); @@ -2063,8 +2062,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + if (ext4_has_feature_64bit(sb) && offset < le16_to_cpu(sbi->s_es->s_desc_size)) crc = crc16(crc, (__u8 *)gdp + offset, le16_to_cpu(sbi->s_es->s_desc_size) - @@ -2078,8 +2076,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && - (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), - block_group, gdp))) + (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; @@ -2090,7 +2087,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, { if (!ext4_has_group_desc_csum(sb)) return; - gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); + gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ @@ -2106,7 +2103,7 @@ static int ext4_check_descriptors(struct super_block *sb, int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); @@ -2150,7 +2147,7 @@ static int ext4_check_descriptors(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { ext4_unlock_group(sb, i); @@ -2413,8 +2410,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - nr < first_meta_bg) + if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) @@ -2470,335 +2466,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return ret; } -/* sysfs supprt */ - -struct ext4_attr { - struct attribute attr; - ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, - const char *, size_t); - union { - int offset; - int deprecated_val; - } u; -}; - -static int parse_strtoull(const char *buf, - unsigned long long max, unsigned long long *value) -{ - int ret; - - ret = kstrtoull(skip_spaces(buf), 0, value); - if (!ret && *value > max) - ret = -EINVAL; - return ret; -} - -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) EXT4_C2B(sbi, - percpu_counter_sum(&sbi->s_dirtyclusters_counter))); -} - -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - sbi->s_sectors_written_start) >> 1); -} - -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1))); -} - -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long t; - int ret; - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret) - return ret; - - if (t && (!is_power_of_2(t) || t > 0x40000000)) - return -EINVAL; - - sbi->s_inode_readahead_blks = t; - return count; -} - -static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t sbi_ui_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); - unsigned long t; - int ret; - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret) - return ret; - *ui = t; - return count; -} - -static ssize_t es_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - - unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + - a->u.offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t reserved_clusters_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); -} - -static ssize_t reserved_clusters_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long long val; - int ret; - - if (parse_strtoull(buf, -1ULL, &val)) - return -EINVAL; - ret = ext4_reserve_clusters(sbi, val); - - return ret ? ret : count; -} - -static ssize_t trigger_test_error(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - int len = count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (len && buf[len-1] == '\n') - len--; - - if (len) - ext4_error(sbi->s_sb, "%.*s", len, buf); - return count; -} - -static ssize_t sbi_deprecated_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); -} - -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .u = { \ - .offset = offsetof(struct ext4_sb_info, _elname),\ - }, \ -} - -#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .u = { \ - .offset = offsetof(struct ext4_super_block, _elname), \ - }, \ -} - -#define EXT4_ATTR(name, mode, show, store) \ -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) - -#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) - -#define EXT4_RO_ATTR_ES_UI(name, elname) \ - EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) -#define EXT4_RW_ATTR_SBI_UI(name, elname) \ - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) - -#define ATTR_LIST(name) &ext4_attr_##name.attr -#define EXT4_DEPRECATED_ATTR(_name, _val) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = sbi_deprecated_show, \ - .u = { \ - .deprecated_val = _val, \ - }, \ -} - -EXT4_RO_ATTR(delayed_allocation_blocks); -EXT4_RO_ATTR(session_write_kbytes); -EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RW_ATTR(reserved_clusters); -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, - inode_readahead_blks_store, s_inode_readahead_blks); -EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); -EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); -EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); - -static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), - ATTR_LIST(session_write_kbytes), - ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(reserved_clusters), - ATTR_LIST(inode_readahead_blks), - ATTR_LIST(inode_goal), - ATTR_LIST(mb_stats), - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), - ATTR_LIST(max_writeback_mb_bump), - ATTR_LIST(extent_max_zeroout_kb), - ATTR_LIST(trigger_fs_error), - ATTR_LIST(err_ratelimit_interval_ms), - ATTR_LIST(err_ratelimit_burst), - ATTR_LIST(warning_ratelimit_interval_ms), - ATTR_LIST(warning_ratelimit_burst), - ATTR_LIST(msg_ratelimit_interval_ms), - ATTR_LIST(msg_ratelimit_burst), - ATTR_LIST(errors_count), - ATTR_LIST(first_error_time), - ATTR_LIST(last_error_time), - NULL, -}; - -/* Features this copy of ext4 supports */ -EXT4_INFO_ATTR(lazy_itable_init); -EXT4_INFO_ATTR(batched_discard); -EXT4_INFO_ATTR(meta_bg_resize); -EXT4_INFO_ATTR(encryption); - -static struct attribute *ext4_feat_attrs[] = { - ATTR_LIST(lazy_itable_init), - ATTR_LIST(batched_discard), - ATTR_LIST(meta_bg_resize), - ATTR_LIST(encryption), - NULL, -}; - -static ssize_t ext4_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void ext4_sb_release(struct kobject *kobj) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -static const struct sysfs_ops ext4_attr_ops = { - .show = ext4_attr_show, - .store = ext4_attr_store, -}; - -static struct kobj_type ext4_ktype = { - .default_attrs = ext4_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_sb_release, -}; - -static void ext4_feat_release(struct kobject *kobj) -{ - complete(&ext4_feat->f_kobj_unregister); -} - -static ssize_t ext4_feat_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "supported\n"); -} - -/* - * We can not use ext4_attr_show/store because it relies on the kobject - * being embedded in the ext4_sb_info structure which is definitely not - * true in this case. - */ -static const struct sysfs_ops ext4_feat_ops = { - .show = ext4_feat_show, - .store = NULL, -}; - -static struct kobj_type ext4_feat_ktype = { - .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_feat_ops, - .release = ext4_feat_release, -}; - /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. @@ -2807,7 +2474,7 @@ static struct kobj_type ext4_feat_ktype = { */ static int ext4_feature_set_ok(struct super_block *sb, int readonly) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", @@ -2819,14 +2486,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (readonly) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= MS_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & @@ -2837,7 +2504,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) * Large file size enabled file system can only be mounted * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (ext4_has_feature_huge_file(sb)) { if (sizeof(blkcnt_t) < sizeof(u64)) { ext4_msg(sb, KERN_ERR, "Filesystem with huge files " "cannot be mounted RDWR without " @@ -2845,8 +2512,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && - !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); @@ -2854,8 +2520,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) } #ifndef CONFIG_QUOTA - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !readonly) { + if (ext4_has_feature_quota(sb) && !readonly) { ext4_msg(sb, KERN_ERR, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); @@ -3312,7 +2977,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + if (!ext4_has_feature_bigalloc(sb)) return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + sbi->s_itb_per_group + 2); @@ -3403,10 +3068,10 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } - -static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) +static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. @@ -3414,8 +3079,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - return 0; + if (!ext4_has_feature_extents(sb)) + return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run @@ -3424,26 +3089,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> - EXT4_SB(sb)->s_cluster_bits; + resv_clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); - return resv_clusters; -} - - -static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) -{ - ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> - sbi->s_cluster_bits; - - if (count >= clusters) - return -EINVAL; - - atomic64_set(&sbi->s_resv_clusters, count); - return 0; + atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static int ext4_fill_super(struct super_block *sb, void *data, int silent) @@ -3526,9 +3178,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Warn if metadata_csum and gdt_csum are both set. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_feature_metadata_csum(sb) && + ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); @@ -3541,8 +3192,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_feature_metadata_csum(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3557,11 +3207,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); silent = 1; + ret = -EFSBADCRC; goto cantfind_ext4; } /* Precompute checksum seed for all metadata */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_csum_seed(sb)) + sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); + else if (ext4_has_metadata_csum(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3664,17 +3317,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && - (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + (ext4_has_compat_features(sb) || + ext4_has_ro_compat_features(sb) || + ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { + if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); goto failed_mount; @@ -3732,8 +3384,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && - es->s_encryption_level) { + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); goto failed_mount; @@ -3765,8 +3416,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); @@ -3790,7 +3440,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { @@ -3821,7 +3471,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; @@ -3841,8 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC); + has_bigalloc = ext4_has_feature_bigalloc(sb); if (has_bigalloc) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, @@ -3961,13 +3610,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (ext4_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("options", S_IRUGO, sbi->s_proc, - &ext4_seq_options_fops, sb); - bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3982,6 +3624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + ret = -EFSCORRUPTED; goto failed_mount2; } @@ -4007,7 +3650,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; @@ -4021,11 +3664,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || - EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_RECOVER)); + ext4_has_feature_journal_needs_recovery(sb)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && - !(sb->s_flags & MS_RDONLY)) + if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) goto failed_mount3a; @@ -4033,23 +3674,47 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && - EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount_wq; } else { + /* Nojournal mode, all journal mount options are illegal */ + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_checksum, fs mounted w/o journal"); + goto failed_mount_wq; + } + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit, fs mounted w/o journal"); + goto failed_mount_wq; + } + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "commit=%lu, fs mounted w/o journal", + sbi->s_commit_interval / HZ); + goto failed_mount_wq; + } + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "data=, fs mounted w/o journal"); + goto failed_mount_wq; + } + sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM; + clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && + if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); @@ -4101,18 +3766,16 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || - EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && (blocksize != PAGE_CACHE_SIZE)) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs encryption"); goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && - !(sb->s_flags & MS_RDONLY) && - !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && + !ext4_has_feature_encrypt(sb)) { + ext4_set_feature_encrypt(sb); ext4_commit_super(sb, 1); } @@ -4171,8 +3834,7 @@ no_journal: if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) sbi->s_want_extra_isize = @@ -4192,12 +3854,7 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; - } + ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); if (err) { @@ -4236,7 +3893,7 @@ no_journal: goto failed_mount6; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " @@ -4248,17 +3905,13 @@ no_journal: if (err) goto failed_mount6; - sbi->s_kobj.kset = ext4_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, - "%s", sb->s_id); + err = ext4_register_sysfs(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !(sb->s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; @@ -4313,7 +3966,7 @@ cantfind_ext4: #ifdef CONFIG_QUOTA failed_mount8: - kobject_del(&sbi->s_kobj); + ext4_unregister_sysfs(sb); #endif failed_mount7: ext4_unregister_li_request(sb); @@ -4353,10 +4006,6 @@ failed_mount2: failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -4403,7 +4052,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent @@ -4453,7 +4102,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) @@ -4545,7 +4194,7 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -4562,7 +4211,7 @@ static int ext4_load_journal(struct super_block *sb, * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb->s_flags & MS_RDONLY) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); @@ -4593,7 +4242,7 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); @@ -4707,7 +4356,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, { journal_t *journal = EXT4_SB(sb)->s_journal; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (!ext4_has_feature_journal(sb)) { BUG_ON(journal != NULL); return; } @@ -4715,9 +4364,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + if (ext4_has_feature_journal_needs_recovery(sb) && sb->s_flags & MS_RDONLY) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } @@ -4737,7 +4386,7 @@ static void ext4_clear_journal_err(struct super_block *sb, int j_errno; const char *errstr; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); journal = EXT4_SB(sb)->s_journal; @@ -4852,7 +4501,7 @@ static int ext4_freeze(struct super_block *sb) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); } error = ext4_commit_super(sb, 1); @@ -4874,7 +4523,7 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_set_feature_journal_needs_recovery(sb); } ext4_commit_super(sb, 1); @@ -5027,8 +4676,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_mark_recovery_complete(sb, es); } else { /* Make sure we can mount this feature set readwrite */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_READONLY) || + if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; @@ -5044,9 +4692,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", - g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); - err = -EINVAL; + err = -EFSBADCRC; goto restore_opts; } } @@ -5076,8 +4724,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { err = -EROFS; @@ -5110,8 +4757,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); - else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_QUOTA)) { + else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; @@ -5255,7 +4901,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot) struct ext4_sb_info *sbi = EXT4_SB(sb); /* Are we journaling quotas? */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + if (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); @@ -5343,7 +4989,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); + BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; @@ -5537,11 +5183,11 @@ static inline void unregister_as_ext2(void) static inline int ext2_feature_set_ok(struct super_block *sb) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb->s_flags & MS_RDONLY) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } @@ -5566,13 +5212,13 @@ static inline void unregister_as_ext3(void) static inline int ext3_feature_set_ok(struct super_block *sb) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + if (!ext4_has_feature_journal(sb)) return 0; if (sb->s_flags & MS_RDONLY) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } @@ -5586,37 +5232,6 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -static int __init ext4_init_feat_adverts(void) -{ - struct ext4_features *ef; - int ret = -ENOMEM; - - ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); - if (!ef) - goto out; - - ef->f_kobj.kset = ext4_kset; - init_completion(&ef->f_kobj_unregister); - ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, - "features"); - if (ret) { - kfree(ef); - goto out; - } - - ext4_feat = ef; - ret = 0; -out: - return ret; -} - -static void ext4_exit_feat_adverts(void) -{ - kobject_put(&ext4_feat->f_kobj); - wait_for_completion(&ext4_feat->f_kobj_unregister); - kfree(ext4_feat); -} - /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; @@ -5643,21 +5258,15 @@ static int __init ext4_init_fs(void) err = ext4_init_pageio(); if (err) - goto out7; + goto out5; err = ext4_init_system_zone(); if (err) - goto out6; - ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) { - err = -ENOMEM; - goto out5; - } - ext4_proc_root = proc_mkdir("fs/ext4", NULL); + goto out4; - err = ext4_init_feat_adverts(); + err = ext4_init_sysfs(); if (err) - goto out4; + goto out3; err = ext4_init_mballoc(); if (err) @@ -5682,16 +5291,12 @@ out1: ext4_mballoc_ready = 0; ext4_exit_mballoc(); out2: - ext4_exit_feat_adverts(); -out4: - if (ext4_proc_root) - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); -out5: + ext4_exit_sysfs(); +out3: ext4_exit_system_zone(); -out6: +out4: ext4_exit_pageio(); -out7: +out5: ext4_exit_es(); return err; @@ -5706,9 +5311,7 @@ static void __exit ext4_exit_fs(void) unregister_filesystem(&ext4_fs_type); destroy_inodecache(); ext4_exit_mballoc(); - ext4_exit_feat_adverts(); - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); + ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index c677f2c1044b..6f7ee30a89ce 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,17 +23,21 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *ext4_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; - struct inode *inode = d_inode(dentry); struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 plen, max_size = inode->i_sb->s_blocksize; + if (!dentry) + return ERR_PTR(-ECHILD); + res = ext4_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -45,19 +49,19 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; } /* Symlink is encrypted */ sd = (struct ext4_encrypted_symlink_data *)caddr; cstr.name = sd->encrypted_path; - cstr.len = le32_to_cpu(sd->len); + cstr.len = le16_to_cpu(sd->len); if ((cstr.len + sizeof(struct ext4_encrypted_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ - res = -EIO; + res = -EFSCORRUPTED; goto errout; } plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? @@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } kfree(paddr); return ERR_PTR(res); } const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_encrypted_follow_link, - .put_link = kfree_put_link, + .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c new file mode 100644 index 000000000000..1420a3c614af --- /dev/null +++ b/fs/ext4/sysfs.c @@ -0,0 +1,448 @@ +/* + * linux/fs/ext4/sysfs.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Theodore Ts'o (tytso@mit.edu) + * + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +typedef enum { + attr_noop, + attr_delayed_allocation_blocks, + attr_session_write_kbytes, + attr_lifetime_write_kbytes, + attr_reserved_clusters, + attr_inode_readahead, + attr_trigger_test_error, + attr_feature, + attr_pointer_ui, + attr_pointer_atomic, +} attr_id_t; + +typedef enum { + ptr_explicit, + ptr_ext4_sb_info_offset, + ptr_ext4_super_block_offset, +} attr_ptr_t; + +static const char *proc_dirname = "fs/ext4"; +static struct proc_dir_entry *ext4_proc_root; + +struct ext4_attr { + struct attribute attr; + short attr_id; + short attr_ptr; + union { + int offset; + void *explicit_ptr; + } u; +}; + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (t && (!is_power_of_2(t) || t > 0x40000000)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, &val); + if (!ret || val >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, val); + return count; +} + +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +#define EXT4_ATTR(_name,_mode,_id) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} + +#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name) + +#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature) + +#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + +#define EXT4_RO_ATTR_ES_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) + +#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) + +#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_explicit, \ + .u = { \ + .explicit_ptr = _ptr, \ + }, \ +} + +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); +EXT4_ATTR_FUNC(session_write_kbytes, 0444); +EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); +EXT4_ATTR_FUNC(reserved_clusters, 0644); + +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, + ext4_sb_info, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); + +static unsigned int old_bump_val = 128; +EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_ATTR_FEATURE(lazy_itable_init); +EXT4_ATTR_FEATURE(batched_discard); +EXT4_ATTR_FEATURE(meta_bg_resize); +EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(metadata_csum_seed); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), + ATTR_LIST(metadata_csum_seed), + NULL, +}; + +static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) +{ + switch (a->attr_ptr) { + case ptr_explicit: + return a->u.explicit_ptr; + case ptr_ext4_sb_info_offset: + return (void *) (((char *) sbi) + a->u.offset); + case ptr_ext4_super_block_offset: + return (void *) (((char *) sbi->s_es) + a->u.offset); + } + return NULL; +} + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + void *ptr = calc_ptr(a, sbi); + + switch (a->attr_id) { + case attr_delayed_allocation_blocks: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + case attr_session_write_kbytes: + return session_write_kbytes_show(a, sbi, buf); + case attr_lifetime_write_kbytes: + return lifetime_write_kbytes_show(a, sbi, buf); + case attr_reserved_clusters: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) + atomic64_read(&sbi->s_resv_clusters)); + case attr_inode_readahead: + case attr_pointer_ui: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned int *) ptr)); + case attr_pointer_atomic: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", + atomic_read((atomic_t *) ptr)); + case attr_feature: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + void *ptr = calc_ptr(a, sbi); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(a, sbi, buf, len); + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned int *) ptr) = t; + return len; + case attr_inode_readahead: + return inode_readahead_blks_store(a, sbi, buf, len); + case attr_trigger_test_error: + return trigger_test_error(a, sbi, buf, len); + } + return 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_sb_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static struct kobj_type ext4_ktype = { + .sysfs_ops = &ext4_attr_ops, +}; + +static struct kset ext4_kset = { + .kobj = {.ktype = &ext4_ktype}, +}; + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, +}; + +static struct kobject ext4_feat = { + .kset = &ext4_kset, +}; + +#define PROC_FILE_SHOW_DEFN(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ +} \ +\ +static const struct file_operations ext4_seq_##name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define PROC_FILE_LIST(name) \ + { __stringify(name), &ext4_seq_##name##_fops } + +PROC_FILE_SHOW_DEFN(es_shrinker_info); +PROC_FILE_SHOW_DEFN(options); + +static struct ext4_proc_files { + const char *name; + const struct file_operations *fops; +} proc_files[] = { + PROC_FILE_LIST(options), + PROC_FILE_LIST(es_shrinker_info), + PROC_FILE_LIST(mb_groups), + { NULL, NULL }, +}; + +int ext4_register_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_proc_files *p; + int err; + + sbi->s_kobj.kset = &ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) { + for (p = proc_files; p->name; p++) + proc_create_data(p->name, S_IRUGO, sbi->s_proc, + p->fops, sb); + } + return 0; +} + +void ext4_unregister_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_proc_files *p; + + if (sbi->s_proc) { + for (p = proc_files; p->name; p++) + remove_proc_entry(p->name, sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); +} + +int __init ext4_init_sysfs(void) +{ + int ret; + + kobject_set_name(&ext4_kset.kobj, "ext4"); + ext4_kset.kobj.parent = fs_kobj; + ret = kset_register(&ext4_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&ext4_kset); + else + ext4_proc_root = proc_mkdir(proc_dirname, NULL); + return ret; +} + +void ext4_exit_sysfs(void) +{ + kobject_put(&ext4_feat); + kset_unregister(&ext4_kset); + remove_proc_entry(proc_dirname, NULL); + ext4_proc_root = NULL; +} + diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 16e28c08d1e8..a95151e875bd 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -68,10 +68,8 @@ printk("\n"); \ } while (0) # define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ + printk(KERN_DEBUG "block %pg:%lu: ", \ + bh->b_bdev, (unsigned long) bh->b_blocknr); \ printk(f); \ printk("\n"); \ } while (0) @@ -195,7 +193,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) - return -EIO; + return -EFSCORRUPTED; e = next; } @@ -205,7 +203,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, (void *)e + sizeof(__u32) || value_start + le16_to_cpu(entry->e_value_offs) + le32_to_cpu(entry->e_value_size) > end)) - return -EIO; + return -EFSCORRUPTED; entry = EXT4_XATTR_NEXT(entry); } @@ -222,9 +220,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EIO; + return -EFSCORRUPTED; if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) - return -EIO; + return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); if (!error) @@ -239,7 +237,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) if (entry->e_value_block != 0 || value_size > size || le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EIO; + return -EFSCORRUPTED; return 0; } @@ -266,7 +264,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, } *pentry = entry; if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EIO; + return -EFSCORRUPTED; return cmp ? -ENODATA : 0; } @@ -297,13 +295,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; @@ -404,20 +402,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(dentry, buffer, rest, - entry->e_name, - entry->e_name_len, - handler->flags); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) return -ERANGE; - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } } - return buffer_size - rest; + return buffer_size - rest; /* total size */ } static int @@ -445,7 +447,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } ext4_xattr_cache_insert(ext4_mb_cache, bh); @@ -525,12 +527,12 @@ errout: static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_set_feature_xattr(sb); ext4_handle_dirty_super(handle, sb); } } @@ -751,7 +753,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (ext4_xattr_check_block(inode, bs->bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } /* Find the named attribute. */ @@ -811,7 +813,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, bs->bh); } unlock_buffer(bs->bh); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_xattr_block(handle, @@ -855,7 +857,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } error = ext4_xattr_set_entry(i, s); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; @@ -1314,7 +1316,7 @@ retry: if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } base = BHDR(bh); @@ -1579,7 +1581,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; + return -EFSCORRUPTED; if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 95d90e0560f0..3e81bdca071a 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -11,38 +11,20 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int -ext4_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +ext4_xattr_security_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } static int -ext4_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext4_xattr_security_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -76,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, .set = ext4_xattr_security_set, }; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 891ee2ddfbd6..2a3c6f9b8cb8 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -12,40 +12,26 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) +static bool +ext4_xattr_trusted_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int -ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +ext4_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, void *buffer, + size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } static int -ext4_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext4_xattr_trusted_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 6ed932b3c043..d152f431e432 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -11,30 +11,17 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) +static bool +ext4_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int -ext4_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +ext4_xattr_user_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, @@ -42,11 +29,10 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name, } static int -ext4_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +ext4_xattr_user_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c5a38e352a80..3842af954cd5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -47,7 +47,8 @@ repeat: /* * We guarantee no failure on the returned page. */ -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; @@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) .blk_addr = index, .encrypted_page = NULL, }; + + if (unlikely(!is_meta)) + fio.rw &= ~REQ_META; repeat: page = grab_cache_page(mapping, index); if (!page) { @@ -91,6 +95,17 @@ out: return page; } +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +/* for POR only */ +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -125,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) { block_t prev_blk_addr = 0; struct page *page; @@ -133,10 +149,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + if (unlikely(type == META_POR)) + fio.rw &= ~REQ_META; + for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -196,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -218,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); return 0; @@ -257,7 +276,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { @@ -277,6 +296,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (prev == LONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -297,13 +323,14 @@ continue_unlock: break; } nwritten++; + prev = page->index; if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); cond_resched(); } - +stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); @@ -383,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -407,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi) { struct ino_entry *e, *tmp; int i; @@ -495,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); @@ -695,47 +722,48 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(fi, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(fi, flag); + list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || + !is_inode_flag_set(F2FS_I(inode), flag)) + return; + + list_del_init(&fi->dirty_list); + clear_inode_flag(fi, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } @@ -743,70 +771,60 @@ out: void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); + spin_lock(&sbi->inode_lock[DIR_INODE]); + __add_dirty_inode(inode, DIR_INODE); + spin_unlock(&sbi->inode_lock[DIR_INODE]); } -void remove_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) - return; + struct f2fs_inode_info *fi = F2FS_I(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { + clear_inode_flag(fi, FI_DELAY_IPUT); iput(inode); } } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -841,11 +859,9 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) goto out; - } goto retry_flush_dents; } @@ -858,10 +874,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, 0, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -892,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -918,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -1000,6 +1015,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); + /* need to wait for end_io results */ + wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1026,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -1049,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_dirty_inode(sbi); + release_ino_entry(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1072,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); @@ -1101,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1109,9 +1137,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason == CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); + + /* do checkpoint periodically */ + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 9f77de2ef317..5de2d866a25c 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -122,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode) struct key *keyring_key = NULL; struct f2fs_encryption_key *master_key; struct f2fs_encryption_context ctx; - struct user_key_payload *ukp; + const struct user_key_payload *ukp; struct crypto_ablkcipher *ctfm; const char *cipher_str; char raw_key[F2FS_MAX_KEY_SIZE]; @@ -199,7 +199,7 @@ retry: } crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); - ukp = ((struct user_key_payload *)keyring_key->payload.data); + ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { res = -EINVAL; goto out; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a82abe921b89..ac9e7c6aac74 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn) /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + if (set_page_dirty(node_page)) + dn->node_changed = true; } int reserve_new_block(struct dnode_of_data *dn) @@ -275,7 +276,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int rw, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -292,7 +294,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return read_mapping_page(mapping, index, NULL); - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); @@ -352,7 +354,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, false); if (IS_ERR(page)) return page; @@ -372,12 +374,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -410,8 +413,8 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: - page = grab_cache_page(mapping, index); + + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released @@ -439,17 +442,16 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: - if (new_i_size && - i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } @@ -489,16 +491,13 @@ alloc: /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, +static int __allocate_data_blocks(struct inode *inode, loff_t offset, size_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -507,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, u64 len = F2FS_BYTES_TO_BLK(count); bool allocated; u64 end_offset; + int err = 0; while (len) { - f2fs_balance_fs(sbi); f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) goto out; allocated = false; @@ -523,9 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) + err = __allocate_data_block(&dn); + if (err) goto sync_out; allocated = true; } @@ -539,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); } - return; + return err; sync_out: if (allocated) @@ -548,7 +556,8 @@ sync_out: f2fs_put_dnode(&dn); out: f2fs_unlock_op(sbi); - return; + f2fs_balance_fs(sbi, dn.node_changed); + return err; } /* @@ -560,16 +569,18 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; struct extent_info ei; bool allocated = false; + block_t blkaddr; map->m_len = 0; map->m_flags = 0; @@ -585,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -595,50 +606,59 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) { - err = -ENOENT; - goto put_out; - } else if (flag == F2FS_GET_BLOCK_READ || - flag == F2FS_GET_BLOCK_DIO) { - goto put_out; + + if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto put_out; + } + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map->m_flags = F2FS_MAP_NEW; + } else { + if (flag != F2FS_GET_BLOCK_FIEMAP || + dn.data_blkaddr != NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; + goto put_out; + } + + /* + * preallocated unwritten block should be mapped + * for fiemap. + */ + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags = F2FS_MAP_UNWRITTEN; } - /* - * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), - * mark it as mapped and unwritten block. - */ } - if (dn.data_blkaddr != NULL_ADDR) { - map->m_flags = F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; - } else if (create) { - err = __allocate_data_block(&dn); - if (err) - goto put_out; - allocated = true; - map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - } else { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; - } + map->m_flags |= F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + map->m_len = 1; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - map->m_len = 1; dn.ofs_in_node++; pgofs++; get_next: + if (map->m_len >= maxblocks) + goto sync_out; + if (dn.ofs_in_node >= end_offset) { if (allocated) sync_inode_page(&dn); allocated = false; f2fs_put_dnode(&dn); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + f2fs_lock_op(sbi); + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { @@ -647,43 +667,56 @@ get_next: goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && - flag != F2FS_GET_BLOCK_FIEMAP) - goto put_out; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR && create) { + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } err = __allocate_data_block(&dn); if (err) goto sync_out; allocated = true; map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; - } - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } + + /* Give more consecutive addresses for the readahead */ + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + map->m_len++; + goto get_next; + } + sync_out: if (allocated) sync_inode_page(&dn); put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + } out: trace_f2fs_map_blocks(inode, map, err); return err; @@ -723,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP); } @@ -742,28 +779,36 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) return ret; + if (f2fs_has_inline_data(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + return ret; + } + mutex_lock(&inode->i_mutex); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; @@ -775,59 +820,37 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk++) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. + */ + flags |= FIEMAP_EXTENT_LAST; + } - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; + if (start_blk > last_blk || ret) + goto out; - start_blk += logical_to_blk(inode, size); + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -903,7 +926,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, false)) + if (f2fs_map_blocks(inode, &map, 0, + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -936,21 +960,14 @@ submit_and_realloc: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - struct page *cpage; ctx = f2fs_get_crypto_ctx(inode); if (IS_ERR(ctx)) goto set_error_page; /* wait the page to be moved by cleaning */ - cpage = find_lock_page( - META_MAPPING(F2FS_I_SB(inode)), - block_nr); - if (cpage) { - f2fs_wait_on_page_writeback(cpage, - DATA); - f2fs_put_page(cpage, 1); - } + f2fs_wait_on_encrypted_page_writeback( + F2FS_I_SB(inode), block_nr); } bio = bio_alloc(GFP_KERNEL, @@ -1012,6 +1029,9 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; + struct page *page = list_entry(pages->prev, struct page, lru); + + trace_f2fs_readpages(inode, page, nr_pages); /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) @@ -1041,6 +1061,11 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), + fio->blk_addr); + fio->encrypted_page = f2fs_encrypt(inode, fio->page); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); @@ -1056,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio) */ if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); @@ -1152,10 +1178,11 @@ out: if (err) ClearPageUptodate(page); unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + remove_dirty_inode(inode); + } return 0; redirty_out: @@ -1327,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; @@ -1342,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (locked) mutex_unlock(&sbi->writepages); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; @@ -1355,11 +1386,83 @@ skip_write: static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); + + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); + } +} + +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (f2fs_has_inline_data(inode) || + (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + bool restart = false; + + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || (!err && dn.data_blkaddr == NULL_ADDR)) + restart = true; + if (restart) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, @@ -1369,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1397,38 +1498,28 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; + if (need_balance && has_not_enough_free_secs(sbi, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; } - err = f2fs_get_block(&dn, index); - if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - f2fs_wait_on_page_writeback(page, DATA); + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (len == PAGE_CACHE_SIZE) goto out_update; if (PageUptodate(page)) @@ -1443,14 +1534,14 @@ put_next: goto out_update; } - if (dn.data_blkaddr == NEW_ADDR) { + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, + .blk_addr = blkaddr, .page = page, .encrypted_page = NULL, }; @@ -1481,10 +1572,6 @@ out_clear: clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1509,6 +1596,7 @@ static int f2fs_write_end(struct file *file, } f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1536,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int err; /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; @@ -1551,10 +1637,14 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == WRITE) - __allocate_data_blocks(inode, offset, count); + if (iov_iter_rw(iter) == WRITE) { + err = __allocate_data_blocks(inode, offset, count); + if (err) + goto out; + } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); +out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); @@ -1636,12 +1726,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + if (f2fs_has_inline_data(inode)) + return 0; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + return generic_block_bmap(mapping, block, get_data_block_bmap); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d013d8479753..4fb6ef88a34f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,17 +33,20 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_largest = atomic_read(&sbi->read_hit_largest); - si->hit_cached = atomic_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_cached = atomic64_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->total_ext = atomic64_read(&sbi->total_hit_ext); + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -118,7 +121,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div_u64(bimodal, dist); + si->bimodal = div64_u64(bimodal, dist); if (si->dirty_count) si->avg_vblocks = div_u64(total_vblocks, ndirty); else @@ -189,18 +192,18 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n", + si->sbi->sb->s_bdev, i++); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -269,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -283,15 +285,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); seq_puts(s, "\nExtent Cache:\n"); - seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, si->hit_rbtree); - seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", !si->total_ext ? 0 : - (si->hit_total * 100) / si->total_ext, + div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -299,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", @@ -333,13 +337,13 @@ static int stat_show(struct seq_file *s, void *v) /* memory footprint */ update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB\n", + seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); - seq_printf(s, " - static: %u KB\n", + seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %u KB\n", + seq_printf(s, " - cached: %llu KB\n", si->cache_mem >> 10); - seq_printf(s, " - paged : %u KB\n", + seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } mutex_unlock(&f2fs_stat_mutex); @@ -378,10 +382,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic_set(&sbi->total_hit_ext, 0); - atomic_set(&sbi->read_hit_rbtree, 0); - atomic_set(&sbi->read_hit_largest, 0); - atomic_set(&sbi->read_hit_cached, 0); + atomic64_set(&sbi->total_hit_ext, 0); + atomic64_set(&sbi->read_hit_rbtree, 0); + atomic64_set(&sbi->read_hit_largest, 0); + atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); @@ -406,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8f15fc134040..faa7495e2d7e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, namehash = f2fs_dentry_hash(&name); - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, goto out; max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + F2FS_I(dir)->i_current_depth = max_depth; + mark_inode_dirty(dir); + } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, &fname, res_page); @@ -258,7 +264,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) if (f2fs_has_inline_dentry(dir)) return f2fs_parent_inline_dir(dir, p); - page = get_lock_data_page(dir, 0); + page = get_lock_data_page(dir, 0, false); if (IS_ERR(page)) return NULL; @@ -444,7 +450,7 @@ error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); remove_inode_page(inode); return ERR_PTR(err); } @@ -630,6 +636,7 @@ fail: f2fs_put_page(dentry_page, 1); out: f2fs_fname_free_filename(&fname); + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -740,7 +750,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx); + dentry_page = get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -787,7 +797,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, else d_type = DT_UNKNOWN; - /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -795,12 +804,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; + de_name.name = kmalloc(de_name.len, GFP_NOFS); + if (!de_name.name) + return false; + + memcpy(de_name.name, d->filename[bit_pos], de_name.len); + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, &de_name, fstr); - de_name = *fstr; - fstr->len = save_len; + kfree(de_name.name); if (ret < 0) return true; + + de_name = *fstr; + fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, @@ -847,26 +864,28 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { - dentry_page = get_lock_data_page(inode, n); - if (IS_ERR(dentry_page)) - continue; + dentry_page = get_lock_data_page(inode, n, false); + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } out: f2fs_fname_crypto_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 997ac86f2a1d..ccd5c636d3fe 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) @@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { @@ -152,45 +154,45 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } -static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +static void __drop_largest_extent(struct inode *inode, + pgoff_t fofs, unsigned int len) { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) largest->len = 0; } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; set_extent_info(&ei, le32_to_cpu(i_ext->fofs), le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -201,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -350,8 +353,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (en) { - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; } return en; @@ -388,18 +390,17 @@ do_insert: if (!en) return NULL; - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; return en; } -unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static unsigned int f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; @@ -409,6 +410,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!et) return false; + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + write_lock(&et->lock); if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { @@ -419,148 +422,99 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, prev = et->largest; dei.len = 0; - /* we do not guarantee that the largest extent is cached all the time */ - __drop_largest_extent(inode, fofs); + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, &insert_p, &insert_parent); - if (!en) { - if (next_en) { - en = next_en; - f2fs_bug_on(sbi, en->ei.fofs <= pos); - pos = en->ei.fofs; - } else { - /* - * skip searching in the tree since there is no - * larger extent node in the cache. - */ - goto update_extent; - } - } + if (!en) + en = next_en; /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ - while (en) { - struct rb_node *node; + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ - if (pos >= end) - break; + next_en = en1 = NULL; dei = en->ei; - en1 = en2 = NULL; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, pos >= org_end); - node = rb_next(&en->rb_node); + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + en->ei.len = pos - en->ei.fofs; + prev_en = en; + parts = 1; + } - /* - * 2.1 there are four cases when we invalidate blkaddr in extent - * node, |V: valid address, X: will be invalidated| - */ - /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ - if (pos > dei.fofs && end >= dei.fofs + dei.len) { - en->ei.len = pos - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; + if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (parts) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + org_end - end); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + next_en = en1; + } else { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + next_en = en; } - - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + parts++; } - /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ - if (pos <= dei.fofs && end < dei.fofs + dei.len) { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; - } + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + next_en = node ? + rb_entry(node, struct extent_node, rb_node) + : NULL; } - __detach_extent_node(sbi, et, en); + if (parts) + __try_update_largest_extent(et, en); + else + __detach_extent_node(sbi, et, en); /* - * if we remove node in rb-tree, our parent node pointer may - * point the wrong place, discard them. + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. */ - insert_p = NULL; - insert_parent = NULL; - - /* case#3, invalidate entire extent node |XXXXXXXXXX| */ - if (pos <= dei.fofs && end >= dei.fofs + dei.len) { - if (__is_extent_same(&dei, &et->largest)) - et->largest.len = 0; - goto update; + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; } - /* - * case#4, invalidate data in the middle of extent node - * |VVVXXXXVVV| - */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - unsigned int endofs; - - /* insert left part of split extent into cache */ - if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - pos - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len; - if (endofs - end >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - endofs - end); - en2 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - } -update: - /* 2.2 update in global extent list */ + /* update in global extent list */ spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) + if (!parts && !list_empty(&en->list)) list_del(&en->list); if (en1) list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); - /* 2.3 release extent node */ - if (en) + /* release extent node */ + if (!parts) kmem_cache_free(extent_node_slab, en); -next: - en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; - next_en = en; - if (en) - pos = en->ei.fofs; + + en = next_en; } -update_extent: /* 3. update extent in extent cache */ if (blkaddr) { struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en3 = __try_merge_extent_node(sbi, et, &ei, &den, + en1 = __try_merge_extent_node(sbi, et, &ei, &den, prev_en, next_en); - if (!en3) - en3 = __insert_extent_tree(sbi, et, &ei, + if (!en1) + en1 = __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ @@ -572,11 +526,11 @@ update_extent: } spin_lock(&sbi->extent_lock); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); + if (en1) { + if (list_empty(&en1->list)) + list_add_tail(&en1->list, &sbi->extent_list); else - list_move_tail(&en3->list, &sbi->extent_list); + list_move_tail(&en1->list, &sbi->extent_list); } if (den && !list_empty(&den->list)) list_del(&den->list); @@ -597,45 +551,44 @@ update_extent: unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_tree *et, *next; struct extent_node *en, *tmp; unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; int remained; + bool do_free = false; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + } - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -647,10 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!remained--) break; list_del_init(&en->list); + do_free = true; } spin_unlock(&sbi->extent_lock); - while ((found = radix_tree_gang_lookup(root, + if (do_free == false) + goto unlock_out; + + /* + * reset ino for searching victims from beginning of global extent tree. + */ + ino = F2FS_ROOT_INO(sbi); + + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -658,12 +620,16 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) for (i = 0; i < found; i++) { struct extent_tree *et = treevec[i]; - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); + if (!atomic_read(&et->node_cnt)) + continue; + + if (write_trylock(&et->lock)) { + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + } if (node_cnt + tree_cnt >= nr_shrink) - break; + goto unlock_out; } } unlock_out: @@ -680,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); @@ -699,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -709,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -765,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1a90ffd7cad..ff79054c6cf6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,7 +19,9 @@ #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> +#include <linux/vmalloc.h> #include <linux/bio.h> +#include <linux/blkdev.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -52,6 +54,8 @@ #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -122,6 +126,8 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 120 /* 2 mins */ struct cp_control { int reason; @@ -155,13 +161,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -230,6 +230,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -246,15 +248,22 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + /* * For INODE and NODE manager */ @@ -352,9 +361,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -429,8 +438,8 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct list_head dirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -492,12 +501,20 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len > et->largest.len) + et->largest = en->ei; +} + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -531,6 +548,7 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ block_t data_blkaddr; /* block address of the node block */ }; @@ -634,6 +652,7 @@ struct f2fs_sm_info { enum count_type { F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -682,6 +701,12 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -698,11 +723,17 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ /* for node-related operations */ @@ -724,22 +755,26 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -757,6 +792,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ @@ -787,15 +823,15 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic_t total_hit_ext; /* # of lookup extent cache */ - atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic_t read_hit_largest; /* # of hit largest extent node */ - atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic64_t total_hit_ext; /* # of lookup extent cache */ + atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic64_t read_hit_largest; /* # of hit largest extent node */ + atomic64_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -810,6 +846,31 @@ struct f2fs_sb_info { unsigned int shrinker_run_no; }; +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ @@ -1045,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1061,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) return; atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1078,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; return ((get_pages(sbi, block_type) + pages_per_sec - 1) >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; } @@ -1220,6 +1279,24 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + if (!for_write) + return grab_cache_page(mapping, index); + return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); +} + +static inline void f2fs_copy_page(struct page *src, struct page *dst) +{ + char *src_kaddr = kmap(src); + char *dst_kaddr = kmap(dst); + + memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); + kunmap(dst); + kunmap(src); +} + static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -1384,6 +1461,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1570,13 +1649,31 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; } #define get_inode_mode(i) \ @@ -1609,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1715,12 +1812,13 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *, struct page *); int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); @@ -1739,6 +1837,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1754,13 +1853,14 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1770,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1787,11 +1887,12 @@ void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1802,7 +1903,7 @@ int f2fs_release_page(struct page *, gfp_t); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); -int f2fs_gc(struct f2fs_sb_info *); +int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* @@ -1820,9 +1921,11 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + unsigned long long hit_largest, hit_cached, hit_rbtree; + unsigned long long hit_total, total_ext; + int ext_tree, zombie_tree, ext_node; + int ndirty_node, ndirty_meta; + int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, inmem_pages, wb_pages; @@ -1832,7 +1935,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1844,7 +1947,7 @@ struct f2fs_stat_info { unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; - unsigned base_mem, cache_mem, page_mem; + unsigned long long base_mem, cache_mem, page_mem; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) @@ -1853,14 +1956,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) -#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) +#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -1931,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -1959,7 +2064,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -1998,6 +2103,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); +int f2fs_inline_data_fiemap(struct inode *, + struct fiemap_extent_info *, __u64, __u64); /* * shrinker.c @@ -2011,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2063,7 +2169,7 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8120f8685141..18ddb1e5182a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + i_size_read(inode)) { unsigned offset; offset = i_size_read(inode) & ~PAGE_CACHE_MASK; zero_user_segment(page, offset, PAGE_CACHE_SIZE); @@ -86,10 +87,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + /* if gced page is attached, don't write to cold segment */ clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -195,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); @@ -227,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. @@ -255,8 +259,10 @@ sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto out; + } if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); @@ -269,12 +275,13 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); + remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); + remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); @@ -343,7 +350,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { @@ -412,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -477,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) F2FS_I(dn->inode)) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -504,14 +510,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); return 0; } - page = get_lock_data_page(inode, index); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) return 0; truncate_out: @@ -598,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -673,13 +679,21 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode, true); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -720,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -738,23 +752,31 @@ static int fill_zero(struct inode *inode, pgoff_t index, int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { - pgoff_t index; int err; - for (index = pg_start; index < pg_end; index++) { + while (pg_start < pg_end) { struct dnode_of_data dn; + pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { - if (err == -ENOENT) + if (err == -ENOENT) { + pg_start++; continue; + } return err; } - if (dn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&dn, 1); + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); + + pg_start += count; } return 0; } @@ -763,16 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; - - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -803,10 +820,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = pg_start << PAGE_CACHE_SHIFT; - blk_end = pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -819,86 +836,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int __exchange_data_block(struct inode *inode, pgoff_t src, + pgoff_t dst, bool full) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; - - for (; end < nrpages; start++, end++) { - block_t new_addr, old_addr; - - f2fs_lock_op(sbi); + block_t new_addr; + bool do_replace = false; + int ret; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + if (!is_checkpointed_data(sbi, new_addr)) { + dn.data_blkaddr = NULL_ADDR; + /* do not invalidate this block address */ + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + do_replace = true; } + f2fs_put_dnode(&dn); + } - if (new_addr == NULL_ADDR) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - f2fs_unlock_op(sbi); - continue; - } + if (new_addr == NULL_ADDR) + return full ? truncate_hole(inode, dst, dst + 1) : 0; - if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - continue; - } else { - truncate_data_blocks_range(&dn, 1); - } + if (do_replace) { + struct page *ipage = get_node_page(sbi, inode->i_ino); + struct node_info ni; - f2fs_put_dnode(&dn); - } else { - struct page *ipage; + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto err_out; + } - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, dst); + if (ret) + goto err_out; - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, start); - if (ret) - goto out; + truncate_data_blocks_range(&dn, 1); - old_addr = dn.data_blkaddr; - if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - invalidate_blocks(sbi, old_addr); + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true); + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(inode, src, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(inode, NULL, dst, false); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - } else if (new_addr != NEW_ADDR) { - struct node_info ni; + return truncate_hole(inode, src, src + 1); + } + return 0; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } +err_out: + if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + f2fs_put_dnode(&dn); + } + return ret; +} - f2fs_put_dnode(&dn); - } +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + int ret = 0; + + for (; end < nrpages; start++, end++) { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); + if (ret) + break; } - return 0; -out: - f2fs_unlock_op(sbi); return ret; } @@ -908,9 +939,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (offset + len >= i_size_read(inode)) return -EINVAL; @@ -918,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -940,7 +964,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + new_size = i_size_read(inode) - len; + truncate_pagecache(inode, new_size); ret = truncate_blocks(inode, new_size, true); if (!ret) @@ -959,20 +988,13 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, loff_t off_start, off_end; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1003,7 +1025,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, return ret; new_size = max_t(loff_t, new_size, - pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_CACHE_SHIFT); } for (index = pg_start; index < pg_end; index++) { @@ -1039,7 +1061,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); new_size = max_t(loff_t, new_size, - (index + 1) << PAGE_CACHE_SHIFT); + (loff_t)(index + 1) << PAGE_CACHE_SHIFT); } if (off_end) { @@ -1066,10 +1088,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t pg_start, pg_end, delta, nrpages, idx; loff_t new_size; - int ret; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; + int ret = 0; new_size = i_size_read(inode) + len; if (new_size > inode->i_sb->s_maxbytes) @@ -1082,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1107,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { - struct dnode_of_data dn; - struct page *ipage; - block_t new_addr, old_addr; - f2fs_lock_op(sbi); - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - goto next; - } else if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - goto next; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); - } - - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, idx + delta); - if (ret) - goto out; - - old_addr = dn.data_blkaddr; - f2fs_bug_on(sbi, old_addr != NEW_ADDR); - - if (new_addr != NEW_ADDR) { - struct node_info ni; - - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } - f2fs_put_dnode(&dn); -next: + ret = __exchange_data_block(inode, idx, idx + delta, false); f2fs_unlock_op(sbi); + if (ret) + break; } - i_size_write(inode, new_size); - return 0; -out: - f2fs_unlock_op(sbi); + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + if (!ret) + i_size_write(inode, new_size); return ret; } @@ -1170,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; - f2fs_balance_fs(sbi); - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -1204,9 +1181,10 @@ noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) - new_size = (index + 1) << PAGE_CACHE_SHIFT; + new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; else if (index == pg_end) - new_size = (index << PAGE_CACHE_SHIFT) + off_end; + new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + + off_end; else new_size += PAGE_CACHE_SIZE; } @@ -1228,6 +1206,10 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (f2fs_encrypted_inode(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1257,6 +1239,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: @@ -1364,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) return 0; @@ -1374,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; } @@ -1395,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode, false); - if (ret) + if (ret) { + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); goto err_out; + } } ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); @@ -1421,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1437,8 +1423,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (!f2fs_is_first_block_written(inode)) return truncate_partial_data_page(inode, 0, true); - punch_hole(inode, 0, F2FS_BLKSIZE); - return 0; + return punch_hole(inode, 0, F2FS_BLKSIZE); } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1453,17 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); commit_inmem_pages(inode, true); } - - if (f2fs_is_volatile_file(inode)) + if (f2fs_is_volatile_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + } mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1496,9 +1481,14 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi); break; + case F2FS_GOING_DOWN_METAFLUSH: + sync_meta_pages(sbi, META, LONG_MAX); + f2fs_stop_checkpoint(sbi); + break; default: return -EINVAL; } + f2fs_update_time(sbi, REQ_TIME); return 0; } @@ -1529,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1552,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) sizeof(policy))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return f2fs_process_policy(&policy, inode); #else return -EOPNOTSUPP; @@ -1598,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1616,29 +1608,228 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - __u32 i, count; + __u32 sync; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(count, (__u32 __user *)arg)) + if (get_user(sync, (__u32 __user *)arg)) return -EFAULT; - if (!count || count > F2FS_BATCH_GC_MAX_NUM) - return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; - for (i = 0; i < count; i++) { + if (!sync) { if (!mutex_trylock(&sbi->gc_mutex)) - break; + return -EBUSY; + } else { + mutex_lock(&sbi->gc_mutex); + } + + return f2fs_gc(sbi, sync); +} + +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + return f2fs_sync_fs(sbi->sb, 1); +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_CACHE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT; + + f2fs_balance_fs(sbi, true); + + mutex_lock(&inode->i_mutex); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } - if (f2fs_gc(sbi)) + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; } - if (put_user(i, (__u32 __user *)arg)) - return -EFAULT; + if (!fragmented) + goto out; - return 0; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); +out: + mutex_unlock(&inode->i_mutex); + if (!err) + range->len = (u64)total << PAGE_CACHE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1672,6 +1863,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); default: return -ENOTTY; } @@ -1699,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 782b8e72c094..f610c2a9bdde 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> -#include <linux/blkdev.h> #include "f2fs.h" #include "node.h" @@ -78,9 +77,12 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); @@ -170,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -257,6 +259,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; unsigned int secno, max_cost; + unsigned int last_segment = MAIN_SEGS(sbi); int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -267,6 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); + if (p.max_search == 0) + goto out; + if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -277,9 +283,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); - if (segno >= MAIN_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + if (segno >= last_segment) { if (sbi->last_victim[p.gc_mode]) { + last_segment = sbi->last_victim[p.gc_mode]; sbi->last_victim[p.gc_mode] = 0; p.offset = 0; continue; @@ -327,6 +334,7 @@ got_it: sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); } +out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; @@ -541,7 +549,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) int err; /* do not read out */ - page = grab_cache_page(inode->i_mapping, bidx); + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) return; @@ -550,8 +558,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (err) goto out; - if (unlikely(dn.data_blkaddr == NULL_ADDR)) + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + ClearPageUptodate(page); goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); @@ -580,7 +596,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) goto put_page_out; set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, META); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); @@ -611,7 +627,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) { struct page *page; - page = get_lock_data_page(inode, bidx); + page = get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -705,7 +721,7 @@ next_step: start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA); + start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { iput(inode); continue; @@ -797,13 +813,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return nfree; } -int f2fs_gc(struct f2fs_sb_info *sbi) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno = NULL_SEGNO; - unsigned int i; - int gc_type = BG_GC; - int nfree = 0; - int ret = -1; + unsigned int segno, i; + int gc_type = sync ? FG_GC : BG_GC; + int sec_freed = 0; + int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), @@ -812,12 +827,16 @@ int f2fs_gc(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); gc_more: + segno = NULL_SEGNO; + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) write_checkpoint(sbi, &cpc); @@ -830,23 +849,38 @@ gc_more: /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA); + META_SSA, true); - for (i = 0; i < sbi->segs_per_sec; i++) - nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * for FG_GC case, halt gcing left segments once failed one + * of segments in selected section to avoid long latency. + */ + if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && + gc_type == FG_GC) + break; + } + + if (i == sbi->segs_per_sec && gc_type == FG_GC) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, nfree)) - goto gc_more; + if (!sync) { + if (has_not_enough_free_secs(sbi, sec_freed)) + goto gc_more; - if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); + } stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); + + if (sync) + ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c5a055b3376e..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,12 +19,6 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ -/* - * with this macro, we can control the max time we do garbage collection, - * when user triggers batch mode gc by ioctl. - */ -#define F2FS_BATCH_GC_MAX_NUM 16 - /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ @@ -106,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3d143be42895..c3f0b7d4cfca 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -12,12 +12,10 @@ #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -176,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; + if (!f2fs_has_inline_data(inode)) + return 0; + page = grab_cache_page(inode->i_mapping, 0); if (!page) return -ENOMEM; @@ -198,6 +199,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -274,12 +278,14 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(ipage, 0); + if (!truncate_inline_inode(ipage, 0)) + return false; f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - truncate_blocks(inode, 0, false); + if (truncate_blocks(inode, 0, false)) + return false; goto process_inline; } return false; @@ -568,3 +574,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, f2fs_put_page(ipage, 1); return 0; } + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct page *ipage; + int err = 0; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if (!f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); +out: + f2fs_put_page(ipage, 1); + return err; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 35aae65b3e5d..2adeff26be11 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); get_inline_info(fi, ri); @@ -202,6 +203,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,7 +223,7 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -259,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -278,10 +281,11 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi); } - return; + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -296,16 +300,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; /* - * We need to lock here to prevent from producing dirty node pages + * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - f2fs_lock_op(sbi); - update_inode_page(inode); - f2fs_unlock_op(sbi); - - if (wbc) - f2fs_balance_fs(sbi); - + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -331,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -361,9 +360,9 @@ no_delete: if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { if (err && err != -ENOENT) alloc_nid_done(sbi, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a680bf38e4f0..6f944e5eb76e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); @@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !f2fs_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); @@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { @@ -315,12 +325,15 @@ fail: return err; } -static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - const char *link = page_follow_link_light(dentry, cookie); + const char *link = page_get_link(dentry, inode, done); if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ - page_put_link(NULL, *cookie); + do_delayed_call(done); + clear_delayed_call(done); link = ERR_PTR(-ENOENT); } return link; @@ -341,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -351,8 +362,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -410,11 +424,14 @@ err_out: * If the symlink path is stored into inline_data, there is no * performance regression. */ - if (!err) + if (!err) { filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); - if (IS_DIRSYNC(dir)) - f2fs_sync_fs(sbi->sb, 1); + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } kfree(sd); f2fs_fname_crypto_free_buffer(&disk_link); @@ -430,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -441,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + f2fs_balance_fs(sbi, true); + set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -478,11 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - if (!new_valid_dev(rdev)) - return -EINVAL; - - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -604,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -635,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_whiteout; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -666,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -763,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -807,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -923,18 +938,22 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, } #ifdef CONFIG_F2FS_FS_ENCRYPTION -static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; + struct f2fs_str cstr = FSTR_INIT(NULL, 0); struct f2fs_str pstr = FSTR_INIT(NULL, 0); - struct inode *inode = d_inode(dentry); struct f2fs_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; + if (!dentry) + return ERR_PTR(-ECHILD); + res = f2fs_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -942,16 +961,27 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; - cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { + res = -ENOENT; + goto errout; + } + cstr.name = kmalloc(cstr.len, GFP_NOFS); + if (!cstr.name) { + res = -ENOMEM; + goto errout; + } + memcpy(cstr.name, sd->encrypted_path, cstr.len); + + /* this is broken symlink case */ + if (unlikely(cstr.name[0] == 0)) { res = -ENOENT; goto errout; } @@ -970,31 +1000,34 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook if (res < 0) goto errout; + kfree(cstr.name); + paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); page_cache_release(cpage); - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: + kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); page_cache_release(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, +#endif }; #endif @@ -1023,8 +1056,7 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_follow_link, - .put_link = page_put_link, + .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27d1a74dd6f3..342597a5897f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, { struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + down_write(&nm_i->nat_tree_lock); + /* Check current segment summary */ mutex_lock(&curseg->curseg_mutex); i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); @@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) cache: /* cache nat entry */ cache_nat_entry(NM_I(sbi), nid, &ne); + up_write(&nm_i->nat_tree_lock); } /* @@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn, fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); - set_page_dirty(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; @@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; + if (!nid) + return; + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); @@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +/* + * readahead MAX_RA_NODE number of node pages. + */ +void ra_node_pages(struct page *parent, int start) { - struct page *page; - int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } + blk_start_plug(&plug); - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; + /* Then, try readahead for siblings of the desired node */ + end = start + MAX_RA_NODE; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); } - return page; + + blk_finish_plug(&plug); } -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) +struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; struct page *page; - int err, i, end; - nid_t nid; + int err; - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) @@ -1108,46 +1105,53 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1); lock_page(page); + + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + f2fs_bug_on(sbi, nid != nid_of_node(page)); return page; } +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + void sync_inode_page(struct dnode_of_data *dn) { + int ret = 0; + if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); + ret = update_inode(dn->inode, dn->node_page); } else if (dn->inode_page) { if (!dn->inode_page_locked) lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); + ret = update_inode(dn->inode, dn->inode_page); if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - update_inode_page(dn->inode); + ret = update_inode_page(dn->inode); } + dn->node_changed = ret ? true: false; } int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, @@ -1175,6 +1179,11 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + return -EIO; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1323,23 +1332,24 @@ static int f2fs_write_node_page(struct page *page, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); unlock_page(page); return 0; } - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - set_page_writeback(page); fio.blk_addr = ni.blk_addr; write_node_page(nid, &fio); @@ -1348,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page, up_read(&sbi->node_write); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1439,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; - up_read(&nm_i->nat_tree_lock); if (allocated) return 0; } @@ -1528,7 +1535,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); + + down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1558,6 +1568,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + up_read(&nm_i->nat_tree_lock); + + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + nm_i->ra_nid_pages, META_NAT, false); } /* @@ -1577,8 +1591,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1589,13 +1601,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1803,10 +1808,10 @@ int restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, bio_blocks); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR); + ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_meta_page(sbi, idx); + struct page *page = get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; @@ -1837,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) raw_ne = nat_in_journal(sum, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); @@ -1878,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1915,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } @@ -1932,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1954,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them @@ -1962,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1971,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2000,6 +1999,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7427e956ad81..d4d1f636fe1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -14,9 +14,11 @@ /* node block offset on the NAT area dedicated to the given start node id */ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) -/* # of pages to perform readahead before building free nids */ +/* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 4 +#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ + /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 @@ -181,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -315,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); @@ -325,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index faec2ca004b9..589b20b8677b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -180,7 +206,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR); + ra_meta_pages(sbi, blkaddr, 1, META_POR, true); while (1) { struct fsync_inode_entry *entry; @@ -188,7 +214,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -383,15 +412,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); + if (err) goto out; - } f2fs_wait_on_page_writeback(dn.node_page, NODE); @@ -456,7 +481,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); err: f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); out: f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, recovered = %d blocks, err = %d", @@ -464,8 +488,7 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -474,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi, block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -485,7 +508,7 @@ static int recover_data(struct f2fs_sb_info *sbi, ra_meta_pages_cond(sbi, blkaddr); - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) { f2fs_put_page(page, 1); @@ -561,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: @@ -570,7 +593,7 @@ out: /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); @@ -600,7 +623,7 @@ out: .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78e6d0696847..5904a411c86f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,8 +14,8 @@ #include <linux/blkdev.h> #include <linux/prefetch.h> #include <linux/kthread.h> -#include <linux/vmalloc.h> #include <linux/swap.h> +#include <linux/timer.h> #include "f2fs.h" #include "segment.h" @@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. @@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word) int num = 0; #if BITS_PER_LONG == 64 - if ((word & 0xffffffff) == 0) { + if ((word & 0xffffffff00000000UL) == 0) num += 32; + else word >>= 32; - } #endif - if ((word & 0xffff) == 0) { + if ((word & 0xffff0000) == 0) num += 16; + else word >>= 16; - } - if ((word & 0xff) == 0) { + + if ((word & 0xff00) == 0) num += 8; + else word >>= 8; - } + if ((word & 0xf0) == 0) num += 4; else word >>= 4; + if ((word & 0xc) == 0) num += 2; else word >>= 2; + if ((word & 0x2) == 0) num += 1; return num; @@ -67,130 +86,83 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. * Example: - * LSB <--> MSB - * f2fs_set_bit(0, bitmap) => 0000 0001 - * f2fs_set_bit(7, bitmap) => 1000 0000 + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (!f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~0UL << quot; - submask = (unsigned char)(0xff << rest) >> rest; - submask <<= quot; - mask &= submask; - tmp &= mask; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; -aligned: - while (size & ~(BITS_PER_LONG-1)) { - tmp = *(p++); + + while (1) { + if (*p == 0) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; + p++; } - if (!size) - return result; - tmp = *p; -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); -#endif + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~(~0UL << quot); - submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); - submask <<= quot; - mask += submask; - tmp |= mask; - if (size < BITS_PER_LONG) - goto found_first; - if (~tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { - tmp = *(p++); - if (~tmp) - goto found_middle; - result += BITS_PER_LONG; + + while (1) { + if (*p == ~0UL) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; + if (tmp != ~0UL) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; + p++; } - if (!size) - return result; - tmp = *p; - -found_first: - tmp |= ~0UL << size; - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffz(tmp); -#endif + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -241,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) * inode becomes free by iget_locked in f2fs_iget. */ if (!abort) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); } @@ -257,13 +229,15 @@ int commit_inmem_pages(struct inode *inode, bool abort) trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; err = do_write_data_page(&fio); - submit_bio = true; if (err) { unlock_page(cur->page); break; } + clear_cold_data(cur->page); + submit_bio = true; } } else { + ClearPageUptodate(cur->page); trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); } set_page_private(cur->page, 0); @@ -288,15 +262,17 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (!need) + return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi); + f2fs_gc(sbi, false); } } @@ -316,8 +292,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || - !available_free_memory(sbi, INO_ENTRIES)) + !available_free_memory(sbi, INO_ENTRIES) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) + sync_dirty_inodes(sbi, FILE_INODE); f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -767,6 +748,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) mutex_unlock(&sit_i->sentry_lock); } +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return true; + + mutex_lock(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + mutex_unlock(&sit_i->sentry_lock); + + return is_cp; +} + /* * This function should be resided under the curseg_mutex lock */ @@ -1116,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1146,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1292,6 +1298,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .encrypted_page = NULL, }; + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.rw &= ~REQ_META; + set_page_writeback(page); f2fs_submit_page_mbio(&fio); } @@ -1369,7 +1378,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + if (!recover_curseg) + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { @@ -1449,6 +1465,23 @@ void f2fs_wait_on_page_writeback(struct page *page, } } +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + struct page *cpage; + + if (blkaddr == NEW_ADDR) + return; + + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_put_page(cpage, 1); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1586,7 +1619,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (npages >= 2) ra_meta_pages(sbi, start_sum_block(sbi), npages, - META_CP); + META_CP, true); /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) @@ -1596,7 +1629,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (__exist_node_summaries(sbi)) ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP); + NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -1704,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, if (le32_to_cpu(nid_in_journal(sum, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) return update_nats_in_cursum(sum, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(sum); i++) if (le32_to_cpu(segno_in_journal(sum, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) return update_sits_in_cursum(sum, 1); } return -1; @@ -1955,12 +1988,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -1982,8 +2016,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * - sizeof(struct sec_entry)); + sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -2028,12 +2062,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2082,7 +2116,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(sbi); do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2174,7 +2208,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2196,7 +2230,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2301,7 +2335,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); - kfree(dirty_i->dirty_segmap[dirty_type]); + kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } @@ -2309,7 +2343,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - kfree(dirty_i->victim_secmap); + kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -2348,8 +2382,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) if (!free_i) return; SM_I(sbi)->free_info = NULL; - kfree(free_i->free_segmap); - kfree(free_i->free_secmap); + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); kfree(free_i); } @@ -2370,9 +2404,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) } kfree(sit_i->tmp_map); - vfree(sit_i->sentries); - vfree(sit_i->sec_entries); - kfree(sit_i->dirty_sentries_bitmap); + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b6e4ed15c698..ee44d346ea44 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -137,10 +137,12 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. + * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, - FG_GC + FG_GC, + FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..93606f281bf9 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f79478115d37..6134832baaaf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,6 +67,7 @@ enum { Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, Opt_err, }; @@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = { {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, {Opt_err, NULL}, }; @@ -213,8 +215,11 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -231,6 +236,9 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), NULL, }; @@ -292,11 +300,16 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) { set_opt(sbi, BG_GC); - else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { clear_opt(sbi, BG_GC); - else { + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + set_opt(sbi, BG_GC); + set_opt(sbi, FORCE_FG_GC); + } else { kfree(name); return -EINVAL; } @@ -397,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -423,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -539,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_dirty_inode(sbi); + release_ino_entry(sbi); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); @@ -557,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + kfree(sbi->raw_super); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -573,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -631,10 +647,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) - seq_printf(seq, ",background_gc=%s", "on"); - else + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { + if (test_opt(sbi, FORCE_FG_GC)) + seq_printf(seq, ",background_gc=%s", "sync"); + else + seq_printf(seq, ",background_gc=%s", "on"); + } else { seq_printf(seq, ",background_gc=%s", "off"); + } if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) @@ -673,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -742,6 +764,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; + bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); sync_filesystem(sb); @@ -767,6 +790,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + /* disallow enable/disable extent_cache dynamically */ + if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "switch extent_cache option is not allowed"); + goto restore_opts; + } + /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount @@ -876,7 +907,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -892,10 +923,82 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } +static inline bool sanity_check_area_boundary(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != + segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + main_blkaddr, + segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } + + return false; +} + static int sanity_check_raw_super(struct super_block *sb, struct f2fs_super_block *raw_super) { @@ -925,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -943,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sb, raw_super)) + return 1; + return 0; } @@ -996,6 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1009,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { int block = 0; - struct buffer_head *buffer; - struct f2fs_super_block *super; + struct buffer_head *bh; + struct f2fs_super_block *super, *buf; int err = 0; + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; retry: - buffer = sb_bread(sb, block); - if (!buffer) { + bh = sb_bread(sb, block); + if (!bh) { *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EIO; - goto out; - } + err = -EIO; + goto next; } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); + buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); + if (sanity_check_raw_super(sb, buf)) { + brelse(bh); *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EINVAL; - goto out; - } + err = -EINVAL; + goto next; } if (!*raw_super) { - *raw_super_buf = buffer; + memcpy(super, buf, sizeof(*super)); + *valid_super_block = block; *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); } + brelse(bh); +next: /* check the validity of the second superblock */ if (block == 0) { block++; goto retry; } -out: /* No valid superblock */ - if (!*raw_super) + if (!*raw_super) { + kfree(super); return err; + } return 0; } +static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) +{ + struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi); + struct buffer_head *bh; + int err; + + bh = sb_getblk(sbi->sb, block); + if (!bh) + return -EIO; + + lock_buffer(bh); + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + brelse(bh); + + return err; +} + int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; int err; /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); - - sbh->b_blocknr = block; + err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - return err; + return __f2fs_commit_super(sbi, sbi->valid_super_block); } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; long err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1127,7 +1260,8 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sb, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; @@ -1144,7 +1278,9 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -1160,7 +1296,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->sb = sb; sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); @@ -1213,8 +1349,10 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1332,10 +1470,14 @@ try_onemore: f2fs_commit_super(sbi, true); } + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -1362,7 +1504,7 @@ free_meta_inode: free_options: kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: kfree(sbi); @@ -1399,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1453,10 +1596,14 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_crypto: diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4de2286c0e4d..10f1e784fa23 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -25,49 +25,37 @@ #include "f2fs.h" #include "xattr.h" -static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t len, int type) +static int f2fs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, void *buffer, + size_t size) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - int total_len, prefix_len = 0; - const char *prefix = NULL; - switch (type) { + switch (handler->flags) { case F2FS_XATTR_INDEX_USER: if (!test_opt(sbi, XATTR_USER)) return -EOPNOTSUPP; - prefix = XATTR_USER_PREFIX; - prefix_len = XATTR_USER_PREFIX_LEN; break; case F2FS_XATTR_INDEX_TRUSTED: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - prefix = XATTR_TRUSTED_PREFIX; - prefix_len = XATTR_TRUSTED_PREFIX_LEN; break; case F2FS_XATTR_INDEX_SECURITY: - prefix = XATTR_SECURITY_PREFIX; - prefix_len = XATTR_SECURITY_PREFIX_LEN; break; default: return -EINVAL; } - - total_len = prefix_len + len + 1; - if (list && total_len <= list_size) { - memcpy(list, prefix, prefix_len); - memcpy(list + prefix_len, name, len); - list[prefix_len + len] = '\0'; - } - return total_len; + return f2fs_getxattr(d_inode(dentry), handler->flags, name, + buffer, size, NULL); } -static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - switch (type) { + switch (handler->flags) { case F2FS_XATTR_INDEX_USER: if (!test_opt(sbi, XATTR_USER)) return -EOPNOTSUPP; @@ -81,72 +69,39 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; - return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL); + return f2fs_setxattr(d_inode(dentry), handler->flags, name, + value, size, NULL, flags); } -static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static bool f2fs_xattr_user_list(struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - switch (type) { - case F2FS_XATTR_INDEX_USER: - if (!test_opt(sbi, XATTR_USER)) - return -EOPNOTSUPP; - break; - case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case F2FS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } - if (strcmp(name, "") == 0) - return -EINVAL; - - return f2fs_setxattr(d_inode(dentry), type, name, - value, size, NULL, flags); + return test_opt(sbi, XATTR_USER); } -static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t len, int type) +static bool f2fs_xattr_trusted_list(struct dentry *dentry) { - const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; - size_t size; - - if (type != F2FS_XATTR_INDEX_ADVISE) - return 0; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return capable(CAP_SYS_ADMIN); } -static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int f2fs_xattr_advise_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, void *buffer, + size_t size) { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - if (buffer) *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); } -static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) @@ -185,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_user_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -193,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = { const struct xattr_handler f2fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = F2FS_XATTR_INDEX_TRUSTED, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_trusted_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; const struct xattr_handler f2fs_xattr_advise_handler = { - .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .list = f2fs_xattr_advise_list, .get = f2fs_xattr_advise_get, .set = f2fs_xattr_advise_set, }; @@ -209,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = { const struct xattr_handler f2fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = F2FS_XATTR_INDEX_SECURITY, - .list = f2fs_xattr_generic_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -457,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; size_t size; - if (!handler) + if (!handler || (handler->list && !handler->list(dentry))) continue; - size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len, handler->flags); - if (buffer && size > rest) { - error = -ERANGE; - goto cleanup; + prefix = handler->prefix ?: handler->name; + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } - - if (buffer) - buffer += size; rest -= size; } error = buffer_size - rest; @@ -611,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -620,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..79dccc8252dd 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -27,7 +27,7 @@ #define F2FS_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ -#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" #define F2FS_XATTR_INDEX_USER 1 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4afc4d9d2e41..8b2127ffb226 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -610,9 +610,9 @@ parse_record: int status = fat_parse_long(inode, &cpos, &bh, &de, &unicode, &nr_slots); if (status < 0) { - ctx->pos = cpos; + bh = NULL; ret = status; - goto out; + goto end_of_dir; } else if (status == PARSE_INVALID) goto record_end; else if (status == PARSE_NOT_LONGNAME) @@ -654,8 +654,9 @@ parse_record: fill_len = short_len; start_filldir: - if (!fake_offset) - ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + if (fake_offset && ctx->pos < 2) + ctx->pos = 2; if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { if (!dir_emit_dot(file, ctx)) @@ -681,14 +682,19 @@ record_end: fake_offset = 0; ctx->pos = cpos; goto get_new; + end_of_dir: - ctx->pos = cpos; + if (fake_offset && cpos < 2) + ctx->pos = 2; + else + ctx->pos = cpos; fill_failed: brelse(bh); if (unicode) __putname(unicode); out: mutex_unlock(&sbi->s_lock); + return ret; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..6aece96df19f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/fcntl.c b/fs/fcntl.c index ee85cd4e136a..350a2c8cfd28 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -51,7 +51,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg) if (arg & O_NDELAY) arg |= O_NONBLOCK; - if (arg & O_DIRECT) { + /* Pipe packetized mode is controlled by O_DIRECT flag */ + if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; diff --git a/fs/file.c b/fs/file.c index 6c672ad329e9..1fbc5c0555a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,9 +25,9 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -/* our max() is unusable in constant expressions ;-/ */ -#define __const_max(x, y) ((x) < (y) ? (x) : (y)) -int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & +/* our min() is unusable in constant expressions ;-/ */ +#define __const_min(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void *alloc_fdmem(size_t size) @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -56,9 +57,35 @@ static void free_fdtable_rcu(struct rcu_head *rcu) __free_fdtable(container_of(rcu, struct fdtable, rcu)); } +#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) +#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) + +/* + * Copy 'count' fd bits from the old table to the new table and clear the extra + * space if any. This does not copy the file pointers. Called with the files + * spinlock held for write. + */ +static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int count) +{ + unsigned int cpy, set; + + cpy = count / BITS_PER_BYTE; + set = (nfdt->max_fds - count) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)nfdt->open_fds + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)nfdt->close_on_exec + cpy, 0, set); + + cpy = BITBIT_SIZE(count); + set = BITBIT_SIZE(nfdt->max_fds) - cpy; + memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); + memset((char *)nfdt->full_fds_bits + cpy, 0, set); +} + /* - * Expand the fdset in the files_struct. Called with the files spinlock - * held for write. + * Copy all file descriptors from the old table to the new, expanded table and + * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { @@ -69,14 +96,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); - memset((char *)(nfdt->fd) + cpy, 0, set); + memset((char *)nfdt->fd + cpy, 0, set); - cpy = ofdt->max_fds / BITS_PER_BYTE; - set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)(nfdt->open_fds) + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)(nfdt->close_on_exec) + cpy, 0, set); + copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } static struct fdtable * alloc_fdtable(unsigned int nr) @@ -105,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; @@ -115,12 +137,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->fd = data; data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; + data += nr / BITS_PER_BYTE; + fdt->full_fds_bits = data; return fdt; @@ -226,17 +250,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt) static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { - __clear_bit(fd, fdt->close_on_exec); + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); } -static inline void __set_open_fd(int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); + fd /= BITS_PER_LONG; + if (!~fdt->open_fds[fd]) + __set_bit(fd, fdt->full_fds_bits); } -static inline void __clear_open_fd(int fd, struct fdtable *fdt) +static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); + __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static int count_open_files(struct fdtable *fdt) @@ -262,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i; + int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -280,6 +309,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; + new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); @@ -318,12 +348,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) open_files = count_open_files(old_fdt); } + copy_fd_bitmaps(new_fdt, old_fdt, open_files); + old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); - memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); - for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { @@ -341,19 +370,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) } spin_unlock(&oldf->file_lock); - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds - open_files) / 8; - int start = open_files / BITS_PER_LONG; - - memset(&new_fdt->open_fds[start], 0, left); - memset(&new_fdt->close_on_exec[start], 0, left); - } + /* clear the remainder */ + memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); @@ -454,10 +472,25 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, + .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; +static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +{ + unsigned long maxfd = fdt->max_fds; + unsigned long maxbit = maxfd / BITS_PER_LONG; + unsigned long bitbit = start / BITS_PER_LONG; + + bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; + if (bitbit > maxfd) + return maxfd; + if (bitbit > start) + start = bitbit; + return find_next_zero_bit(fdt->open_fds, maxfd, start); +} + /* * allocate a file descriptor, mark it busy. */ @@ -476,7 +509,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ef73ed674a27..3e2ccade61ed 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) } else if (S_ISLNK(ip->i_mode)) { if (!VXFS_ISIMMED(vip)) { ip->i_op = &page_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &vxfs_aops; } else { ip->i_op = &simple_symlink_inode_operations; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 29e4599f6fc1..6915c950e6e8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, if (!wbc->wb) return; - rcu_read_lock(); id = mem_cgroup_css_from_page(page)->id; - rcu_read_unlock(); if (id == wbc->wb_id) { wbc->wb_bytes += bytes; @@ -779,8 +777,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; - struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list, - struct bdi_writeback, bdi_node); + struct bdi_writeback *wb = list_entry(&bdi->wb_list, + struct bdi_writeback, bdi_node); might_sleep(); restart: @@ -1981,9 +1979,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) struct super_block *sb = inode->i_sb; int dirtytime; @@ -2093,6 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) out_unlock_inode: spin_unlock(&inode->i_lock); +#undef I_DIRTY_INODE } EXPORT_SYMBOL(__mark_inode_dirty); @@ -2149,7 +2148,12 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); old_inode = inode; - filemap_fdatawait(mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(mapping); cond_resched(); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d403c69bee08..4304072161aa 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 6d941f56faf4..9b28649df3a1 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list); int __fscache_register_netfs(struct fscache_netfs *netfs) { struct fscache_netfs *ptr; + struct fscache_cookie *cookie; int ret; _enter("{%s}", netfs->name); @@ -29,29 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) INIT_LIST_HEAD(&netfs->link); /* allocate a cookie for the primary index */ - netfs->primary_index = - kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); + cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); - if (!netfs->primary_index) { + if (!cookie) { _leave(" = -ENOMEM"); return -ENOMEM; } /* initialise the primary index cookie */ - atomic_set(&netfs->primary_index->usage, 1); - atomic_set(&netfs->primary_index->n_children, 0); - atomic_set(&netfs->primary_index->n_active, 1); + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + atomic_set(&cookie->n_active, 1); - netfs->primary_index->def = &fscache_fsdef_netfs_def; - netfs->primary_index->parent = &fscache_fsdef_index; - netfs->primary_index->netfs_data = netfs; - netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED; + cookie->def = &fscache_fsdef_netfs_def; + cookie->parent = &fscache_fsdef_index; + cookie->netfs_data = netfs; + cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; - atomic_inc(&netfs->primary_index->parent->usage); - atomic_inc(&netfs->primary_index->parent->n_children); - - spin_lock_init(&netfs->primary_index->lock); - INIT_HLIST_HEAD(&netfs->primary_index->backing_objects); + spin_lock_init(&cookie->lock); + INIT_HLIST_HEAD(&cookie->backing_objects); /* check the netfs type is not already present */ down_write(&fscache_addremove_sem); @@ -62,6 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) goto already_registered; } + atomic_inc(&cookie->parent->usage); + atomic_inc(&cookie->parent->n_children); + + netfs->primary_index = cookie; list_add(&netfs->link, &fscache_netfs_list); ret = 0; @@ -70,11 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) already_registered: up_write(&fscache_addremove_sem); - if (ret < 0) { - netfs->primary_index->parent = NULL; - __fscache_cookie_put(netfs->primary_index); - netfs->primary_index = NULL; - } + if (ret < 0) + kmem_cache_free(fscache_cookie_jar, cookie); _leave(" = %d", ret); return ret; diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 51dde817e1f2..6b028b7c4250 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = { static void fscache_objlist_config(struct fscache_objlist_data *data) { #ifdef CONFIG_KEYS - struct user_key_payload *confkey; + const struct user_key_payload *confkey; unsigned long config; struct key *key; const char *buf; @@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) config = 0; rcu_read_lock(); - confkey = key->payload.data; + confkey = user_key_payload(key); buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 483bbc613bf0..6b35fc4860a0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) /* * decide whether a page can be released, possibly by cancelling a store to it - * - we're allowed to sleep if __GFP_WAIT is flagged + * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged */ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, struct page *page, @@ -122,7 +122,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -132,7 +132,7 @@ page_busy: _debug("fscache writeout timeout page: %p{%lx}", page, page->index); - gfp &= ~__GFP_WAIT; + gfp &= ~__GFP_DIRECT_RECLAIM; goto try_again; } EXPORT_SYMBOL(__fscache_maybe_release_page); @@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if (page->index > op->store_limit) { + if (page->index >= op->store_limit) { fscache_stat(&fscache_n_store_pages_over_limit); goto superseded; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712d3b..712601f299b8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static const char *fuse_follow_link(struct dentry *dentry, void **cookie) +static const char *fuse_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; ssize_t ret; - link = (char *) __get_free_page(GFP_KERNEL); + if (!dentry) + return ERR_PTR(-ECHILD); + + link = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); @@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie) args.out.args[0].value = link; ret = fuse_simple_request(fc, &args); if (ret < 0) { - free_page((unsigned long) link); + kfree(link); link = ERR_PTR(ret); } else { link[ret] = '\0'; - *cookie = link; + set_delayed_call(done, kfree_link, link); } fuse_invalidate_atime(inode); return link; @@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, - .follow_link = fuse_follow_link, - .put_link = free_page_put_link, + .get_link = fuse_get_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 47f181191060..aa03aab6a24f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) int err; if (fc->no_flock) { - err = flock_lock_file_wait(file, fl); + err = locks_lock_file_wait(file, fl); } else { struct fuse_file *ff = file->private_data; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..4d69d5c0bedc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b061c05c..791932617d1a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type) { switch (type) { case ACL_TYPE_ACCESS: - return GFS2_POSIX_ACL_ACCESS; + return XATTR_POSIX_ACL_ACCESS; case ACL_TYPE_DEFAULT: - return GFS2_POSIX_ACL_DEFAULT; + return XATTR_POSIX_ACL_DEFAULT; } return NULL; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 2d65ec4cd4be..3af4f407a483 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -12,8 +12,6 @@ #include "incore.h" -#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" -#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1caee0534587..93f07465e5a6 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, failed: gfs2_trans_end(sdp); gfs2_inplace_release(ip); - if (ip->i_res->rs_qa_qd_num) + if (ip->i_qadata && ip->i_qadata->qa_qd_num) gfs2_quota_unlock(ip); if (inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 61296ecbd0e2..0860f0b5b3f1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out_rlist; - if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip->i_res); + if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ + gfs2_rs_deltree(&ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, @@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; - ret = get_write_access(inode); - if (ret) - return ret; - inode_dio_wait(inode); - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) goto out; } - gfs2_rs_deltree(ip->i_res); ret = do_shrink(inode, oldsize, newsize); out: - put_write_access(inode); + gfs2_rsqa_delete(ip, NULL); return ret; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 487527b42d94..6a92592304fb 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -82,6 +82,8 @@ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) +#define GFS2_HASH_INDEX_MASK 0xffffc000 +#define GFS2_USE_HASH_FLAG 0x2000 struct qstr gfs2_qdot __read_mostly; struct qstr gfs2_qdotdot __read_mostly; @@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); if (error) goto fail; } @@ -388,8 +390,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) */ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { - __be64 *hc = ip->i_hash_cache; + __be64 *hc; + + spin_lock(&ip->i_inode.i_lock); + hc = ip->i_hash_cache; ip->i_hash_cache = NULL; + spin_unlock(&ip->i_inode.i_lock); + kvfree(hc); } @@ -438,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent, return 0; } +/* Look for the dirent that contains the offset specified in data. Once we + * find that dirent, there must be space available there for the new dirent */ +static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent, + const struct qstr *name, + void *ptr) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (ptr < (void *)dent || ptr >= (void *)dent + totlen) + return 0; + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (ptr < (void *)dent + actual) + return -1; + if ((void *)dent + totlen >= ptr + required) + return 1; + return -1; +} + static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, const struct qstr *name, void *opaque) @@ -677,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, prev->de_rec_len = cpu_to_be16(prev_rec_len); } + +static struct gfs2_dirent *do_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh, + unsigned offset) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned totlen; + + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_meta(ip->i_gl, bh); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + + /* * Takes a dent from which to grab space as an argument. Returns the * newly created dent. @@ -686,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, const struct qstr *name, struct buffer_head *bh) { - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_dirent *ndent; - unsigned offset = 0, totlen; + unsigned offset = 0; if (!gfs2_dirent_sentinel(dent)) offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); - totlen = be16_to_cpu(dent->de_rec_len); - BUG_ON(offset + name->len > totlen); - gfs2_trans_add_meta(ip->i_gl, bh); - ndent = (struct gfs2_dirent *)((char *)dent + offset); - dent->de_rec_len = cpu_to_be16(offset); - gfs2_qstr2dirent(name, totlen - offset, ndent); - return ndent; + return do_init_dirent(inode, dent, name, bh, offset); } -static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, - struct buffer_head *bh, - const struct qstr *name) +static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name, + void *ptr) { struct gfs2_dirent *dent; dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, - gfs2_dirent_find_space, name, NULL); + gfs2_dirent_find_offset, name, ptr); if (!dent || IS_ERR(dent)) return dent; - return gfs2_init_dirent(inode, dent, name, bh); + return do_init_dirent(inode, dent, name, bh, + (unsigned)(ptr - (void *)dent)); } static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, @@ -718,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; @@ -1046,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) if (!gfs2_dirent_sentinel(dent) && be32_to_cpu(dent->de_hash) < divider) { struct qstr str; + void *ptr = ((char *)dent - obh->b_data) + nbh->b_data; str.name = (char*)(dent+1); str.len = be16_to_cpu(dent->de_name_len); str.hash = be32_to_cpu(dent->de_hash); - new = gfs2_dirent_alloc(inode, nbh, &str); + new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr); if (IS_ERR(new)) { error = PTR_ERR(new); break; @@ -1181,10 +1225,10 @@ static int compare_dents(const void *a, const void *b) int ret = 0; dent_a = *(const struct gfs2_dirent **)a; - hash_a = be32_to_cpu(dent_a->de_hash); + hash_a = dent_a->de_cookie; dent_b = *(const struct gfs2_dirent **)b; - hash_b = be32_to_cpu(dent_b->de_hash); + hash_b = dent_b->de_cookie; if (hash_a > hash_b) ret = 1; @@ -1222,19 +1266,20 @@ static int compare_dents(const void *a, const void *b) */ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, - const struct gfs2_dirent **darr, u32 entries, - int *copied) + struct gfs2_dirent **darr, u32 entries, + u32 sort_start, int *copied) { const struct gfs2_dirent *dent, *dent_next; u64 off, off_next; unsigned int x, y; int run = 0; - sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + if (sort_start < entries) + sort(&darr[sort_start], entries - sort_start, + sizeof(struct gfs2_dirent *), compare_dents, NULL); dent_next = darr[0]; - off_next = be32_to_cpu(dent_next->de_hash); - off_next = gfs2_disk_hash2offset(off_next); + off_next = dent_next->de_cookie; for (x = 0, y = 1; x < entries; x++, y++) { dent = dent_next; @@ -1242,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, if (y < entries) { dent_next = darr[y]; - off_next = be32_to_cpu(dent_next->de_hash); - off_next = gfs2_disk_hash2offset(off_next); + off_next = dent_next->de_cookie; if (off < ctx->pos) continue; @@ -1290,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size) return ptr; } + +static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh, + unsigned leaf_nr, struct gfs2_dirent **darr, + unsigned entries) +{ + int sort_id = -1; + int i; + + for (i = 0; i < entries; i++) { + unsigned offset; + + darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash); + darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie); + + if (!sdp->sd_args.ar_loccookie) + continue; + offset = (char *)(darr[i]) - + (bh->b_data + gfs2_dirent_offset(bh->b_data)); + offset /= GFS2_MIN_DIRENT_SIZE; + offset += leaf_nr * sdp->sd_max_dents_per_leaf; + if (offset >= GFS2_USE_HASH_FLAG || + leaf_nr >= GFS2_USE_HASH_FLAG) { + darr[i]->de_cookie |= GFS2_USE_HASH_FLAG; + if (sort_id < 0) + sort_id = i; + continue; + } + darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK; + darr[i]->de_cookie |= offset; + } + return sort_id; +} + + static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, int *copied, unsigned *depth, u64 leaf_no) @@ -1299,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, struct buffer_head *bh; struct gfs2_leaf *lf; unsigned entries = 0, entries2 = 0; - unsigned leaves = 0; - const struct gfs2_dirent **darr, *dent; + unsigned leaves = 0, leaf = 0, offset, sort_offset; + struct gfs2_dirent **darr, *dent; struct dirent_gather g; struct buffer_head **larr; - int leaf = 0; - int error, i; + int error, i, need_sort = 0, sort_id; u64 lfn = leaf_no; do { @@ -1320,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, brelse(bh); } while(lfn); + if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) { + need_sort = 1; + sort_offset = 0; + } + if (!entries) return 0; @@ -1333,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; - darr = (const struct gfs2_dirent **)(larr + leaves); - g.pdent = darr; + darr = (struct gfs2_dirent **)(larr + leaves); + g.pdent = (const struct gfs2_dirent **)darr; g.offset = 0; lfn = leaf_no; @@ -1345,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { + offset = g.offset; entries2 += be16_to_cpu(lf->lf_entries); dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, gfs2_dirent_gather, NULL, &g); @@ -1362,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, goto out_free; } error = 0; + sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset], + be16_to_cpu(lf->lf_entries)); + if (!need_sort && sort_id >= 0) { + need_sort = 1; + sort_offset = offset + sort_id; + } larr[leaf++] = bh; } else { + larr[leaf++] = NULL; brelse(bh); } } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, ctx, darr, entries, copied); + error = do_filldir_main(ip, ctx, darr, entries, need_sort ? + sort_offset : entries, copied); out_free: for(i = 0; i < leaf; i++) - brelse(larr[i]); + if (larr[i]) + brelse(larr[i]); kvfree(larr); out: return error; @@ -1478,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct dirent_gather g; - const struct gfs2_dirent **darr, *dent; + struct gfs2_dirent **darr, *dent; struct buffer_head *dibh; int copied = 0; int error; @@ -1502,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, /* 96 is max number of dirents which can be stuffed into an inode */ darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); if (darr) { - g.pdent = darr; + g.pdent = (const struct gfs2_dirent **)darr; g.offset = 0; dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, gfs2_dirent_gather, NULL, &g); @@ -1519,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, error = -EIO; goto out; } + gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries); error = do_filldir_main(dip, ctx, darr, - dip->i_entries, &copied); + dip->i_entries, 0, &copied); out: kfree(darr); } @@ -1555,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { + struct inode *inode; + u16 rahead; + if (IS_ERR(dent)) return ERR_CAST(dent); dtype = be16_to_cpu(dent->de_type); + rahead = be16_to_cpu(dent->de_rahead); addr = be64_to_cpu(dent->de_inum.no_addr); formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + if (!IS_ERR(inode)) + GFS2_I(inode)->i_rahead = rahead; + return inode; } return ERR_PTR(-ENOENT); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index cf4ab89159f4..7412863cda1e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) gfsflags &= ~GFS2_DIF_TOPDIR; if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; int hint = min_t(size_t, INT_MAX, blks); - if (hint > atomic_read(&ip->i_res->rs_sizehint)) - atomic_set(&ip->i_res->rs_sizehint, hint); + if (hint > atomic_read(&ip->i_res.rs_sizehint)) + atomic_set(&ip->i_res.rs_sizehint, hint); } /** @@ -397,14 +397,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); - ret = get_write_access(inode); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out; - ret = gfs2_rs_alloc(ip); - if (ret) - goto out_write_access; - gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -486,8 +482,6 @@ out_uninit: set_page_dirty(page); wait_for_stable_page(page); } -out_write_access: - put_write_access(inode); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); @@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (!(file->f_mode & FMODE_WRITE)) return 0; - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rsqa_delete(ip, &inode->i_writecount); return 0; } @@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct gfs2_inode *ip = GFS2_I(file_inode(file)); int ret; - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) return ret; @@ -897,8 +891,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) { i_size_write(inode, pos + count); - /* Marks the inode as dirty */ file_update_time(file); + mark_inode_dirty(inode); } return generic_write_sync(file, pos, count); @@ -938,13 +932,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if (ret) goto out_unlock; - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out_putw; ret = __gfs2_fallocate(file, mode, offset, len); if (ret) - gfs2_rs_deltree(ip->i_res); + gfs2_rs_deltree(&ip->i_res); + out_putw: put_write_access(inode); out_unlock: @@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, int error; struct gfs2_inode *ip = GFS2_I(out->f_mapping->host); - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return (ssize_t)error; @@ -1000,7 +995,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) } if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { if (fl->fl_type == F_UNLCK) - posix_lock_file_wait(file, fl); + locks_lock_file_wait(file, fl); return -EIO; } if (IS_GETLK(cmd)) @@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file_inode(file)); struct gfs2_glock *gl; unsigned int state; - int flags; + u16 flags; int error = 0; int sleeptime; @@ -1031,8 +1026,11 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (gl) { if (fl_gh->gh_state == state) goto out; - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + locks_lock_file_wait(file, + &(struct file_lock) { + .fl_type = F_UNLCK, + .fl_flags = FL_FLOCK + }); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { @@ -1056,7 +1054,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (error == GLR_TRYFAILED) error = -EAGAIN; } else { - error = flock_lock_file_wait(file, fl); + error = locks_lock_file_wait(file, fl); gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); } @@ -1071,7 +1069,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) struct gfs2_holder *fl_gh = &fp->f_fl_gh; mutex_lock(&fp->f_fl_mutex); - flock_lock_file_wait(file, fl); + locks_lock_file_wait(file, fl); if (fl_gh->gh_gl) { gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9bd1244caf38..a4ff7b56f5cd 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -246,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret) */ static int do_promote(struct gfs2_glock *gl) -__releases(&gl->gl_spin) -__acquires(&gl->gl_spin) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh, *tmp; @@ -260,10 +260,10 @@ restart: if (may_grant(gl, gh)) { if (gh->gh_list.prev == &gl->gl_holders && glops->go_lock) { - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); /* FIXME: eliminate this eventually */ ret = glops->go_lock(gh); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); if (ret) { if (ret == 1) return 2; @@ -361,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) unsigned state = ret & LM_OUT_ST_MASK; int rv; - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); trace_gfs2_glock_state_change(gl, state); state_change(gl, state); gh = find_first_waiter(gl); @@ -405,7 +405,7 @@ retry: pr_err("wanted %u got %u\n", gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); return; } @@ -414,9 +414,9 @@ retry: gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl, gh); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); if (rv) { do_error(gl, rv); goto out; @@ -429,7 +429,7 @@ retry: out: clear_bit(GLF_LOCK, &gl->gl_flags); out_locked: - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); } /** @@ -441,12 +441,12 @@ out_locked: */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) -__releases(&gl->gl_spin) -__acquires(&gl->gl_spin) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - unsigned int lck_flags = gh ? gh->gh_flags : 0; + unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -464,7 +464,7 @@ __acquires(&gl->gl_spin) (gl->gl_state == LM_ST_EXCLUSIVE) || (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (glops->go_sync) glops->go_sync(gl); if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) @@ -485,7 +485,7 @@ __acquires(&gl->gl_spin) gfs2_glock_put(gl); } - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); } /** @@ -513,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) */ static void run_queue(struct gfs2_glock *gl, const int nonblock) -__releases(&gl->gl_spin) -__acquires(&gl->gl_spin) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh = NULL; int ret; @@ -596,7 +596,7 @@ static void glock_work_func(struct work_struct *work) finish_xmote(gl, gl->gl_reply); drop_ref = 1; } - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { @@ -612,7 +612,7 @@ static void glock_work_func(struct work_struct *work) } } run_queue(gl, 0); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (!delay) gfs2_glock_put(gl); else { @@ -750,7 +750,7 @@ again: * */ -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh) { INIT_LIST_HEAD(&gh->gh_list); @@ -774,7 +774,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, * */ -void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) { gh->gh_state = state; gh->gh_flags = flags; @@ -876,8 +876,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) */ static inline void add_to_queue(struct gfs2_holder *gh) -__releases(&gl->gl_spin) -__acquires(&gl->gl_spin) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; @@ -926,10 +926,10 @@ fail: do_cancel: gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (sdp->sd_lockstruct.ls_ops->lm_cancel) sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); } return; @@ -967,7 +967,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { @@ -977,7 +977,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) gl->gl_lockref.count--; } run_queue(gl, 1); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); @@ -1010,7 +1010,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) unsigned delay = 0; int fast_path = 0; - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1018,9 +1018,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh) if (find_first_holder(gl) == NULL) { if (glops->go_unlock) { GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); glops->go_unlock(gh); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_LOCK, &gl->gl_flags); } if (list_empty(&gl->gl_holders) && @@ -1033,7 +1033,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (likely(fast_path)) return; @@ -1080,7 +1080,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, struct gfs2_holder *gh) + unsigned int state, u16 flags, struct gfs2_holder *gh) { struct gfs2_glock *gl; int error; @@ -1217,9 +1217,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) delay = gl->gl_hold_time; } - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1259,7 +1259,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) * @gl: Pointer to the glock * @ret: The return value from the dlm * - * The gl_reply field is under the gl_spin lock so that it is ok + * The gl_reply field is under the gl_lockref.lock lock so that it is ok * to use a bitfield shared with other glock state fields. */ @@ -1267,20 +1267,20 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); return; } } gl->gl_lockref.count++; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); @@ -1326,14 +1326,14 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); - if (!spin_trylock(&gl->gl_spin)) { + if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); continue; } if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } clear_bit(GLF_LRU, &gl->gl_flags); @@ -1343,7 +1343,7 @@ add_back_to_lru: WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } } @@ -1417,14 +1417,14 @@ static struct shrinker glock_shrinker = { static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos, *next; + struct rhash_head *pos; const struct bucket_table *tbl; int i; rcu_read_lock(); tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) { + rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); @@ -1461,10 +1461,10 @@ static void clear_glock(struct gfs2_glock *gl) { gfs2_glock_remove_from_lru(gl); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0, false); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } @@ -1482,9 +1482,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); gfs2_dump_glock(seq, gl); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); } static void dump_glock_func(struct gfs2_glock *gl) @@ -1506,7 +1506,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); - wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + wait_event_timeout(sdp->sd_glock_wait, + atomic_read(&sdp->sd_glock_disposal) == 0, + HZ * 600); glock_hash_walk(dump_glock_func, sdp); } @@ -1518,10 +1520,10 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip) ret = gfs2_truncatei_resume(ip); gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_LOCK, &gl->gl_flags); run_queue(gl, 1); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); } static const char *state2str(unsigned state) @@ -1539,7 +1541,7 @@ static const char *state2str(unsigned state) return "??"; } -static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) { char *p = buf; if (flags & LM_FLAG_TRY) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 32572f71f027..46ab67fc16da 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -79,15 +79,15 @@ enum { * requested had acquired and released the lock. */ -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 +#define LM_FLAG_TRY 0x0001 +#define LM_FLAG_TRY_1CB 0x0002 +#define LM_FLAG_NOEXP 0x0004 +#define LM_FLAG_ANY 0x0008 +#define LM_FLAG_PRIORITY 0x0010 +#define GL_ASYNC 0x0040 +#define GL_EXACT 0x0080 +#define GL_SKIP 0x0100 +#define GL_NOCACHE 0x0400 /* * lm_async_cb return flags @@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * struct pid *pid; /* Look in glock's list of holders for one with current task as owner */ - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); pid = task_pid(current); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) @@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * } gh = NULL; out: - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); return gh; } @@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, int create, struct gfs2_glock **glp); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, - unsigned flags, struct gfs2_holder *gh); -extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + u16 flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); @@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, + unsigned int state, u16 flags, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); @@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); */ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, - unsigned int state, int flags, + unsigned int state, u16 flags, struct gfs2_holder *gh) { int error; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 1f6c9c3fe5cb..437fd73e381e 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/bio.h> #include <linux/posix_acl.h> +#include <linux/security.h> #include "gfs2.h" #include "incore.h" @@ -146,11 +147,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_rgrpd *rgd; int error; - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); rgd = gl->gl_object; if (rgd) gfs2_rgrp_brelse(rgd); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; @@ -162,11 +163,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl) mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); rgd = gl->gl_object; if (rgd) gfs2_free_clones(rgd); - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); } /** @@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (ip) { set_bit(GIF_INVALID, &ip->i_flags); forget_all_cached_acls(&ip->i_inode); + security_inode_invalidate_secctx(&ip->i_inode); gfs2_dir_hash_inval(ip); } } @@ -542,7 +544,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) * iopen_go_callback - schedule the dcache entry for the inode to be deleted * @gl: the glock * - * gl_spin lock is held while calling this + * gl_lockref.lock lock is held while calling this */ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 121ed08d9d9f..845fb09cc606 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -259,8 +259,8 @@ struct gfs2_holder { struct gfs2_glock *gh_gl; struct pid *gh_owner_pid; - unsigned int gh_state; - unsigned gh_flags; + u16 gh_flags; + u16 gh_state; int gh_error; unsigned long gh_iflags; /* HIF_... */ @@ -270,6 +270,13 @@ struct gfs2_holder { /* Number of quota types we support */ #define GFS2_MAXQUOTAS 2 +struct gfs2_qadata { /* quota allocation data */ + /* Quota stuff */ + struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS]; + unsigned int qa_qd_num; +}; + /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. @@ -288,11 +295,6 @@ struct gfs2_blkreserv { struct gfs2_rbm rs_rbm; /* Start of reservation */ u32 rs_free; /* how many blocks are still free */ u64 rs_inum; /* Inode number for reservation */ - - /* ancillary quota stuff */ - struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; - struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; - unsigned int rs_qa_qd_num; }; /* @@ -334,9 +336,8 @@ struct gfs2_glock { struct lm_lockname gl_name; struct lockref gl_lockref; -#define gl_spin gl_lockref.lock - /* State fields protected by gl_spin */ + /* State fields protected by gl_lockref.lock */ unsigned int gl_state:2, /* Current state */ gl_target:2, /* Target state */ gl_demote_state:2, /* State requested by remote node */ @@ -392,7 +393,8 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */ + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; @@ -403,6 +405,7 @@ struct gfs2_inode { u32 i_diskflags; u8 i_height; u8 i_depth; + u16 i_rahead; }; /* @@ -559,6 +562,8 @@ struct gfs2_args { unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + unsigned int ar_loccookie:1; /* use location based readdir + cookies */ int ar_commit; /* Commit interval */ int ar_statfs_quantum; /* The fast statfs interval */ int ar_quota_quantum; /* The quota interval */ @@ -686,6 +691,7 @@ struct gfs2_sbd { u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_jheight; /* Max height of journaled file's meta tree */ u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ struct gfs2_args sd_args; /* Mount arguments */ struct gfs2_tune sd_tune; /* Filesystem tuning structure */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 063fdfcf8275..3e94400d587c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -191,13 +191,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); fail_iopen: if (io_gl) gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: iget_failed(inode); return ERR_PTR(error); @@ -593,7 +593,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - int error, free_vfs_inode = 0; + int error, free_vfs_inode = 1; u32 aflags = 0; unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; @@ -601,7 +601,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; - error = gfs2_rs_alloc(dip); + error = gfs2_rsqa_alloc(dip); if (error) return error; @@ -650,10 +650,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) - goto fail_free_vfs_inode; + goto fail_gunlock; ip = GFS2_I(inode); - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto fail_free_acls; @@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_entries = 2; break; } + + /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */ + if (dip->i_diskflags & GFS2_DIF_SYSTEM) + ip->i_diskflags |= GFS2_DIF_SYSTEM; + gfs2_set_inode_flags(inode); if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || @@ -733,6 +738,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); + free_vfs_inode = 0; /* After this point, the inode is no longer + considered free. Any failures need to undo + the gfs2 structures. */ if (default_acl) { error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); @@ -766,24 +774,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: - gfs2_glock_dq_uninit(ghs + 1); - if (ip->i_gl) - gfs2_glock_put(ip->i_gl); - goto fail_gunlock; - + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put(io_gl); fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); - gfs2_rs_delete(ip, NULL); + gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); -fail_free_vfs_inode: - free_vfs_inode = 1; fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); @@ -898,7 +901,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (S_ISDIR(inode->i_mode)) return -EPERM; - error = gfs2_rs_alloc(dip); + error = gfs2_rsqa_alloc(dip); if (error) return error; @@ -1371,7 +1374,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) return error; - error = gfs2_rs_alloc(ndip); + error = gfs2_rsqa_alloc(ndip); if (error) return error; @@ -1712,24 +1715,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry, } /** - * gfs2_follow_link - Follow a symbolic link + * gfs2_get_link - Follow a symbolic link * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() + * @inode: The inode of the link + * @done: destructor for return value * * This can handle symlinks of any size. * * Returns: 0 on success or error code */ -static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) +static const char *gfs2_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; char *buf; int error; + if (!dentry) + return ERR_PTR(-ECHILD); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); error = gfs2_glock_nq(&i_gh); if (error) { @@ -1759,7 +1768,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) out: gfs2_glock_dq_uninit(&i_gh); if (!IS_ERR(buf)) - *cookie = buf; + set_delayed_call(done, kfree_link, buf); return buf; } @@ -1854,11 +1863,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = get_write_access(inode); - if (error) - return error; - - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto out; @@ -1898,7 +1903,6 @@ out_end_trans: out_gunlock_q: gfs2_quota_unlock(ip); out: - put_write_access(inode); return error; } @@ -1920,7 +1924,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) struct gfs2_holder i_gh; int error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return error; @@ -2002,7 +2006,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret == 0) ret = generic_setxattr(dentry, name, data, size, flags); gfs2_glock_dq(&gh); @@ -2043,7 +2047,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret == 0) ret = generic_removexattr(dentry, name); gfs2_glock_dq(&gh); @@ -2132,8 +2136,7 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = generic_readlink, - .follow_link = gfs2_follow_link, - .put_link = kfree_put_link, + .get_link = gfs2_get_link, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 284c1542783e..8b907c5cc913 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, s64 delta = sample - s->stats[index]; s->stats[index] += (delta >> 3); index++; - s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2); + s->stats[index] += ((abs(delta) - s->stats[index]) >> 2); } /** diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 536e7a6252cd..0ff028c15199 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } trace_gfs2_log_flush(sdp, 1); + if (type == SHUTDOWN_FLUSH) + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 241a399bf83d..f99f8e94de3f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); - ip->i_res = NULL; + ip->i_qadata = NULL; + memset(&ip->i_res, 0, sizeof(ip->i_res)); + RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_hash_cache = NULL; } @@ -50,7 +52,7 @@ static void gfs2_init_glock_once(void *foo) struct gfs2_glock *gl = foo; INIT_HLIST_BL_NODE(&gl->gl_list); - spin_lock_init(&gl->gl_spin); + spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); @@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; @@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void) if (!gfs2_quotad_cachep) goto fail; - gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk", - sizeof(struct gfs2_blkreserv), + gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata", + sizeof(struct gfs2_qadata), 0, 0, NULL); - if (!gfs2_rsrv_cachep) + if (!gfs2_qadata_cachep) goto fail; register_shrinker(&gfs2_qd_shrinker); @@ -193,8 +196,8 @@ fail_lru: unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); - if (gfs2_rsrv_cachep) - kmem_cache_destroy(gfs2_rsrv_cachep); + if (gfs2_qadata_cachep) + kmem_cache_destroy(gfs2_qadata_cachep); if (gfs2_quotad_cachep) kmem_cache_destroy(gfs2_quotad_cachep); @@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void) rcu_barrier(); mempool_destroy(gfs2_page_pool); - kmem_cache_destroy(gfs2_rsrv_cachep); + kmem_cache_destroy(gfs2_qadata_cachep); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0e1d4be5865a..e137d96f1b17 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } +static void gfs2_meta_read_endio(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_error); + bh = next; + } while (bh && len); + } + bio_put(bio); +} + +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) + */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +{ + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; + + if (!num) + return; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + } + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(rw, bio); +} + /** * gfs2_meta_read - Read a block from disk * @gl: The glock covering the block @@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) */ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp) + int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { *bhp = NULL; @@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; @@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head *bh; int ret = 0; u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + int rahead = 0; + + if (num == ip->i_no_addr) + rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 8ca161567a93..c5086c8af5ed 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp); + int rahead, struct buffer_head **bhp); extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 02586e7eb964..dbed9e243ea2 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_jheightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_leaf)) / + GFS2_MIN_DIRENT_SIZE; return 0; } @@ -910,8 +913,7 @@ fail_qc_i: fail_ut_i: iput(sdp->sd_sc_inode); fail: - if (pn) - iput(pn); + iput(pn); return error; } @@ -1291,6 +1293,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, up_write(&s->s_umount); blkdev_put(bdev, mode); down_write(&s->s_umount); + } else { + /* s_mode must be set before deactivate_locked_super calls */ + s->s_mode = mode; } memset(&args, 0, sizeof(args)); @@ -1312,10 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) goto error_super; } else { - char b[BDEVNAME_SIZE]; - - s->s_mode = mode; - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a31226531ea..be6d9c450b22 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd) error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; - error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh); if (error) goto fail; error = -EIO; @@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd) qd_put(qd); } +/** + * gfs2_qa_alloc - make sure we have a quota allocations data structure, + * if necessary + * @ip: the inode for this reservation + */ +int gfs2_qa_alloc(struct gfs2_inode *ip) +{ + int error = 0; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + down_write(&ip->i_rw_mutex); + if (ip->i_qadata == NULL) { + ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); + if (!ip->i_qadata) + error = -ENOMEM; + } + up_write(&ip->i_rw_mutex); + return error; +} + +void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount) +{ + down_write(&ip->i_rw_mutex); + if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { + kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); + ip->i_qadata = NULL; + } + up_write(&ip->i_rw_mutex); +} + int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; int error; - if (ip->i_res == NULL) { - error = gfs2_rs_alloc(ip); + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + if (ip->i_qadata == NULL) { + error = gfs2_rsqa_alloc(ip); if (error) return error; } - qd = ip->i_res->rs_qa_qd; + qd = ip->i_qadata->qa_qd; - if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) || + if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; - if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) - return 0; - error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && @@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; } @@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; } @@ -587,17 +620,17 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int x; + u32 x; - if (ip->i_res == NULL) + if (ip->i_qadata == NULL) return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qdsb_put(ip->i_res->rs_qa_qd[x]); - ip->i_res->rs_qa_qd[x] = NULL; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qdsb_put(ip->i_qadata->qa_qd[x]); + ip->i_qadata->qa_qd[x] = NULL; } - ip->i_res->rs_qa_qd_num = 0; + ip->i_qadata->qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return error; @@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; - unsigned int x; + u32 x; int error = 0; - error = gfs2_quota_hold(ip, uid, gid); - if (error) - return error; - if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num, + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; + + sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; - error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]); + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; + error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]); if (error) break; } @@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[4]; unsigned int count = 0; - unsigned int x; + u32 x; int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = ip->i_res->rs_qa_qd[x]; + qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); if (!sync) continue; @@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; s64 value, warn, limit; - unsigned int x; + u32 x; int error = 0; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ @@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid)))) @@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; - unsigned int x; + u32 x; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON || + gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid))) { @@ -1635,7 +1670,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) return error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto out_put; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index ad04b3acae2b..5e47c935a515 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -18,6 +18,8 @@ struct gfs2_sbd; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID +extern int gfs2_qa_alloc(struct gfs2_inode *ip); +extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount); extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unhold(struct gfs2_inode *ip); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 475985d14758..07c0265aa195 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) } /** - * gfs2_rs_alloc - make sure we have a reservation assigned to the inode + * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode + * plus a quota allocations data structure, if necessary * @ip: the inode for this reservation */ -int gfs2_rs_alloc(struct gfs2_inode *ip) +int gfs2_rsqa_alloc(struct gfs2_inode *ip) { - int error = 0; - - down_write(&ip->i_rw_mutex); - if (ip->i_res) - goto out; - - ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); - if (!ip->i_res) { - error = -ENOMEM; - goto out; - } - - RB_CLEAR_NODE(&ip->i_res->rs_node); -out: - up_write(&ip->i_rw_mutex); - return error; + return gfs2_qa_alloc(ip); } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) } /** - * gfs2_rs_delete - delete a multi-block reservation + * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation * @ip: The inode for this reservation * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount) { down_write(&ip->i_rw_mutex); - if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { - gfs2_rs_deltree(ip->i_res); - BUG_ON(ip->i_res->rs_free); - kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); - ip->i_res = NULL; + if ((wcount == NULL) || (atomic_read(wcount) <= 1)) { + gfs2_rs_deltree(&ip->i_res); + BUG_ON(ip->i_res.rs_free); } up_write(&ip->i_rw_mutex); + gfs2_qa_delete(ip, wcount); } /** @@ -729,9 +714,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - spin_lock(&gl->gl_spin); + spin_lock(&gl->gl_lockref.lock); gl->gl_object = NULL; - spin_unlock(&gl->gl_spin); + spin_unlock(&gl->gl_lockref.lock); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -933,8 +918,9 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; - rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; - rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; + rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK; + rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr + + rgd->rd_length) * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1157,7 +1143,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); if (error) goto fail; } @@ -1455,7 +1441,7 @@ static void rs_insert(struct gfs2_inode *ip) { struct rb_node **newn, *parent = NULL; int rc; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); @@ -1502,7 +1488,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, { struct gfs2_rbm rbm = { .rgd = rgd, }; u64 goal; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; int ret; @@ -1573,7 +1559,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, } if (n) { - while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) { + while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) { block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; n = n->rb_right; if (n == NULL) @@ -1803,7 +1789,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip continue; *last_unlinked = block; - error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl); + error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl); if (error) continue; @@ -1983,7 +1969,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; @@ -2112,7 +2098,7 @@ next_rgrp: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -2266,7 +2252,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) static void gfs2_adjust_reservation(struct gfs2_inode *ip, const struct gfs2_rbm *rbm, unsigned len) { - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rbm->rgd; unsigned rlen; u64 block; @@ -2309,8 +2295,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, { u64 goal; - if (gfs2_rs_active(ip->i_res)) { - *rbm = ip->i_res->rs_rbm; + if (gfs2_rs_active(&ip->i_res)) { + *rbm = ip->i_res.rs_rbm; return; } @@ -2364,7 +2350,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_alloc_extent(&rbm, dinode, nblocks); block = gfs2_rbm_to_block(&rbm); rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; - if (gfs2_rs_active(ip->i_res)) + if (gfs2_rs_active(&ip->i_res)) gfs2_adjust_reservation(ip, &rbm, *nblocks); ndata = *nblocks; if (dinode) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index c0ab33fa3eed..66b51cf66dfa 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); -extern int gfs2_rs_alloc(struct gfs2_inode *ip); +extern int gfs2_rsqa_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); @@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, extern int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ -static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) +static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) { return rs && !RB_EMPTY_NODE(&rs->rs_node); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 894fb01a91da..8f960a51a9a0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -83,6 +83,8 @@ enum { Opt_nobarrier, Opt_rgrplvb, Opt_norgrplvb, + Opt_loccookie, + Opt_noloccookie, Opt_error, }; @@ -122,6 +124,8 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_rgrplvb, "rgrplvb"}, {Opt_norgrplvb, "norgrplvb"}, + {Opt_loccookie, "loccookie"}, + {Opt_noloccookie, "noloccookie"}, {Opt_error, NULL} }; @@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) case Opt_norgrplvb: args->ar_rgrplvb = 0; break; + case Opt_loccookie: + args->ar_loccookie = 1; + break; + case Opt_noloccookie: + args->ar_loccookie = 0; + break; case Opt_error: default: pr_warn("invalid mount option: %s\n", o); @@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; @@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); memset(l_bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_meta(m_ip->i_gl, m_bh); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); } int gfs2_statfs_sync(struct super_block *sb, int type) @@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); - down_write(&sdp->sd_log_flush_lock); - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - up_write(&sdp->sd_log_flush_lock); - gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); @@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) seq_puts(s, ",rgrplvb"); + if (args->ar_loccookie) + seq_puts(s, ",loccookie"); return 0; } @@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + struct address_space *metamapping; int error; if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { @@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); goto out; } @@ -1575,8 +1584,8 @@ static void gfs2_evict_inode(struct inode *inode) out_truncate: gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + metamapping = gfs2_glock2aspace(ip->i_gl); if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { - struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); filemap_fdatawrite(metamapping); filemap_fdatawait(metamapping); } @@ -1589,16 +1598,17 @@ out_truncate: goto out_unlock; /* Needs to be done before glock release & also in a transaction */ truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(metamapping, 0); gfs2_trans_end(sdp); out_unlock: /* Error path for case 1 */ - if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip->i_res); + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_deltree(&ip->i_res); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); @@ -1607,7 +1617,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages_final(&inode->i_data); - gfs2_rs_delete(ip, NULL); + gfs2_rsqa_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); @@ -1619,7 +1629,8 @@ out: if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); } } @@ -1632,7 +1643,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip->i_flags = 0; ip->i_gl = NULL; ip->i_rgd = NULL; - ip->i_res = NULL; + memset(&ip->i_res, 0, sizeof(ip->i_res)); + RB_CLEAR_NODE(&ip->i_res.rs_node); + ip->i_rahead = 0; } return &ip->i_inode; } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index b95d0d625f32..0c1bde395062 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) unlock_buffer(bh); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + else + bd = bh->b_private; lock_buffer(bh); gfs2_log_lock(sdp); } @@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) lock_page(bh->b_page); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + else + bd = bh->b_private; unlock_page(bh->b_page); lock_buffer(bh); gfs2_log_lock(sdp); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 86d2035ac669..cf645835710f 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; -struct kmem_cache *gfs2_rsrv_cachep __read_mostly; +struct kmem_cache *gfs2_qadata_cachep __read_mostly; mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index cbdcbdf39614..c81295f407f6 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; -extern struct kmem_cache *gfs2_rsrv_cachep; +extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 4c096fa9e2a1..e8dfb4740c04 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); if (error) return error; @@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return -ENOMEM; for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, bh + x); if (error) { while (x--) @@ -583,11 +583,13 @@ out: * * Returns: actual size of data on success, -errno on error */ -static int gfs2_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int gfs2_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_location el; + int type = handler->flags; int error; if (!ip->i_eattr) @@ -977,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; @@ -1227,61 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, return error; } -static int gfs2_xattr_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int gfs2_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { return __gfs2_xattr_set(d_inode(dentry), name, value, - size, flags, type); -} - - -static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, - struct gfs2_ea_header *ea, char *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int amount = GFS2_EA_DATA_LEN(ea); - unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); - int ret; - - ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); - if (ret) - return ret; - - ret = gfs2_iter_unstuffed(ip, ea, data, NULL); - gfs2_trans_end(sdp); - - return ret; -} - -int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_ea_location el; - int error; - - error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); - if (error) - return error; - - if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); - if (error == 0) { - gfs2_trans_add_meta(ip->i_gl, el.el_bh); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } - } else { - error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); - } - - brelse(el.el_bh); - if (error) - return error; - - error = gfs2_setattr_simple(inode, attr); - gfs2_trans_end(sdp); - return error; + size, flags, handler->flags); } static int ea_dealloc_indirect(struct gfs2_inode *ip) @@ -1303,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index d392f8358f2f..2d887c88eb49 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); -extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index aa3f0d6d043c..a3ec3ae7d347 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("continuing without an alternate MDB\n"); } - HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); + HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; @@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb) unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); - free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); + kfree(HFS_SB(sb)->bitmap); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d421..1ca95c232bb5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 6dd107d7421e..19b33f8151f1 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) } else if (S_ISLNK(inode->i_mode)) { sbi->file_count++; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = 1; } else @@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_mapping->a_ops = &hfsplus_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; } else { init_special_inode(inode, inode->i_mode, diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af68d05..afb33eda6d7d 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { err = posix_acl_equiv_mode(acl, &inode->i_mode); if (err < 0) @@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae8bf..5d54490a136d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 416b1dbafe51..ab01530b4930 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, char *xattr_name; int res; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, int res; char *xattr_name; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -849,12 +843,10 @@ end_removexattr: return err; } -static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int hfsplus_osx_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow retrieving properly prefixed attributes * by prepending them with "osx." @@ -871,12 +863,10 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); } -static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int hfsplus_osx_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow setting properly prefixed attributes * by prepending them with "osx." @@ -893,19 +883,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); } -static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - const struct xattr_handler hfsplus_xattr_osx_handler = { .prefix = XATTR_MAC_OSX_PREFIX, - .list = hfsplus_osx_listxattr, .get = hfsplus_osx_getxattr, .set = hfsplus_osx_setxattr, }; diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index aacff00a9ff9..72a68a3a0c99 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -13,32 +13,24 @@ #include "xattr.h" #include "acl.h" -static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int hfsplus_security_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { return hfsplus_getxattr(dentry, name, buffer, size, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } -static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int hfsplus_security_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { return hfsplus_setxattr(dentry, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } -static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - static int hfsplus_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) @@ -92,7 +84,6 @@ int hfsplus_init_inode_security(struct inode *inode, const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = hfsplus_security_listxattr, .get = hfsplus_security_getxattr, .set = hfsplus_security_setxattr, }; diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index bcf65089b7f7..95a7704c7abb 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -11,34 +11,25 @@ #include "hfsplus_fs.h" #include "xattr.h" -static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { return hfsplus_getxattr(dentry, name, buffer, size, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { return hfsplus_setxattr(dentry, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - const struct xattr_handler hfsplus_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = hfsplus_trusted_listxattr, .get = hfsplus_trusted_getxattr, .set = hfsplus_trusted_setxattr, }; diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 5aa0e6dc4a1e..6fc269baf959 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -11,34 +11,25 @@ #include "hfsplus_fs.h" #include "xattr.h" -static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int hfsplus_user_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { return hfsplus_getxattr(dentry, name, buffer, size, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } -static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int hfsplus_user_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { return hfsplus_setxattr(dentry, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } -static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - const struct xattr_handler hfsplus_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = hfsplus_user_listxattr, .get = hfsplus_user_getxattr, .set = hfsplus_user_setxattr, }; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2ac99db3750e..cfaa18c7a337 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; @@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, init_special_inode(inode, mode, dev); err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); - if (!err) + if (err) goto out_free; err = read_name(inode, name); __putname(name); if (err) goto out_put; - if (err) - goto out_put; d_instantiate(dentry, inode); return 0; @@ -892,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) +static const char *hostfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - char *link = __getname(); + char *link; + if (!dentry) + return ERR_PTR(-ECHILD); + link = kmalloc(PATH_MAX, GFP_KERNEL); if (link) { char *path = dentry_name(dentry); int err = -ENOMEM; @@ -905,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) __putname(path); } if (err < 0) { - __putname(link); + kfree(link); return ERR_PTR(err); } } else { return ERR_PTR(-ENOMEM); } - return *cookie = link; -} - -static void hostfs_put_link(struct inode *unused, void *cookie) -{ - __putname(cookie); + set_delayed_call(done, kfree_link, link); + return link; } static const struct inode_operations hostfs_link_iops = { .readlink = generic_readlink, - .follow_link = hostfs_follow_link, - .put_link = hostfs_put_link, + .get_link = hostfs_get_link, }; static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 933c73780813..1f3c6d76200b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i) kfree(ea); i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &hpfs_symlink_aops; set_nlink(i, 1); i->i_size = ea_size; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index a69bbc1e87f8..a136929189f0 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -133,7 +133,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock) { struct quad_buffer_head qbh; - u32 *directory; + __le32 *directory; u32 n_hotfixes, n_used_hotfixes; unsigned i; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 9e92c9c2d319..506765afa1a3 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -227,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de int err; if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; - if (!new_valid_dev(rdev)) - return -EINVAL; hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); @@ -334,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_blocks = 1; set_nlink(result, 1); result->i_size = strlen(symlink); + inode_nohighmem(result); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; @@ -502,7 +501,7 @@ out: static int hpfs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct inode *i = page->mapping->host; struct fnode *fnode; struct buffer_head *bh; @@ -518,14 +517,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page) goto fail; hpfs_unlock(i->i_sb); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: hpfs_unlock(i->i_sb); SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..458cf463047b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 316adb968b65..8bbf7f3e2a27 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -4,11 +4,11 @@ * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. + * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ @@ -324,20 +324,62 @@ static void remove_huge_page(struct page *page) delete_from_page_cache(page); } +static void +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +{ + struct vm_area_struct *vma; + + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + unsigned long v_offset; + unsigned long v_end; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + if (!end) + v_end = vma->vm_end; + else { + v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + + vma->vm_start; + if (v_end > vma->vm_end) + v_end = vma->vm_end; + } + + unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + NULL); + } +} /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. - + * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. + * maps and global counts. Page faults can not race with truncation + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. + * pages are not modified. Page faults can race with hole punch. + * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -361,77 +403,81 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, next = start; while (next < end) { /* - * Make sure to never grab more pages that we - * might possibly need. + * Don't grab more pages than the number left in the range. */ if (end - next < lookup_nr) lookup_nr = end - next; /* - * This pagevec_lookup() may return pages past 'end', - * so we must check for page->index > end. + * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) { - if (next == start) - break; - next = start; - continue; - } + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + bool rsv_on_error; u32 hash; + /* + * The page (index) could be beyond end. This is + * only possible in the punch hole case as end is + * max page offset in the truncate case. + */ + next = page->index; + if (next >= end) + break; + hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, mapping, next, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); - lock_page(page); - if (page->index >= end) { - unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - next = end; /* we are done */ - break; - } - /* * If page is mapped, it was faulted in after being - * unmapped. Do nothing in this race case. In the - * normal case page is not mapped. + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. */ - if (!page_mapped(page)) { - bool rsv_on_error = !PagePrivate(page); - /* - * We must free the huge page and remove - * from page cache (remove_huge_page) BEFORE - * removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out - * of memory conditions, removal of the - * region/reserve map could fail. Before - * free'ing the page, note PagePrivate which - * is used in case of error. - */ - remove_huge_page(page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages( - inode, next, - next + 1, 1))) - hugetlb_fix_reserve_counts( - inode, rsv_on_error); - } + if (unlikely(page_mapped(page))) { + BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + next * pages_per_huge_page(h), + (next + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); } - if (page->index > next) - next = page->index; + lock_page(page); + /* + * We must free the huge page and remove from page + * cache (remove_huge_page) BEFORE removing the + * region/reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the + * region/reserve map could fail. Before free'ing + * the page, note PagePrivate which is used in case + * of error. + */ + rsv_on_error = !PagePrivate(page); + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, + next, next + 1, 1))) + hugetlb_fix_reserve_counts(inode, + rsv_on_error); + } - ++next; unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); } + ++next; huge_pagevec_release(&pvec); + cond_resched(); } if (truncate_op) @@ -450,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static inline void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) -{ - struct vm_area_struct *vma; - - /* - * end == 0 indicates that the entire range after - * start should be unmapped. - */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { - unsigned long v_offset; - - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (end) { - end = ((end - start) << PAGE_SHIFT) + - vma->vm_start + v_offset; - if (end > vma->vm_end) - end = vma->vm_end; - } else - end = vma->vm_end; - - unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); - } -} - static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; @@ -647,9 +658,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; - spin_lock(&inode->i_lock); - inode->i_private = NULL; - spin_unlock(&inode->i_lock); out: mutex_unlock(&inode->i_mutex); return error; @@ -709,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an - * annotation because huge_pmd_share() does an allocation under + * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; @@ -739,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ @@ -761,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); @@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; @@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void) out2: return error; } - -static void __exit exit_hugetlbfs_fs(void) -{ - struct hstate *h; - int i; - - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(hugetlbfs_inode_cachep); - i = 0; - for_each_hstate(h) - kern_unmount(hugetlbfs_vfsmount[i++]); - unregister_filesystem(&hugetlbfs_fs_type); -} - -module_init(init_hugetlbfs_fs) -module_exit(exit_hugetlbfs_fs) - -MODULE_LICENSE("GPL"); +fs_initcall(init_hugetlbfs_fs) diff --git a/fs/inode.c b/fs/inode.c index 78a17b8859e1..e491e54d2430 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -225,7 +225,7 @@ void __destroy_inode(struct inode *inode) inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); - locks_free_lock_context(inode->i_flctx); + locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); @@ -1597,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) /** * touch_atime - update the access time * @path: the &struct path to update + * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, @@ -1882,7 +1883,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ @@ -2027,3 +2028,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags, new_flags) != old_flags)); } EXPORT_SYMBOL(inode_set_flags); + +void inode_nohighmem(struct inode *inode) +{ + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); +} +EXPORT_SYMBOL(inode_nohighmem); diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0b41..b71deeecea17 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, /* * namespace.c */ -extern int copy_mount_options(const void __user *, unsigned long *); +extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); extern struct vfsmount *lookup_mnt(struct path *); @@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m); * fs/nsfs.c */ extern struct dentry_operations ns_dentry_operations; + +/* + * fs/ioctl.c + */ +extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, + unsigned long arg); +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5d01d2638ca5..29466c380958 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> +#include "internal.h" #include <asm/ioctls.h> @@ -32,8 +33,7 @@ * * Returns 0 on success, -errno on error. */ -static long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) return error; } +static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct fd src_file = fdget(srcfd); + int ret; + + if (!src_file.file) + return -EBADF; + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + fdput(src_file); + return ret; +} + +static long ioctl_file_clone_range(struct file *file, void __user *argp) +{ + struct file_clone_range args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return ioctl_file_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + #ifdef CONFIG_BLOCK static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } +static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +{ + struct file_dedupe_range __user *argp = arg; + struct file_dedupe_range *same = NULL; + int ret; + unsigned long size; + u16 count; + + if (get_user(count, &argp->dest_count)) { + ret = -EFAULT; + goto out; + } + + size = offsetof(struct file_dedupe_range __user, info[count]); + + same = memdup_user(argp, size); + if (IS_ERR(same)) { + ret = PTR_ERR(same); + same = NULL; + goto out; + } + + ret = vfs_dedupe_file_range(file, same); + if (ret) + goto out; + + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; + +out: + kfree(same); + return ret; +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, case FIGETBSZ: return put_user(inode->i_sb->s_blocksize, argp); + case FICLONE: + return ioctl_file_clone(filp, arg, 0, 0, 0); + + case FICLONERANGE: + return ioctl_file_clone_range(filp, argp); + + case FIDEDUPERANGE: + return ioctl_file_dedupe_range(filp, argp); + default: if (S_ISREG(inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d67a16f2a45d..bcd2d41b318a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; @@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 735d7522a3a9..5384ceb35b1c 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct iso_inode_info *ei = ISOFS_I(inode); struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - char *link = kmap(page); + char *link = page_address(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; @@ -774,7 +774,6 @@ repeat: brelse(bh); *rpnt = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -791,7 +790,6 @@ fail: brelse(bh); error: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8c44654ce274..684996c8a3a4 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -427,7 +427,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) struct journal_head *last_jh; struct journal_head *next_jh = jh; int ret; - int freed = 0; if (!jh) return 0; @@ -441,10 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) else ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) - return freed; + return 0; if (ret == 2) return 1; - freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -452,10 +450,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * requested: */ if (need_resched()) - return freed; + return 0; } while (jh != last_jh); - return freed; + return 0; } /* diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 362e5f614450..36345fefa3ff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); - if (JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_has_feature_checksum(journal)) { tmp->h_chksum_type = JBD2_CRC32_CHKSUM; tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); @@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal, bh->b_end_io = journal_end_buffer_io_sync; if (journal->j_flags & JBD2_BARRIER && - !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + !jbd2_has_feature_async_commit(journal)) ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); else ret = submit_bh(WRITE_SYNC, bh); @@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(j)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); else tag->t_checksum = cpu_to_be16(csum32); @@ -730,8 +728,7 @@ start_journal_io: /* * Compute checksum. */ - if (JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_has_feature_checksum(journal)) { crc32_sum = jbd2_checksum_data(crc32_sum, bh); } @@ -797,8 +794,7 @@ start_journal_io: blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); /* Done it all: now write the commit record asynchronously. */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (jbd2_has_feature_async_commit(journal)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) @@ -889,8 +885,7 @@ start_journal_io: commit_transaction->t_state = T_COMMIT_JFLUSH; write_unlock(&journal->j_state_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (!jbd2_has_feature_async_commit(journal)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) @@ -898,8 +893,7 @@ start_journal_io: } if (cbh) err = journal_wait_on_commit_record(journal, cbh); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8270fe9e3641..81e622681c82 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug); /* Checksumming functions */ static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { - if (!jbd2_journal_has_csum_v2or3(j)) + if (!jbd2_journal_has_csum_v2or3_feature(j)) return 1; return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; @@ -1523,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && - JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { /* Can't have checksum v2 and v3 at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " "at the same time!\n"); goto out; } - if (jbd2_journal_has_csum_v2or3(journal) && - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " "at the same time!\n"); @@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal) } /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3(journal)) { + if (jbd2_journal_has_csum_v2or3_feature(journal)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); @@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; goto out; } @@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal) printk(KERN_ERR "JBD2: journal transaction %u on %s " "is corrupt.\n", journal->j_failed_commit, journal->j_devname); - return -EIO; + return -EFSCORRUPTED; } /* OK, we've finished with the dynamic journal bits: @@ -2071,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); - if (errno) + if (errno) { jbd2_journal_update_sb_errno(journal); + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); + } } /** @@ -2197,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal) { size_t sz; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(journal)) return sizeof(journal_block_tag3_t); sz = sizeof(journal_block_tag_t); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_has_feature_csum2(journal)) sz += sizeof(__u16); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) return sz; else return sz - sizeof(__u32); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index a9079d035ae5..7f277e49fe88 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, if (offset >= journal->j_maxlen) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); - return -EIO; + return -EFSCORRUPTED; } err = jbd2_journal_bmap(journal, offset, &blocknr); @@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal, journal_block_tag_t *tag) { unsigned long long block = be32_to_cpu(tag->t_blocknr); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; return block; } @@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); else return tag->t_checksum == cpu_to_be16(csum32); @@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal, printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); - err = -EIO; + err = -EFSBADCRC; brelse(bh); goto failed; } @@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal, * just skip over the blocks it describes. */ if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM) && + jbd2_has_feature_checksum(journal) && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, @@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal, journal, tag, obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); - success = -EIO; + success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", @@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + jbd2_has_feature_checksum(journal)) { int chksum_err, chksum_seen; struct commit_header *cbh = (struct commit_header *)bh->b_data; @@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal, if (chksum_err) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal, bh->b_data)) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) - return -EINVAL; + return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); @@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, return -EINVAL; max = rcount; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) record_len = 8; while (offset + record_len <= max) { diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 0abf2e7f725b..705ae577882b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal, if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; @@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6b8338ec2464..081dff087fc0 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -764,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal) static void warn_dirty_buffer(struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", - bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); + bh->b_bdev, (unsigned long long)bh->b_blocknr); } /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ @@ -1009,7 +1007,8 @@ out: } /* Fast check whether buffer is already attached to the required transaction */ -static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, + bool undo) { struct journal_head *jh; bool ret = false; @@ -1036,6 +1035,9 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) jh = READ_ONCE(bh->b_private); if (!jh) goto out; + /* For undo access buffer must have data copied */ + if (undo && !jh->b_committed_data) + goto out; if (jh->b_transaction != handle->h_transaction && jh->b_next_transaction != handle->h_transaction) goto out; @@ -1073,7 +1075,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int rc; - if (jbd2_write_access_granted(handle, bh)) + if (jbd2_write_access_granted(handle, bh, false)) return 0; jh = jbd2_journal_add_journal_head(bh); @@ -1210,7 +1212,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); - if (jbd2_write_access_granted(handle, bh)) + if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); @@ -1937,8 +1939,8 @@ out: * @journal: journal for operation * @page: to try and free * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. + * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit + * code to release the buffers. * * * For all the buffers on this page, @@ -2152,6 +2154,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, if (!buffer_dirty(bh)) { /* bdflush has written it. We can drop it now */ + __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } @@ -2181,6 +2184,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, /* The orphan record's transaction has * committed. We can cleanse this buffer */ clear_buffer_jbddirty(bh); + __jbd2_journal_remove_checkpoint(jh); goto zap_buffer; } } diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index bb9cebc9ca8a..e5c1783ab64a 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c) siginitset(&hupmask, sigmask(SIGHUP)); allow_signal(SIGKILL); allow_signal(SIGSTOP); - allow_signal(SIGCONT); allow_signal(SIGHUP); c->gc_task = current; @@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c) /* Put_super will send a SIGKILL and then wait on the sem. */ while (signal_pending(current) || freezing(current)) { - siginfo_t info; unsigned long signr; if (try_to_freeze()) goto again; - signr = dequeue_signal_lock(current, ¤t->blocked, &info); + signr = kernel_dequeue_signal(NULL); switch(signr) { case SIGSTOP: jffs2_dbg(1, "%s(): SIGSTOP received\n", __func__); - set_current_state(TASK_STOPPED); - schedule(); + kernel_signal_stop(); break; case SIGKILL: diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 81180022923f..d211b8e18566 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode uint32_t alloclen; int ret; - if (!new_valid_dev(rdev)) - return -EINVAL; - ri = jffs2_alloc_raw_inode(); if (!ri) return -ENOMEM; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index b8fd651307a4..ce1189793288 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void) void jffs2_destroy_slab_caches(void) { - if(full_dnode_slab) - kmem_cache_destroy(full_dnode_slab); - if(raw_dirent_slab) - kmem_cache_destroy(raw_dirent_slab); - if(raw_inode_slab) - kmem_cache_destroy(raw_inode_slab); - if(tmp_dnode_info_slab) - kmem_cache_destroy(tmp_dnode_info_slab); - if(raw_node_ref_slab) - kmem_cache_destroy(raw_node_ref_slab); - if(node_frag_slab) - kmem_cache_destroy(node_frag_slab); - if(inode_cache_slab) - kmem_cache_destroy(inode_cache_slab); + kmem_cache_destroy(full_dnode_slab); + kmem_cache_destroy(raw_dirent_slab); + kmem_cache_destroy(raw_inode_slab); + kmem_cache_destroy(tmp_dnode_info_slab); + kmem_cache_destroy(raw_node_ref_slab); + kmem_cache_destroy(node_frag_slab); + kmem_cache_destroy(inode_cache_slab); #ifdef CONFIG_JFFS2_FS_XATTR - if (xattr_datum_cache) - kmem_cache_destroy(xattr_datum_cache); - if (xattr_ref_cache) - kmem_cache_destroy(xattr_ref_cache); + kmem_cache_destroy(xattr_datum_cache); + kmem_cache_destroy(xattr_ref_cache); #endif } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 28e0aab42bc3..bfebbf13698c 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r err = jffs2_flash_read(c, (ref_offset(ref)) + read, rd->nsize - already, &read, &fd->name[already]); - if (unlikely(read != rd->nsize - already) && likely(!err)) + if (unlikely(read != rd->nsize - already) && likely(!err)) { + jffs2_free_full_dirent(fd); + JFFS2_ERROR("short read: wanted %d bytes, got %zd\n", + rd->nsize - already, read); return -EIO; + } if (unlikely(err)) { JFFS2_ERROR("read remainder of name: error %d\n", err); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index d4b43fb7adb1..7a28facd7175 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -48,42 +48,24 @@ int jffs2_init_security(struct inode *inode, struct inode *dir, } /* ---- XATTR Handler for "security.*" ----------------- */ -static int jffs2_security_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int jffs2_security_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } -static int jffs2_security_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int jffs2_security_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } -static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, XATTR_SECURITY_PREFIX); - strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, .get = jffs2_security_getxattr }; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e3176a1..bb080c272149 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 8ce2f240125b..2cabd649d4fb 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -14,7 +14,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 09ed55190ee2..5a3da3f52908 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work) { struct delayed_work *dwork; - dwork = container_of(work, struct delayed_work, work); + dwork = to_delayed_work(work); return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); } @@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { if ((c->flash_size % c->sector_size) != 0) { c->flash_size = (c->flash_size / c->sector_size) * c->sector_size; pr_warn("flash size adjusted to %dKiB\n", c->flash_size); - }; + } c->wbuf_ofs = 0xFFFFFFFF; c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); @@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); if (!c->wbuf_verify) { - kfree(c->oobbuf); kfree(c->wbuf); return -ENOMEM; } diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index f092fee5be50..da3e18503c65 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; const struct xattr_handler *xhandle; - ssize_t len, rc; + const char *prefix; + ssize_t prefix_len, len, rc; int retry = 0; rc = check_xattr_ref_inode(c, ic); @@ -998,17 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) } } xhandle = xprefix_to_handler(xd->xprefix); - if (!xhandle) + if (!xhandle || (xhandle->list && !xhandle->list(dentry))) continue; + prefix = xhandle->prefix ?: xhandle->name; + prefix_len = strlen(prefix); + rc = prefix_len + xd->name_len + 1; + if (buffer) { - rc = xhandle->list(dentry, buffer+len, size-len, - xd->xname, xd->name_len, xd->flags); - } else { - rc = xhandle->list(dentry, NULL, 0, xd->xname, - xd->name_len, xd->flags); + if (rc > size - len) { + rc = -ERANGE; + goto out; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, xd->xname, xd->name_len); + buffer += xd->name_len; + *buffer++ = 0; } - if (rc < 0) - goto out; len += rc; } rc = len; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index ceaf9c693225..b2555ef07a12 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -16,35 +16,25 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int jffs2_trusted_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } -static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int jffs2_trusted_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) +static bool jffs2_trusted_listxattr(struct dentry *dentry) { - size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; - - if (list && retlen<=list_size) { - strcpy(list, XATTR_TRUSTED_PREFIX); - strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); - } - - return retlen; + return capable(CAP_SYS_ADMIN); } const struct xattr_handler jffs2_trusted_xattr_handler = { diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index a71391eba514..539bd630b5e4 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -16,40 +16,24 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_user_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int jffs2_user_getxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } -static int jffs2_user_setxattr(struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags, int type) +static int jffs2_user_setxattr(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } -static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, XATTR_USER_PREFIX); - strcpy(list + XATTR_USER_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, .get = jffs2_user_getxattr }; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca830b113..49456853e9de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { rc = posix_acl_equiv_mode(acl, &inode->i_mode); if (rc < 0) @@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 41aa3ca6a6a4..9d9bae63ae2a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { if (inode->i_size >= IDATASIZE) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &jfs_aops; } else { inode->i_op = &jfs_fast_symlink_inode_operations; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a69bdf2a1085..a270cb7ff4e0 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log) for (i = 0; i < LOGPAGES;) { char *buffer; uint offset; - struct page *page; + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - buffer = (char *) get_zeroed_page(GFP_KERNEL); - if (buffer == NULL) + if (!page) goto error; - page = virt_to_page(buffer); + buffer = page_address(page); for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); if (lbuf == NULL) { if (offset == 0) - free_page((unsigned long) buffer); + __free_page(page); goto error; } if (offset) /* we already have one reference */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 35976bdccafc..701f89370de7 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); ip->i_op = &jfs_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &jfs_aops; /* @@ -1372,9 +1373,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, tid_t tid; struct tblock *tblk; - if (!new_valid_dev(rdev)) - return -EINVAL; - jfs_info("jfs_mknod: %pd", dentry); rc = dquot_initialize(dir); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4cd9798f4948..900925b5eb8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); - if (!new_valid_dev(sb->s_bdev->bd_dev)) - return -EOVERFLOW; - sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; @@ -901,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 5929e2363cb8..f8db4fde0b0b 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -23,7 +23,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, @@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 91e004518237..821973853340 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - /* - * If the ino of the sysfs entry created for a kmem cache gets - * allocated from an ida layer, which is accounted to the memcg that - * owns the cache, the memcg will get pinned forever. So do not account - * ino ida allocations. - */ - ret = ida_simple_get(&root->ino_ida, 1, 0, - GFP_KERNEL | __GFP_NOACCOUNT); + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; kn->ino = ret; @@ -694,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, return NULL; } +static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, + const unsigned char *path, + const void *ns) +{ + static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ + size_t len = strlcpy(path_buf, path, PATH_MAX); + char *p = path_buf; + char *name; + + lockdep_assert_held(&kernfs_mutex); + + if (len >= PATH_MAX) + return NULL; + + while ((name = strsep(&p, "/")) && parent) { + if (*name == '\0') + continue; + parent = kernfs_find_ns(parent, name, ns); + } + + return parent; +} + /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under @@ -719,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** + * kernfs_walk_and_get_ns - find and get kernfs_node with the given path + * @parent: kernfs_node to search under + * @path: path to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with path @path under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_walk_ns(parent, path, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} + +/** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 756dd56aaf60..16405ae88d2d 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name) if (!attrs) return -ENOMEM; - return simple_xattr_remove(&attrs->xattrs, name); + return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE); } ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, @@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) if (!attrs) return -ENOMEM; - return simple_xattr_list(&attrs->xattrs, buf, size); + return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index db272528ab5b..117b8b3416f9 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) +static const char *kernfs_iop_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (!page) + char *body; + int error; + + if (!dentry) + return ERR_PTR(-ECHILD); + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, (char *)page); + error = kernfs_getlink(dentry, body); if (unlikely(error < 0)) { - free_page((unsigned long)page); + kfree(body); return ERR_PTR(error); } - return *cookie = (char *)page; + set_delayed_call(done, kfree_link, body); + return body; } const struct inode_operations kernfs_symlink_iops = { @@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = { .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, - .follow_link = kernfs_iop_follow_link, - .put_link = free_page_put_link, + .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index c7cbfb092e94..01491299f348 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct inode *unused, void *cookie) +/* Because kfree isn't assignment-compatible with void(void*) ;-/ */ +void kfree_link(void *p) { - kfree(cookie); + kfree(p); } -EXPORT_SYMBOL(kfree_put_link); - -void free_page_put_link(struct inode *unused, void *cookie) -{ - free_page((unsigned long) cookie); -} -EXPORT_SYMBOL(free_page_put_link); +EXPORT_SYMBOL(kfree_link); /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); -const char *simple_follow_link(struct dentry *dentry, void **cookie) +const char *simple_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) { - return d_inode(dentry)->i_link; + return inode->i_link; } -EXPORT_SYMBOL(simple_follow_link); +EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { - .follow_link = simple_follow_link, + .get_link = simple_get_link, .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index acd394716349..112952037933 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho static int do_vfs_lock(struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_file_wait(fl->fl_file, fl); - break; - case FL_FLOCK: - res = flock_lock_file_wait(fl->fl_file, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_file_wait(fl->fl_file, fl); } /* diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 969d589c848d..d716c9993a26 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_get_handle(ni->sap, ni->salen, + nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, ni->hostname, ni->hostname_len); if (unlikely(nsm == NULL)) { dprintk("lockd: %s failed; no nsm handle\n", @@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; + strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: return host; @@ -534,17 +535,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache, /** * nlm_host_rebooted - Release all resources held by rebooted host + * @net: network namespace * @info: pointer to decoded results of NLM_SM_NOTIFY call * * We were notified that the specified host has rebooted. Release * all resources held by that peer. */ -void nlm_host_rebooted(const struct nlm_reboot *info) +void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info) { struct nsm_handle *nsm; struct nlm_host *host; - nsm = nsm_reboot_lookup(info); + nsm = nsm_reboot_lookup(net, info); if (unlikely(nsm == NULL)) return; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 47a32b6d9b90..19166d4a8d31 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -42,7 +42,7 @@ struct nsm_args { u32 proc; char *mon_name; - char *nodename; + const char *nodename; }; struct nsm_res { @@ -51,7 +51,6 @@ struct nsm_res { }; static const struct rpc_program nsm_program; -static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); /* @@ -87,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) return rpc_create(&args); } -static struct rpc_clnt *nsm_client_set(struct lockd_net *ln, - struct rpc_clnt *clnt) -{ - spin_lock(&ln->nsm_clnt_lock); - if (ln->nsm_users == 0) { - if (clnt == NULL) - goto out; - ln->nsm_clnt = clnt; - } - clnt = ln->nsm_clnt; - ln->nsm_users++; -out: - spin_unlock(&ln->nsm_clnt_lock); - return clnt; -} - -static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename) -{ - struct rpc_clnt *clnt, *new; - struct lockd_net *ln = net_generic(net, lockd_net_id); - - clnt = nsm_client_set(ln, NULL); - if (clnt != NULL) - goto out; - - clnt = new = nsm_create(net, nodename); - if (IS_ERR(clnt)) - goto out; - - clnt = nsm_client_set(ln, new); - if (clnt != new) - rpc_shutdown_client(new); -out: - return clnt; -} - -static void nsm_client_put(struct net *net) -{ - struct lockd_net *ln = net_generic(net, lockd_net_id); - struct rpc_clnt *clnt = NULL; - - spin_lock(&ln->nsm_clnt_lock); - ln->nsm_users--; - if (ln->nsm_users == 0) { - clnt = ln->nsm_clnt; - ln->nsm_clnt = NULL; - } - spin_unlock(&ln->nsm_clnt_lock); - if (clnt != NULL) - rpc_shutdown_client(clnt); -} - static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, - struct rpc_clnt *clnt) + const struct nlm_host *host) { int status; + struct rpc_clnt *clnt; struct nsm_args args = { .priv = &nsm->sm_priv, .prog = NLM_PROGRAM, .vers = 3, .proc = NLMPROC_NSM_NOTIFY, .mon_name = nsm->sm_mon_name, - .nodename = clnt->cl_nodename, + .nodename = host->nodename, }; struct rpc_message msg = { .rpc_argp = &args, @@ -158,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, memset(res, 0, sizeof(*res)); + clnt = nsm_create(host->net, host->nodename); + if (IS_ERR(clnt)) { + dprintk("lockd: failed to create NSM upcall transport, " + "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + return PTR_ERR(clnt); + } + msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); if (status == -ECONNREFUSED) { @@ -171,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, status); else status = 0; + + rpc_shutdown_client(clnt); return status; } @@ -190,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; - struct rpc_clnt *clnt; - const char *nodename = NULL; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; - if (host->h_rpcclnt) - nodename = host->h_rpcclnt->cl_nodename; - /* * Choose whether to record the caller_name or IP address of * this peer in the local rpc.statd's database. */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - clnt = nsm_client_get(host->net, nodename); - if (IS_ERR(clnt)) { - status = PTR_ERR(clnt); - dprintk("lockd: failed to create NSM upcall transport, " - "status=%d, net=%p\n", status, host->net); - return status; - } - - status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt); + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host); if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { @@ -247,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host) if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { - struct lockd_net *ln = net_generic(host->net, lockd_net_id); - dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host); if (res.status != 0) status = -EIO; if (status < 0) @@ -259,38 +201,38 @@ void nsm_unmonitor(const struct nlm_host *host) nsm->sm_name); else nsm->sm_monitored = 0; - - nsm_client_put(host->net); } } -static struct nsm_handle *nsm_lookup_hostname(const char *hostname, - const size_t len) +static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles, + const char *hostname, const size_t len) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (strlen(nsm->sm_name) == len && memcmp(nsm->sm_name, hostname, len) == 0) return nsm; return NULL; } -static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles, + const struct sockaddr *sap) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (rpc_cmp_addr(nsm_addr(nsm), sap)) return nsm; return NULL; } -static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles, + const struct nsm_private *priv) { struct nsm_handle *nsm; - list_for_each_entry(nsm, &nsm_handles, sm_link) + list_for_each_entry(nsm, nsm_handles, sm_link) if (memcmp(nsm->sm_priv.data, priv->data, sizeof(priv->data)) == 0) return nsm; @@ -353,6 +295,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, /** * nsm_get_handle - Find or create a cached nsm_handle + * @net: network namespace * @sap: pointer to socket address of handle to find * @salen: length of socket address * @hostname: pointer to C string containing hostname to find @@ -365,11 +308,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, * @hostname cannot be found in the handle cache. Returns NULL if * an error occurs. */ -struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, +struct nsm_handle *nsm_get_handle(const struct net *net, + const struct sockaddr *sap, const size_t salen, const char *hostname, const size_t hostname_len) { struct nsm_handle *cached, *new = NULL; + struct lockd_net *ln = net_generic(net, lockd_net_id); if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { @@ -384,9 +329,10 @@ retry: spin_lock(&nsm_lock); if (nsm_use_hostnames && hostname != NULL) - cached = nsm_lookup_hostname(hostname, hostname_len); + cached = nsm_lookup_hostname(&ln->nsm_handles, + hostname, hostname_len); else - cached = nsm_lookup_addr(sap); + cached = nsm_lookup_addr(&ln->nsm_handles, sap); if (cached != NULL) { atomic_inc(&cached->sm_count); @@ -400,7 +346,7 @@ retry: } if (new != NULL) { - list_add(&new->sm_link, &nsm_handles); + list_add(&new->sm_link, &ln->nsm_handles); spin_unlock(&nsm_lock); dprintk("lockd: created nsm_handle for %s (%s)\n", new->sm_name, new->sm_addrbuf); @@ -417,19 +363,22 @@ retry: /** * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @net: network namespace * @info: pointer to NLMPROC_SM_NOTIFY arguments * * Returns a matching nsm_handle if found in the nsm cache. The returned * nsm_handle's reference count is bumped. Otherwise returns NULL if some * error occurred. */ -struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +struct nsm_handle *nsm_reboot_lookup(const struct net *net, + const struct nlm_reboot *info) { struct nsm_handle *cached; + struct lockd_net *ln = net_generic(net, lockd_net_id); spin_lock(&nsm_lock); - cached = nsm_lookup_priv(&info->priv); + cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv); if (unlikely(cached == NULL)) { spin_unlock(&nsm_lock); dprintk("lockd: never saw rebooted peer '%.*s' before\n", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 097bfa3adb1c..5426189406c1 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -12,9 +12,7 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; - spinlock_t nsm_clnt_lock; - unsigned int nsm_users; - struct rpc_clnt *nsm_clnt; + struct list_head nsm_handles; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d678bcc3cbcb..154a107cd376 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -25,13 +25,17 @@ #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> @@ -44,7 +48,7 @@ static struct svc_program nlmsvc_program; -struct nlmsvc_binding * nlmsvc_ops; +const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); @@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void) static void grace_ender(struct work_struct *grace) { - struct delayed_work *dwork = container_of(grace, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); @@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) } } +static int lockd_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inetaddr_event: removed %pI4\n", + &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inetaddr_notifier = { + .notifier_call = lockd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int lockd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inet6addr_notifier = { + .notifier_call = lockd_inet6addr_event, +}; +#endif + +static void lockd_svc_exit_thread(void) +{ + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + svc_exit_thread(nlmsvc_rqst); +} + static int lockd_start_svc(struct svc_serv *serv) { int error; @@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv) return 0; out_task: - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); nlmsvc_task = NULL; out_rqst: nlmsvc_rqst = NULL; @@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); } + register_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif dprintk("lockd_up: service created\n"); return serv; } @@ -428,7 +497,7 @@ lockd_down(struct net *net) } kthread_stop(nlmsvc_task); dprintk("lockd_down: service stopped\n"); - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); dprintk("lockd_down: service destroyed\n"); nlmsvc_task = NULL; nlmsvc_rqst = NULL; @@ -592,7 +661,7 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; - spin_lock_init(&ln->nsm_clnt_lock); + INIT_LIST_HEAD(&ln->nsm_handles); return 0; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index b147d1ae71fd..09c576f26c7b 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - nlm_host_rebooted(argp); + nlm_host_rebooted(SVC_NET(rqstp), argp); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 21171f0c6477..fb26b9f522e7 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - nlm_host_rebooted(argp); + nlm_host_rebooted(SVC_NET(rqstp), argp); return rpc_success; } diff --git a/fs/locks.c b/fs/locks.c index 2a54c800a223..af1ed74a657f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -119,7 +119,6 @@ #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -205,37 +204,69 @@ static struct kmem_cache *filelock_cache __read_mostly; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) { - struct file_lock_context *new; + struct file_lock_context *ctx; - if (likely(inode->i_flctx) || type == F_UNLCK) + /* paired with cmpxchg() below */ + ctx = smp_load_acquire(&inode->i_flctx); + if (likely(ctx) || type == F_UNLCK) goto out; - new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); - if (!new) + ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!ctx) goto out; - spin_lock_init(&new->flc_lock); - INIT_LIST_HEAD(&new->flc_flock); - INIT_LIST_HEAD(&new->flc_posix); - INIT_LIST_HEAD(&new->flc_lease); + spin_lock_init(&ctx->flc_lock); + INIT_LIST_HEAD(&ctx->flc_flock); + INIT_LIST_HEAD(&ctx->flc_posix); + INIT_LIST_HEAD(&ctx->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, new)) - kmem_cache_free(flctx_cache, new); + if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + kmem_cache_free(flctx_cache, ctx); + ctx = smp_load_acquire(&inode->i_flctx); + } out: - return inode->i_flctx; + trace_locks_get_lock_context(inode, type, ctx); + return ctx; +} + +static void +locks_dump_ctx_list(struct list_head *list, char *list_type) +{ + struct file_lock *fl; + + list_for_each_entry(fl, list, fl_list) { + pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); + } +} + +static void +locks_check_ctx_lists(struct inode *inode) +{ + struct file_lock_context *ctx = inode->i_flctx; + + if (unlikely(!list_empty(&ctx->flc_flock) || + !list_empty(&ctx->flc_posix) || + !list_empty(&ctx->flc_lease))) { + pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", + MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + inode->i_ino); + locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); + locks_dump_ctx_list(&ctx->flc_posix, "POSIX"); + locks_dump_ctx_list(&ctx->flc_lease, "LEASE"); + } } void -locks_free_lock_context(struct file_lock_context *ctx) +locks_free_lock_context(struct inode *inode) { - if (ctx) { - WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); - WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); - WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); + struct file_lock_context *ctx = inode->i_flctx; + + if (unlikely(ctx)) { + locks_check_ctx_lists(inode); kmem_cache_free(flctx_cache, ctx); } } @@ -762,7 +793,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock_context *ctx; struct inode *inode = file_inode(filp); - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -930,7 +961,8 @@ out: return error; } -static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +static int posix_lock_inode(struct inode *inode, struct file_lock *request, + struct file_lock *conflock) { struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; @@ -1138,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); + trace_posix_lock_inode(inode, request, error); + return error; } @@ -1158,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return __posix_lock_file(file_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1167,15 +1201,14 @@ EXPORT_SYMBOL(posix_lock_file); * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Variant of posix_lock_file_wait that does not take a filp, and so can be - * used after the filp has already been torn down. + * Apply a POSIX style lock request to an inode. */ -int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) +static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { - error = __posix_lock_file(inode, fl, NULL); + error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1187,8 +1220,8 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_inode_wait); +#ifdef CONFIG_MANDATORY_FILE_LOCKING /** * locks_mandatory_locked - Check for an active lock * @file: the file to check @@ -1203,7 +1236,7 @@ int locks_mandatory_locked(struct file *file) struct file_lock_context *ctx; struct file_lock *fl; - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) return 0; @@ -1225,20 +1258,16 @@ int locks_mandatory_locked(struct file *file) /** * locks_mandatory_area - Check for a conflicting lock - * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ - * for shared - * @inode: the file to check + * @inode: the file to check * @filp: how the file was opened (if it was) - * @offset: start of area to check - * @count: length of area to check + * @start: first byte in the file to check + * @end: lastbyte in the file to check + * @type: %F_WRLCK for a write lock, else %F_RDLCK * * Searches the inode's list of locks to find any POSIX locks which conflict. - * This function is called from rw_verify_area() and - * locks_verify_truncate(). */ -int locks_mandatory_area(int read_write, struct inode *inode, - struct file *filp, loff_t offset, - size_t count) +int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, + loff_t end, unsigned char type) { struct file_lock fl; int error; @@ -1250,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode, fl.fl_flags = FL_POSIX | FL_ACCESS; if (filp && !(filp->f_flags & O_NONBLOCK)) sleep = true; - fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; - fl.fl_start = offset; - fl.fl_end = offset + count - 1; + fl.fl_type = type; + fl.fl_start = start; + fl.fl_end = end; for (;;) { if (filp) { fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (!error) break; } @@ -1266,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, if (sleep) fl.fl_flags |= FL_SLEEP; fl.fl_owner = current->files; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); @@ -1287,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, } EXPORT_SYMBOL(locks_mandatory_area); +#endif /* CONFIG_MANDATORY_FILE_LOCKING */ static void lease_clear_pending(struct file_lock *fl, int arg) { @@ -1388,7 +1418,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; struct file_lock *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; @@ -1400,6 +1430,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { WARN_ON_ONCE(1); return error; @@ -1494,17 +1525,16 @@ EXPORT_SYMBOL(__break_lease); void lease_get_mtime(struct inode *inode, struct timespec *time) { bool has_lease = false; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; struct file_lock *fl; + ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - if (!list_empty(&ctx->flc_lease)) { - fl = list_first_entry(&ctx->flc_lease, - struct file_lock, fl_list); - if (fl->fl_type == F_WRLCK) - has_lease = true; - } + fl = list_first_entry_or_null(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl && (fl->fl_type == F_WRLCK)) + has_lease = true; spin_unlock(&ctx->flc_lock); } @@ -1543,10 +1573,11 @@ int fcntl_getlease(struct file *filp) { struct file_lock *fl; struct inode *inode = file_inode(filp); - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); + ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); time_out_leases(file_inode(filp), &dispose); @@ -1711,11 +1742,11 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct file_lock_context *ctx = inode->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx; LIST_HEAD(dispose); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -1751,8 +1782,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1856,7 +1886,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) * * Apply a FLOCK style lock request to an inode. */ -int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) +static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); @@ -1873,7 +1903,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(flock_lock_inode_wait); + +/** + * locks_lock_inode_wait - Apply a lock to an inode + * @inode: inode of the file to apply to + * @fl: The lock to be applied + * + * Apply a POSIX or FLOCK style lock request to an inode. + */ +int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_inode_wait(inode, fl); + break; + case FL_FLOCK: + res = flock_lock_inode_wait(inode, fl); + break; + default: + BUG(); + } + return res; +} +EXPORT_SYMBOL(locks_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -1931,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) (can_sleep) ? F_SETLKW : F_SETLK, lock); else - error = flock_lock_file_wait(f.file, lock); + error = locks_lock_file_wait(f.file, lock); out_free: locks_free_lock(lock); @@ -2107,7 +2160,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } -/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */ static int check_fmode_for_setlk(struct file_lock *fl) { @@ -2138,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; + inode = file_inode(filp); + /* * This might block, so we do it before checking the inode. */ @@ -2145,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2155,7 +2208,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } -again: error = flock_to_posix_lock(filp, file_lock, &flock); if (error) goto out; @@ -2194,23 +2246,29 @@ again: error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by - * releasing the lock that was just acquired. + * Attempt to detect a close/fcntl race and recover by releasing the + * lock that was just acquired. There is no need to do that when we're + * unlocking though, or for OFD locks. */ - /* - * we need that spin_lock here - it prevents reordering between - * update of i_flctx->flc_posix and check for it done in close(). - * rcu_read_lock() wouldn't do. - */ - spin_lock(¤t->files->file_lock); - f = fcheck(fd); - spin_unlock(¤t->files->file_lock); - if (!error && f != filp && flock.l_type != F_UNLCK) { - flock.l_type = F_UNLCK; - goto again; + if (!error && file_lock->fl_type != F_UNLCK && + !(file_lock->fl_flags & FL_OFDLCK)) { + /* + * We need that spin_lock here - it prevents reordering between + * update of i_flctx->flc_posix and check for it done in + * close(). rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (f != filp) { + file_lock->fl_type = F_UNLCK; + error = do_lock_file_wait(filp, cmd, file_lock); + WARN_ON_ONCE(error); + error = -EBADF; + } } - out: + trace_fcntl_setlk(inode, file_lock, error); locks_free_lock(file_lock); return error; } @@ -2295,7 +2353,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } -again: error = flock64_to_posix_lock(filp, file_lock, &flock); if (error) goto out; @@ -2334,17 +2391,27 @@ again: error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by - * releasing the lock that was just acquired. + * Attempt to detect a close/fcntl race and recover by releasing the + * lock that was just acquired. There is no need to do that when we're + * unlocking though, or for OFD locks. */ - spin_lock(¤t->files->file_lock); - f = fcheck(fd); - spin_unlock(¤t->files->file_lock); - if (!error && f != filp && flock.l_type != F_UNLCK) { - flock.l_type = F_UNLCK; - goto again; + if (!error && file_lock->fl_type != F_UNLCK && + !(file_lock->fl_flags & FL_OFDLCK)) { + /* + * We need that spin_lock here - it prevents reordering between + * update of i_flctx->flc_posix and check for it done in + * close(). rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (f != filp) { + file_lock->fl_type = F_UNLCK; + error = do_lock_file_wait(filp, cmd, file_lock); + WARN_ON_ONCE(error); + error = -EBADF; + } } - out: locks_free_lock(file_lock); return error; @@ -2358,14 +2425,16 @@ out: */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { + int error; struct file_lock lock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ + ctx = smp_load_acquire(&file_inode(filp)->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2379,17 +2448,18 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_ops = NULL; lock.fl_lmops = NULL; - vfs_lock_file(filp, F_SETLK, &lock, NULL); + error = vfs_lock_file(filp, F_SETLK, &lock, NULL); if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); + trace_locks_remove_posix(file_inode(filp), &lock, error); } EXPORT_SYMBOL(locks_remove_posix); /* The i_flctx must be valid when calling into here */ static void -locks_remove_flock(struct file *filp) +locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl = { .fl_owner = filp, @@ -2400,7 +2470,6 @@ locks_remove_flock(struct file *filp) .fl_end = OFFSET_MAX, }; struct inode *inode = file_inode(filp); - struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2416,10 +2485,8 @@ locks_remove_flock(struct file *filp) /* The i_flctx must be valid when calling into here */ static void -locks_remove_lease(struct file *filp) +locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { - struct inode *inode = file_inode(filp); - struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl, *tmp; LIST_HEAD(dispose); @@ -2439,17 +2506,20 @@ locks_remove_lease(struct file *filp) */ void locks_remove_file(struct file *filp) { - if (!file_inode(filp)->i_flctx) + struct file_lock_context *ctx; + + ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + if (!ctx) return; /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ - locks_remove_flock(filp); + locks_remove_flock(filp, ctx); /* remove any leases */ - locks_remove_lease(filp); + locks_remove_lease(filp, ctx); } /** @@ -2616,7 +2686,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) return; @@ -2678,7 +2748,7 @@ static int __init proc_locks_init(void) proc_create("locks", 0, NULL, &proc_locks_operations); return 0; } -module_init(proc_locks_init); +fs_initcall(proc_locks_init); #endif static int __init filelock_init(void) diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index 09ed066c0221..2b4503163930 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS tristate "LogFS file system" - depends on (MTD || BLOCK) + depends on MTD || (!MTD && BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a7fdbd868474..a709d80c8ebc 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index f9b45d46d4c4..542468e9bfb4 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; return __logfs_create(dir, dentry, inode, target, destlen); @@ -776,12 +777,6 @@ fail: return -EIO; } -const struct inode_operations logfs_symlink_iops = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - const struct inode_operations logfs_dir_iops = { .create = logfs_create, .link = logfs_link, diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index af49e2d6941a..db9cfc598883 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode) inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFLNK: - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFSOCK: /* fall through */ @@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 5f0937609465..27d040e35faa 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -302,7 +302,7 @@ struct logfs_block { struct inode *inode; struct logfs_transaction *ta; unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; - struct logfs_block_ops *ops; + const struct logfs_block_ops *ops; int full; int partial; int reserved_bytes; @@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s, #endif /* dev_mtd.c */ -#ifdef CONFIG_MTD +#if IS_ENABLED(CONFIG_MTD) int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) @@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) #endif /* dir.c */ -extern const struct inode_operations logfs_symlink_iops; extern const struct inode_operations logfs_dir_iops; extern const struct file_operations logfs_dir_fops; int logfs_replay_journal(struct super_block *sb); @@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix); int get_page_reserve(struct inode *inode, struct page *page); void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); -extern struct logfs_block_ops indirect_block_ops; +extern const struct logfs_block_ops indirect_block_ops; /* segment.c */ int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 380d86e1ab45..20973c9e52f8 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -569,13 +569,13 @@ static void indirect_free_block(struct super_block *sb, } -static struct logfs_block_ops inode_block_ops = { +static const struct logfs_block_ops inode_block_ops = { .write_block = inode_write_block, .free_block = inode_free_block, .write_alias = inode_write_alias, }; -struct logfs_block_ops indirect_block_ops = { +const struct logfs_block_ops indirect_block_ops = { .write_block = indirect_write_block, .free_block = indirect_free_block, .write_alias = indirect_write_alias, diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 7f9b096d8d57..d270e4b2ab6b 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, filler_t *filler = super->s_devops->readpage; struct page *page; - BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS); + BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS)); if (use_filler) page = read_cache_page(mapping, index, filler, sb); else { @@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block, return 0; } -static struct logfs_block_ops btree_block_ops = { +static const struct logfs_block_ops btree_block_ops = { .write_block = btree_write_block, .free_block = __free_block, .write_alias = btree_write_alias, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 086cd0a61e80..f975d667c539 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; @@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = { static const struct inode_operations minix_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = minix_getattr, }; @@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_mapping->a_ops = &minix_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; } else init_special_inode(inode, inode->i_mode, rdev); diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 282e15ad8cd8..46ca39d6c735 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; - char b[BDEVNAME_SIZE]; if (block < 0) { - printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", - block, bdevname(inode->i_sb->s_bdev, b)); + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", + block, inode->i_sb->s_bdev); } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { if (printk_ratelimit()) printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %s\n", - block, bdevname(inode->i_sb->s_bdev, b)); + "block %ld too big on dev %pg\n", + block, inode->i_sb->s_bdev); } else if (block < 7) { offsets[n++] = block; } else if ((block -= 7) < 512) { diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 78e2d93e5c83..1ee101352586 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; - char b[BDEVNAME_SIZE]; struct super_block *sb = inode->i_sb; if (block < 0) { - printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", - block, bdevname(sb->s_bdev, b)); + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", + block, sb->s_bdev); } else if ((u64)block * (u64)sb->s_blocksize >= minix_sb(sb)->s_max_size) { if (printk_ratelimit()) printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %s\n", - block, bdevname(sb->s_bdev, b)); + "block %ld too big on dev %pg\n", + block, sb->s_bdev); } else if (block < DIRCOUNT) { offsets[n++] = block; } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { diff --git a/fs/mpage.c b/fs/mpage.c index a7c34274f207..1480d3a18037 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; @@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping); + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); map_bh.b_state = 0; map_bh.b_size = 0; @@ -485,6 +485,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -593,7 +594,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -620,7 +621,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -630,7 +631,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -642,7 +643,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -698,8 +699,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -716,8 +720,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namei.c b/fs/namei.c index 33e9495a3129..bceefd5588a2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -505,13 +505,13 @@ struct nameidata { int total_link_count; struct saved { struct path link; - void *cookie; + struct delayed_call done; const char *name; - struct inode *inode; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; + struct inode *link_inode; unsigned root_seq; int dfd; }; @@ -534,10 +534,8 @@ static void restore_nameidata(void) current->nameidata = old; if (old) old->total_link_count = now->total_link_count; - if (now->stack != now->internal) { + if (now->stack != now->internal) kfree(now->stack); - now->stack = now->internal; - } } static int __nd_alloc_stack(struct nameidata *nd) @@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd) int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) { - inode->i_op->put_link(inode, last->cookie); - last->cookie = NULL; - } + do_delayed_call(&last->done); + clear_delayed_call(&last->done); } } @@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd) * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab - * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller @@ -807,19 +802,19 @@ static int complete_walk(struct nameidata *nd) static void set_root(struct nameidata *nd) { - get_fs_root(current->fs, &nd->root); -} - -static void set_root_rcu(struct nameidata *nd) -{ struct fs_struct *fs = current->fs; - unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_root(fs, &nd->root); + } } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static int nd_jump_root(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + return -ECHILD; + } else { + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->path); + nd->inode = nd->path.dentry->d_inode; + } + nd->flags |= LOOKUP_JUMPED; + return 0; +} + /* - * Helper to directly jump to a known parsed path from ->follow_link, + * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ void nd_jump_link(struct path *path) @@ -858,9 +873,7 @@ void nd_jump_link(struct path *path) static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) - inode->i_op->put_link(inode, last->cookie); + do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } @@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if owner and follower match. */ - inode = nd->stack[0].inode; + inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; @@ -955,26 +968,23 @@ static bool safe_hardlink_source(struct inode *inode) * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) - * - not CAP_FOWNER + * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * Returns 0 if successful, -ve on error. */ static int may_linkat(struct path *link) { - const struct cred *cred; struct inode *inode; if (!sysctl_protected_hardlinks) return 0; - cred = current_cred(); inode = link->dentry->d_inode; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || - capable(CAP_FOWNER)) + if (inode_owner_or_capable(inode) || safe_hardlink_source(inode)) return 0; audit_log_link_denied("linkat", link); @@ -986,7 +996,7 @@ const char *get_link(struct nameidata *nd) { struct saved *last = nd->stack + nd->depth - 1; struct dentry *dentry = last->link.dentry; - struct inode *inode = last->inode; + struct inode *inode = nd->link_inode; int error; const char *res; @@ -1007,36 +1017,27 @@ const char *get_link(struct nameidata *nd) nd->last_type = LAST_BIND; res = inode->i_link; if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, NULL, 0))) - return ERR_PTR(-ECHILD); + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + res = get(dentry, inode, &last->done); + } + } else { + res = get(dentry, inode, &last->done); } - res = inode->i_op->follow_link(dentry, &last->cookie); - if (IS_ERR_OR_NULL(res)) { - last->cookie = NULL; + if (IS_ERR_OR_NULL(res)) return res; - } } if (*res == '/') { - if (nd->flags & LOOKUP_RCU) { - struct dentry *d; - if (!nd->root.mnt) - set_root_rcu(nd); - nd->path = nd->root; - d = nd->path.dentry; - nd->inode = d->d_inode; - nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) - return ERR_PTR(-ECHILD); - } else { - if (!nd->root.mnt) - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - } - nd->flags |= LOOKUP_JUMPED; + if (!nd->root.mnt) + set_root(nd); + if (unlikely(nd_jump_root(nd))) + return ERR_PTR(-ECHILD); while (unlikely(*++res == '/')) ; } @@ -1297,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; - if (!nd->root.mnt) - set_root_rcu(nd); while (1) { if (path_equal(&nd->path, &nd->root)) @@ -1418,9 +1417,6 @@ static void follow_mount(struct path *path) static int follow_dotdot(struct nameidata *nd) { - if (!nd->root.mnt) - set_root(nd); - while(1) { struct dentry *old = nd->path.dentry; @@ -1658,6 +1654,8 @@ static inline int may_lookup(struct nameidata *nd) static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { + if (!nd->root.mnt) + set_root(nd); if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else @@ -1694,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; - last->cookie = NULL; - last->inode = inode; + clear_delayed_call(&last->done); + nd->link_inode = inode; last->seq = seq; return 1; } @@ -1969,7 +1967,7 @@ OK: if (err) { const char *s = get_link(nd); - if (unlikely(IS_ERR(s))) + if (IS_ERR(s)) return PTR_ERR(s); err = 0; if (unlikely(!s)) { @@ -1999,7 +1997,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; - nd->total_link_count = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; @@ -2024,18 +2021,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } nd->root.mnt = NULL; + nd->path.mnt = NULL; + nd->path.dentry = NULL; nd->m_seq = read_seqbegin(&mount_lock); if (*s == '/') { - if (flags & LOOKUP_RCU) { + if (flags & LOOKUP_RCU) rcu_read_lock(); - set_root_rcu(nd); - nd->seq = nd->root_seq; - } else { - set_root(nd); - path_get(&nd->root); - } - nd->path = nd->root; + set_root(nd); + if (likely(!nd_jump_root(nd))) + return s; + nd->root.mnt = NULL; + rcu_read_unlock(); + return ERR_PTR(-ECHILD); } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; @@ -2046,11 +2044,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags) do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; + nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); + nd->inode = nd->path.dentry->d_inode; } + return s; } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); @@ -2080,16 +2081,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) fdput(f); return s; } - - nd->inode = nd->path.dentry->d_inode; - if (!(flags & LOOKUP_RCU)) - return s; - if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - return s; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - return ERR_PTR(-ECHILD); } static const char *trailing_symlink(struct nameidata *nd) @@ -2282,6 +2273,8 @@ EXPORT_SYMBOL(vfs_path_lookup); * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. + * + * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { @@ -2325,6 +2318,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) } EXPORT_SYMBOL(lookup_one_len); +/** + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * Unlike lookup_one_len, it should be called without the parent + * i_mutex held, and will take the i_mutex itself if necessary. + */ +struct dentry *lookup_one_len_unlocked(const char *name, + struct dentry *base, int len) +{ + struct qstr this; + unsigned int c; + int err; + struct dentry *ret; + + this.name = name; + this.len = len; + this.hash = full_name_hash(name, len); + if (!len) + return ERR_PTR(-EACCES); + + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + } + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, &this); + if (err < 0) + return ERR_PTR(err); + } + + err = inode_permission(base->d_inode, MAY_EXEC); + if (err) + return ERR_PTR(err); + + /* + * __d_lookup() is used to try to get a quick answer and avoid the + * mutex. A false-negative does no harm. + */ + ret = __d_lookup(base, &this); + if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) { + dput(ret); + ret = NULL; + } + if (ret) + return ret; + + mutex_lock(&base->d_inode->i_mutex); + ret = __lookup_hash(&this, base, 0); + mutex_unlock(&base->d_inode->i_mutex); + return ret; +} +EXPORT_SYMBOL(lookup_one_len_unlocked); + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { @@ -2673,10 +2735,6 @@ static int may_open(struct path *path, int acc_mode, int flag) struct inode *inode = dentry->d_inode; int error; - /* O_PATH? */ - if (!acc_mode) - return 0; - if (!inode) return -ENOENT; @@ -2698,7 +2756,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, acc_mode); + error = inode_permission(inode, MAY_OPEN | acc_mode); if (error) return error; @@ -2890,7 +2948,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if (*opened & FILE_CREATED) { WARN_ON(!(open_flag & O_CREAT)); fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; + acc_mode = 0; } error = may_open(&file->f_path, acc_mode, open_flag); if (error) @@ -3103,7 +3161,7 @@ retry_lookup: /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; - acc_mode = MAY_OPEN; + acc_mode = 0; path_to_nameidata(&path, nd); goto finish_open_created; } @@ -3187,10 +3245,11 @@ finish_open: got_write = true; } finish_open_created: - error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - + if (likely(!(open_flag & O_PATH))) { + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto out; + } BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (!error) { @@ -3277,7 +3336,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out2; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&path, MAY_OPEN, op->open_flag); + error = may_open(&path, 0, op->open_flag); if (error) goto out2; file->f_path.mnt = path.mnt; @@ -3383,7 +3442,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, return ERR_PTR(-ELOOP); filename = getname_kernel(name); - if (unlikely(IS_ERR(filename))) + if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename); @@ -4499,72 +4558,73 @@ EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that - * have ->follow_link() touching nd only in nd_set_link(). Using (or not - * using) it for any given inode is up to filesystem. + * have ->get_link() not calling nd_jump_link(). Using (or not using) it + * for any given inode is up to filesystem. */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - void *cookie; + DEFINE_DELAYED_CALL(done); struct inode *inode = d_inode(dentry); const char *link = inode->i_link; int res; if (!link) { - link = inode->i_op->follow_link(dentry, &cookie); + link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); - if (inode->i_op->put_link) - inode->i_op->put_link(inode, cookie); + do_delayed_call(&done); return res; } EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ -static char *page_getlink(struct dentry * dentry, struct page **ppage) +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { char *kaddr; struct page *page; - struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; - *ppage = page; - kaddr = kmap(page); - nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + struct address_space *mapping = inode->i_mapping; + + if (!dentry) { + page = find_get_page(mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + } + set_delayed_call(callback, page_put_link, page); + BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); + kaddr = page_address(page); + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } -int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) -{ - struct page *page = NULL; - int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); - if (page) { - kunmap(page); - page_cache_release(page); - } - return res; -} -EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(page_get_link); -const char *page_follow_link_light(struct dentry *dentry, void **cookie) +void page_put_link(void *arg) { - struct page *page = NULL; - char *res = page_getlink(dentry, &page); - if (!IS_ERR(res)) - *cookie = page; - return res; + put_page(arg); } -EXPORT_SYMBOL(page_follow_link_light); +EXPORT_SYMBOL(page_put_link); -void page_put_link(struct inode *unused, void *cookie) +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct page *page = cookie; - kunmap(page); - page_cache_release(page); + DEFINE_DELAYED_CALL(done); + int res = readlink_copy(buffer, buflen, + page_get_link(dentry, d_inode(dentry), + &done)); + do_delayed_call(&done); + return res; } -EXPORT_SYMBOL(page_put_link); +EXPORT_SYMBOL(page_readlink); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4575,7 +4635,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - char *kaddr; unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; if (nofs) flags |= AOP_FLAG_NOFS; @@ -4586,9 +4645,7 @@ retry: if (err) goto fail; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len-1); - kunmap_atomic(kaddr); + memcpy(page_address(page), symname, len-1); err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); @@ -4607,13 +4664,12 @@ EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); + !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/fs/namespace.c b/fs/namespace.c index 0570729c87fd..a830e1463704 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1584,6 +1584,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +static inline bool may_mandlock(void) +{ +#ifndef CONFIG_MANDATORY_FILE_LOCKING + return false; +#endif + return capable(CAP_SYS_ADMIN); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -2601,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from, return n; } -int copy_mount_options(const void __user * data, unsigned long *where) +void *copy_mount_options(const void __user * data) { int i; - unsigned long page; unsigned long size; + char *copy; - *where = 0; if (!data) - return 0; + return NULL; - if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + copy = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!copy) + return ERR_PTR(-ENOMEM); /* We only care that *some* data at the address the user * gave us is valid. Just in case, we'll zero @@ -2623,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where) if (size > PAGE_SIZE) size = PAGE_SIZE; - i = size - exact_copy_from_user((void *)page, data, size); + i = size - exact_copy_from_user(copy, data, size); if (!i) { - free_page(page); - return -EFAULT; + kfree(copy); + return ERR_PTR(-EFAULT); } if (i != PAGE_SIZE) - memset((char *)page + i, 0, PAGE_SIZE - i); - *where = page; - return 0; + memset(copy + i, 0, PAGE_SIZE - i); + return copy; } char *copy_mount_string(const void __user *data) @@ -2677,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; + if (!retval && (flags & MS_MANDLOCK) && !may_mandlock()) + retval = -EPERM; if (retval) goto dput_out; @@ -2896,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, int ret; char *kernel_type; char *kernel_dev; - unsigned long data_page; + void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); @@ -2908,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (IS_ERR(kernel_dev)) goto out_dev; - ret = copy_mount_options(data, &data_page); - if (ret < 0) + options = copy_mount_options(data); + ret = PTR_ERR(options); + if (IS_ERR(options)) goto out_data; - ret = do_mount(kernel_dev, dir_name, kernel_type, flags, - (void *) data_page); + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); - free_page(data_page); + kfree(options); out_data: kfree(kernel_dev); out_dev: @@ -2939,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -int path_is_under(struct path *path1, struct path *path2) +bool path_is_under(struct path *path1, struct path *path2) { - int res; + bool res; read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); read_sequnlock_excl(&mount_lock); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 93575e91a7aa..f0e3e9e747dd 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, qname.name = __name; newdent = d_hash_and_lookup(dentry, &qname); - if (unlikely(IS_ERR(newdent))) + if (IS_ERR(newdent)) goto end_advance; if (!newdent) { newdent = d_alloc(dentry, &qname); @@ -1165,8 +1165,6 @@ out: static int ncp_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - if (!new_valid_dev(rdev)) - return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { ncp_dbg(1, "mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 9605a2f63549..1af15fcbe57b 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; @@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) static const struct inode_operations ncp_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ncp_notify_change, }; #endif @@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &ncp_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &ncp_symlink_aops; #endif } else { diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 79b113048eac..0a3f9b594602 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -525,6 +525,8 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg switch (rqdata.cmd) { case NCP_LOCK_EX: case NCP_LOCK_SH: + if (rqdata.timeout < 0) + return -EINVAL; if (rqdata.timeout == 0) rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT; else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 9cd4eb3a1e22..ddd0138f410c 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -229,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) struct parallel_io *par; loff_t f_offset = header->args.offset; size_t bytes_left = header->args.count; - unsigned int pg_offset, pg_len; + unsigned int pg_offset = header->args.pgbase, pg_len; struct page **pages = header->args.pages; int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; const bool is_dio = (header->dreq != NULL); @@ -262,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header) extent_length = be.be_length - (isect - be.be_f_offset); } - pg_offset = f_offset & ~PAGE_CACHE_MASK; if (is_dio) { if (pg_offset + bytes_left > PAGE_CACHE_SIZE) pg_len = PAGE_CACHE_SIZE - pg_offset; @@ -273,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header) pg_len = PAGE_CACHE_SIZE; } - isect += (pg_offset >> SECTOR_SHIFT); - extent_length -= (pg_offset >> SECTOR_SHIFT); - if (is_hole(&be)) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ @@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) extent_length -= (pg_len >> SECTOR_SHIFT); f_offset += pg_len; bytes_left -= pg_len; + pg_offset = 0; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { header->res.eof = 1; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 75f7c0a7538a..a7f2e6e33305 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv) } #if defined(CONFIG_NFS_V4_1) -static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) -{ - /* - * Create an svc_sock for the back channel service that shares the - * fore channel connection. - * Returns the input port (0) and sets the svc_serv bc_xprt on success - */ - return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, - SVC_SOCK_ANONYMOUS); -} - /* * The callback service for NFSv4.1 callbacks */ @@ -184,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = serv; } #else -static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) -{ - return 0; -} - static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { @@ -259,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc svc_shutdown_net(serv, net); } -static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, + struct net *net, struct rpc_xprt *xprt) { struct nfs_net *nn = net_generic(net, nfs_net_id); int ret; @@ -275,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n goto err_bind; } - switch (minorversion) { - case 0: - ret = nfs4_callback_up_net(serv, net); - break; - case 1: - case 2: - ret = nfs41_callback_up_net(serv, net); - break; - default: - printk(KERN_ERR "NFS: unknown callback version: %d\n", - minorversion); - ret = -EINVAL; - break; - } + ret = -EPROTONOSUPPORT; + if (minorversion == 0) + ret = nfs4_callback_up_net(serv, net); + else if (xprt->ops->bc_up) + ret = xprt->ops->bc_up(serv, net); if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); @@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_create; } - ret = nfs_callback_up_net(minorversion, serv, net); + ret = nfs_callback_up_net(minorversion, serv, net, xprt); if (ret < 0) goto err_net; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 84326e9fb47a..ff8195bd75ea 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -61,7 +61,6 @@ struct cb_compound_hdr_res { }; struct cb_getattrargs { - struct sockaddr *addr; struct nfs_fh fh; uint32_t bitmap[2]; }; @@ -76,7 +75,6 @@ struct cb_getattrres { }; struct cb_recallargs { - struct sockaddr *addr; struct nfs_fh fh; nfs4_stateid stateid; uint32_t truncate; @@ -119,9 +117,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res, struct cb_process_state *cps); -extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, - const nfs4_stateid *stateid); - #define RCA4_TYPE_MASK_RDATA_DLG 0 #define RCA4_TYPE_MASK_WDATA_DLG 1 #define RCA4_TYPE_MASK_DIR_DLG 2 @@ -134,7 +129,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, #define RCA4_TYPE_MASK_ALL 0xf31f struct cb_recallanyargs { - struct sockaddr *craa_addr; uint32_t craa_objs_to_keep; uint32_t craa_type_mask; }; @@ -144,7 +138,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, struct cb_process_state *cps); struct cb_recallslotargs { - struct sockaddr *crsa_addr; uint32_t crsa_target_highest_slotid; }; extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, @@ -152,7 +145,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, struct cb_process_state *cps); struct cb_layoutrecallargs { - struct sockaddr *cbl_addr; uint32_t cbl_recall_type; uint32_t cbl_layout_type; uint32_t cbl_layoutchanged; @@ -196,9 +188,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion, struct net *net); -extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, - const nfs4_stateid *stateid); -extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #endif /* CONFIG_NFS_V4 */ /* * nfs41: Callbacks are expected to not cause substantial latency, @@ -209,6 +198,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #define NFS41_BC_MAX_CALLBACKS 1 extern unsigned int nfs_callback_set_tcpport; -extern unsigned short nfs_callback_tcpport; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index b85cf7a30232..f0939d097406 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -17,9 +17,7 @@ #include "nfs4session.h" #include "nfs4trace.h" -#ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK -#endif __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res, @@ -85,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, + &args->stateid, -ntohl(res)); goto out; + } /* Set up a helper thread to actually return the delegation */ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: @@ -98,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } - trace_nfs4_recall_delegation(inode, -ntohl(res)); + trace_nfs4_cb_recall(cps->clp, &args->fh, inode, + &args->stateid, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -162,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, return lo; } +/* + * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) + */ +static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + + if (newseq > oldseq + 1) + return false; + return true; +} + static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { @@ -171,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); - if (!lo) + if (!lo) { + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, + &args->cbl_stateid, -rv); goto out; + } ino = lo->plh_inode; spin_lock(&ino->i_lock); + if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { + rv = NFS4ERR_DELAY; + goto unlock; + } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) { + /* + * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) + */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { rv = NFS4ERR_DELAY; goto unlock; } + if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, + &args->cbl_range)) { + rv = NFS4_OK; + goto unlock; + } + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } + pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); - trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, + &args->cbl_stateid, -rv); iput(ino); out: return rv; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 6b1697a01dde..646cdac73488 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -18,19 +18,21 @@ #include "internal.h" #include "nfs4session.h" -#define CB_OP_TAGLEN_MAXSZ (512) -#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) -#define CB_OP_GETATTR_BITMAP_MAXSZ (4) -#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ - CB_OP_GETATTR_BITMAP_MAXSZ + \ - 2 + 2 + 3 + 3) -#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_TAGLEN_MAXSZ (512) +#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status +#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps +#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + CB_OP_GETATTR_BITMAP_MAXSZ + \ + /* change, size, ctime, mtime */\ + (2 + 2 + 3 + 3) * 4) +#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if defined(CONFIG_NFS_V4_1) #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ - 4 + 1 + 3) + NFS4_MAX_SESSIONID_LEN + \ + (1 + 3) * 4) // seqid, 3 slotids #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ @@ -157,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound if (unlikely(status != 0)) return status; /* We do not like overly long tags! */ - if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { + if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) { printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", __func__, hdr->taglen); return htonl(NFS4ERR_RESOURCE); @@ -198,7 +200,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) goto out; - args->addr = svc_addr(rqstp); status = decode_bitmap(xdr, args->bitmap); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); @@ -210,7 +211,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - args->addr = svc_addr(rqstp); status = decode_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; @@ -236,7 +236,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, __be32 status = 0; uint32_t iomode; - args->cbl_addr = svc_addr(rqstp); p = read_buf(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); @@ -383,13 +382,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { __be32 *p; - int len = NFS4_MAX_SESSIONID_LEN; - p = read_buf(xdr, len); + p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(sid->data, p, len); + memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN); return 0; } @@ -500,7 +498,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, uint32_t bitmap[2]; __be32 *p, status; - args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -519,7 +516,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, { __be32 *p; - args->crsa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -684,13 +680,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr, const struct nfs4_sessionid *sid) { __be32 *p; - int len = NFS4_MAX_SESSIONID_LEN; - p = xdr_reserve_space(xdr, len); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(p, sid, len); + memcpy(p, sid, NFS4_MAX_SESSIONID_LEN); return 0; } @@ -704,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, if (unlikely(status != 0)) goto out; - encode_sessionid(xdr, &res->csr_sessionid); + status = encode_sessionid(xdr, &res->csr_sessionid); + if (status) + goto out; p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 57c5a02f6213..d6d5d2a48e83 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -764,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->time_delta = fsinfo->time_delta; + server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index be806ead7f4d..5166adcfc0fb 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -721,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode, struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; - filemap_flush(inode->i_mapping); - rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation == NULL) goto out_enoent; - - if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) + if (stateid != NULL && + !clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) goto out_enoent; nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3d8e4ffa0a33..c82a21228a34 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1714,9 +1714,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); - if (!new_valid_dev(rdev)) - return -EINVAL; - attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; @@ -1897,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); if (!page) return -ENOMEM; - kaddr = kmap_atomic(page); + kaddr = page_address(page); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); - kunmap_atomic(kaddr); trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); @@ -2435,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) } EXPORT_SYMBOL_GPL(nfs_may_open); +static int nfs_execute_ok(struct inode *inode, int mask) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (mask & MAY_NOT_BLOCK) + ret = nfs_revalidate_inode_rcu(server, inode); + else + ret = nfs_revalidate_inode(server, inode); + if (ret == 0 && !execute_ok(inode)) + ret = -EACCES; + return ret; +} + int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; @@ -2452,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: + if ((mask & MAY_OPEN) && + nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) + return 0; break; case S_IFDIR: /* @@ -2484,8 +2497,8 @@ force_lookup: res = PTR_ERR(cred); } out: - if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) - res = -EACCES; + if (!res && (mask & MAY_EXEC)) + res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4b1d08f56aba..7ab7ec9f4eed 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) -{ - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -} -EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); - static void nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) { @@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) req = nfs_list_entry(reqs.next); nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + list_splice_init(&reqs, &failed); + goto out_failed; + } list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { @@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_list_add_request(req, &failed); spin_lock(cinfo.lock); dreq->flags = 0; - dreq->error = -EIO; + if (desc.pg_error < 0) + dreq->error = desc.pg_error; + else + dreq->error = -EIO; spin_unlock(cinfo.lock); } nfs_release_request(req); } nfs_pageio_complete(&desc); +out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } -static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - /* There is no lock to clear */ + struct nfs_direct_req *dreq = cinfo->dreq; + + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_mark_request_commit(req, NULL, cinfo, 0); } static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, - .error_cleanup = nfs_direct_error_cleanup, + .resched_write = nfs_direct_resched_write, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head) } } +static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + + spin_lock(&dreq->lock); + if (dreq->error == 0) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = hdr->args.count; + } + spin_unlock(&dreq->lock); +} + static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .error_cleanup = nfs_write_sync_pgio_error, .init_hdr = nfs_direct_pgio_init, .completion = nfs_direct_write_completion, + .reschedule_io = nfs_direct_write_reschedule_io, }; @@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + nfs_free_request(req); + result = desc.pg_error; + break; + } nfs_lock_request(req); req->wb_index = pos >> PAGE_SHIFT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c0f9b1ed12b9..4ef8f5addcad 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Always try to initiate a 'commit' if relevant, but only - * wait for it if __GFP_WAIT is set. Even then, only wait 1 - * second and only if the 'bdi' is not congested. + * wait for it if the caller allows blocking. Even then, + * only wait 1 second and only if the 'bdi' is not congested. * Waiting indefinitely can cause deadlocks when the NFS * server is on this machine, when a new TCP connection is * needed and in other rare cases. There is no particular @@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) if (mapping) { struct nfs_server *nfss = NFS_SERVER(mapping->host); nfs_commit_inode(mapping->host, 0); - if ((gfp & __GFP_WAIT) && + if (gfpflags_allow_blocking(gfp) && !bdi_write_congested(&nfss->backing_dev_info)) { wait_on_page_bit_killable_timeout(page, PG_private, HZ); @@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page, * so it will not block due to pages that will shortly be freeable. */ nfsi = NFS_I(mapping->host); - if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + if (atomic_read(&nfsi->commit_info.rpcs_out)) { *writeback = true; return; } @@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_page(inode, page); + return nfs_wb_launder_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -738,18 +738,7 @@ out_noconflict: static int do_vfs_lock(struct file *file, struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_file_wait(file, fl); - break; - case FL_FLOCK: - res = flock_lock_file_wait(file, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_file_wait(file, fl); } static int @@ -767,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { - status = nfs_iocounter_wait(&l_ctx->io_count); + status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); if (status < 0) return status; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 02ec07973bc4..bb1f4e7a3270 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); + pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -883,13 +884,19 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } + /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index fbc5a56de875..6594e9f903a0 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return false; for (i = 0; i < m1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; i++) { + for (j = 0; j < m2->fh_versions_cnt; j++) { if (nfs_compare_fh(&m1->fh_versions[i], &m2->fh_versions[j]) == 0) { found_fh = true; @@ -339,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) } } +static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls) +{ + struct nfs4_deviceid_node *node; + int i; + + if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS)) + return; + for (i = 0; i < fls->mirror_array_cnt; i++) { + node = &fls->mirror_array[i]->mirror_ds->id_node; + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + } +} + static struct pnfs_layout_segment * ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_layoutget_res *lgr, @@ -492,13 +505,22 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, } p = xdr_inline_decode(&stream, 4); - if (p) - fls->flags = be32_to_cpup(p); + if (!p) + goto out_sort_mirrors; + fls->flags = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_sort_mirrors; + for (i=0; i < fls->mirror_array_cnt; i++) + fls->mirror_array[i]->report_interval = be32_to_cpup(p); +out_sort_mirrors: ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) goto out_err_free; + ff_layout_mark_devices_valid(fls); ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); @@ -589,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; - if (layoutstats_timer != 0) + if (mirror->report_interval != 0) + report_interval = (s64)mirror->report_interval * 1000LL; + else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= report_interval) { @@ -741,17 +765,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, } static struct nfs4_pnfs_ds * -ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio, +ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) { - struct nfs4_ff_layout_segment *fls; + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; int idx; - fls = FF_LAYOUT_LSEG(pgio->pg_lseg); /* mirrors are sorted by efficiency */ - for (idx = 0; idx < fls->mirror_array_cnt; idx++) { - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false); + for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { + ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); if (ds) { *best_idx = idx; return ds; @@ -771,18 +795,24 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; - ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx); + ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); if (!ds) goto out_mds; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); @@ -811,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -853,18 +889,25 @@ static unsigned int ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out; + } + } if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); +out: return 1; } @@ -898,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); - if (!hdr->dreq) { - struct nfs_open_context *ctx; - - ctx = nfs_list_entry(hdr->pages.next)->wb_context; - set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - hdr->completion_ops->error_cleanup(&hdr->pages); - } else { - nfs_direct_set_resched_writes(hdr->dreq); - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + hdr->completion_ops->reschedule_io(hdr); return; } @@ -1035,7 +1067,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: - if (ff_layout_has_available_ds(lseg)) + if (ff_layout_no_fallback_to_mds(lseg) || + ff_layout_has_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; reset: dprintk("%s Retry through MDS. Error %d\n", __func__, @@ -1086,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, return -NFS4ERR_RESET_TO_PNFS; out_retry: task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); return -EAGAIN; } @@ -1144,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + switch (status) { + case NFS4ERR_DELAY: + case NFS4ERR_GRACE: + return; + default: + break; + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, @@ -1153,7 +1194,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } /* NFS_PROTO call done callback routines */ - static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1171,6 +1211,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: + if (ff_layout_choose_best_ds_for_read(hdr->lseg, + hdr->pgio_mirror_idx + 1, + &hdr->pgio_mirror_idx)) + goto out_eagain; set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); @@ -1179,11 +1223,13 @@ static int ff_layout_read_done_cb(struct rpc_task *task, ff_layout_reset_read(hdr); return task->tk_status; case -EAGAIN: - rpc_restart_call_prepare(task); - return -EAGAIN; + goto out_eagain; } return 0; +out_eagain: + rpc_restart_call_prepare(task); + return -EAGAIN; } static bool @@ -1222,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) return ff_layout_test_devid_unavailable(node); } -static int ff_layout_read_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + hdr->res.count); +} +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1245,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, } hdr->pgio_done_cb = ff_layout_read_done_cb; + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1303,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1321,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_read_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); } +static void ff_layout_read_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + + static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1342,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, true); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: - rpc_restart_call_prepare(task); return -EAGAIN; } @@ -1382,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1401,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return 0; } -static int ff_layout_write_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} +static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); +} + +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1425,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EAGAIN; } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1460,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1479,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_write_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, +static void ff_layout_write_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + +static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), 0, task->tk_start); } +static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + struct nfs_page *req; + __u64 count = 0; + + if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); +} + +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + ff_layout_commit_record_layoutstats_start(task, cdata); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { ff_layout_commit_prepare_common(task, data); @@ -1511,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) static void ff_layout_commit_done(struct rpc_task *task, void *data) { - struct nfs_commit_data *cdata = data; - struct nfs_page *req; - __u64 count = 0; - - if (task->tk_status == 0) { - list_for_each_entry(req, &cdata->pages, wb_list) - count += req->wb_bytes; - } - - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - count, count, NFS_FILE_SYNC); - pnfs_generic_write_commit_done(task, data); } @@ -1531,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) { struct nfs_commit_data *cdata = data; + ff_layout_commit_record_layoutstats_done(task, cdata); rpc_count_iostats_metrics(task, &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); } +static void ff_layout_commit_release(void *data) +{ + struct nfs_commit_data *cdata = data; + + ff_layout_commit_record_layoutstats_done(&cdata->task, cdata); + pnfs_generic_commit_release(data); +} + static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static enum pnfs_try_status diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 68cc0d9828f9..dd353bb7dc0a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -10,6 +10,7 @@ #define FS_NFS_NFS4FLEXFILELAYOUT_H #define FF_FLAGS_NO_LAYOUTCOMMIT 1 +#define FF_FLAGS_NO_IO_THRU_MDS 2 #include "../pnfs.h" @@ -84,6 +85,7 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat write_stat; ktime_t start_time; ktime_t last_report_time; + u32 report_interval; }; struct nfs4_ff_layout_segment { @@ -146,6 +148,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg) } static inline bool +ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS; +} + +static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { return nfs4_test_deviceid_unavailable(node); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e125e55de86d..bd0327541366 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (fail_return) { - pnfs_error_mark_layout_for_return(ino, lseg); - if (ff_layout_has_available_ds(lseg)) - pnfs_set_retry_layoutget(lseg->pls_layout); - else - pnfs_clear_retry_layoutget(lseg->pls_layout); - - } else { + if (!fail_return) { if (ff_layout_has_available_ds(lseg)) set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lseg->pls_layout->plh_flags); - else { + else pnfs_error_mark_layout_for_return(ino, lseg); - pnfs_clear_retry_layoutget(lseg->pls_layout); - } - } + } else + pnfs_error_mark_layout_for_return(ino, lseg); } out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 326d9e10d833..8e24d886d2c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(struct wait_bit_key *key) +static int nfs_wait_killable(int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } + +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +{ + return nfs_wait_killable(mode); +} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); +int nfs_wait_atomic_killable(atomic_t *p) +{ + return nfs_wait_killable(TASK_KILLABLE); +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } - } else if (S_ISLNK(inode->i_mode)) + } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; - else + inode_nohighmem(inode); + } else init_special_inode(inode, inode->i_mode, fattr->rdev); memset(&inode->i_atime, 0, sizeof(inode->i_atime)); @@ -618,7 +625,10 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } - nfs_update_inode(inode, fattr); + if (fattr->valid) + nfs_update_inode(inode, fattr); + else + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -696,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); - nfs_iocounter_init(&l_ctx->io_count); + atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) @@ -909,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + /* + * We fatal error on write before. Try to writeback + * every page again. + */ + if (ctx->error < 0) + invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -1083,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) || NFS_STALE(inode); } +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + /** * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode @@ -1638,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1699,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else + } else { nfsi->cache_validity |= save_cache_validity; + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } else if (server->caps & NFS_CAP_MTIME) + } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } else if (server->caps & NFS_CAP_CTIME) + } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1734,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)cur_isize, (long long)new_isize); } - } else + } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - else if (server->caps & NFS_CAP_ATIME) + else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -1755,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } - } else if (server->caps & NFS_CAP_MODE) + } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } else if (server->caps & NFS_CAP_OWNER) + } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } else if (server->caps & NFS_CAP_OWNER_GROUP) + } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1793,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } - } else if (server->caps & NFS_CAP_NLINK) + } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; + else + cache_revalidated = false; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1815,16 +1872,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { - if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { - if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) - nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + if (cache_revalidated) { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, + nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + nfsi->attrtimeo <<= 1; + if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) nfsi->attr_gencount = fattr->gencount; } - invalid &= ~NFS_INO_INVALID_ATTR; + + /* Don't declare attrcache up to date if there were no attrs! */ + if (cache_revalidated) + invalid &= ~NFS_INO_INVALID_ATTR; + /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1904,7 +1969,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 56cfde26fb9c..4e8cc942336c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); -int nfs_iocounter_wait(struct nfs_io_counter *c); +int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); @@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline void nfs_iocounter_init(struct nfs_io_counter *c) -{ - c->flags = 0; - atomic_set(&c->io_count, 0); -} - static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) { WARN_ON_ONCE(desc->pg_mirror_count < 1); return desc->pg_mirror_count > 1; } +static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, @@ -379,7 +379,8 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(struct wait_bit_key *key); +extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); +extern int nfs_wait_atomic_killable(atomic_t *p); /* super.c */ extern const struct super_operations nfs_sops; @@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); -extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); @@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } +static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) +{ + return ~crc32_le(0xFFFFFFFF, &stateid->other[0], + NFS4_STATEID_OTHER_SIZE); +} #else static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return 0; } +static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) +{ + return 0; +} #endif + +static inline bool nfs_error_is_fatal(int err) +{ + switch (err) { + case -ERESTARTSYS: + case -EIO: + case -ENOSPC: + case -EROFS: + case -E2BIG: + return true; + default: + return false; + } +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 99a45283b9ee..09b190015df4 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -16,9 +16,7 @@ #include <linux/nfs_fs.h> #include "internal.h" -#ifdef NFS_DEBUG -# define NFSDBG_FACILITY NFSDBG_MOUNT -#endif +#define NFSDBG_FACILITY NFSDBG_MOUNT /* * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 1ebe2fc7cda2..17c0fa1eccfa 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size) int error; error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, - POSIX_ACL_XATTR_ACCESS, data, size, &result); + XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result); if (error) return error; error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, - POSIX_ACL_XATTR_DEFAULT, data, size, &result); + XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result); if (error) return error; return result; diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 814c1255f1d2..b587ccd31083 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); +int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 0f020e4d8421..6e8174930a48 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -14,7 +14,7 @@ #include "pnfs.h" #include "internal.h" -#define NFSDBG_FACILITY NFSDBG_PNFS +#define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, fmode_t fmode) @@ -204,6 +204,8 @@ static void nfs42_layoutstat_done(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&data->args.stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + } else + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: - NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; default: - dprintk("%s server returns %d\n", __func__, task->tk_status); + break; } + + dprintk("%s server returns %d\n", __func__, task->tk_status); } static void @@ -271,3 +296,75 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, return PTR_ERR(task); return 0; } + +static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, + struct file *dst_f, loff_t src_offset, + loff_t dst_offset, loff_t count) +{ + struct inode *src_inode = file_inode(src_f); + struct inode *dst_inode = file_inode(dst_f); + struct nfs_server *server = NFS_SERVER(dst_inode); + struct nfs42_clone_args args = { + .src_fh = NFS_FH(src_inode), + .dst_fh = NFS_FH(dst_inode), + .src_offset = src_offset, + .dst_offset = dst_offset, + .count = count, + .dst_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_clone_res res = { + .server = server, + }; + int status; + + msg->rpc_argp = &args; + msg->rpc_resp = &res; + + status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ); + if (status) + return status; + + status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE); + if (status) + return status; + + res.dst_fattr = nfs_alloc_fattr(); + if (!res.dst_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); + + kfree(res.dst_fattr); + return status; +} + +int nfs42_proc_clone(struct file *src_f, struct file *dst_f, + loff_t src_offset, loff_t dst_offset, loff_t count) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE], + }; + struct inode *inode = file_inode(src_f); + struct nfs_server *server = NFS_SERVER(file_inode(src_f)); + struct nfs4_exception exception = { }; + int err; + + if (!nfs_server_capable(inode, NFS_CAP_CLONE)) + return -EOPNOTSUPP; + + do { + err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset, + dst_offset, count); + if (err == -ENOTSUPP || err == -EOPNOTSUPP) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE; + return -EOPNOTSUPP; + } + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; + +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0eb29e14070d..0ca482a51e53 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -34,6 +34,12 @@ 1 /* opaque devaddr4 length */ + \ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) #define decode_layoutstats_maxsz (op_decode_hdr_maxsz) +#define encode_clone_maxsz (encode_stateid_maxsz + \ + encode_stateid_maxsz + \ + 2 /* src offset */ + \ + 2 /* dst offset */ + \ + 2 /* count */) +#define decode_clone_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -65,7 +71,20 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) - +#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_clone_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_clone_maxsz + \ + decode_getattr_maxsz) static void encode_fallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args) @@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr, encode_uint32(xdr, 0); } +static void encode_clone(struct xdr_stream *xdr, + struct nfs42_clone_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + p = reserve_space(xdr, 3*8); + p = xdr_encode_hyper(p, args->src_offset); + p = xdr_encode_hyper(p, args->dst_offset); + xdr_encode_hyper(p, args->count); +} + /* * Encode ALLOCATE request */ @@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode CLONE request + */ +static void nfs4_xdr_enc_clone(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_clone_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_clone(xdr, args, &hdr); + encode_getfattr(xdr, args->dst_bitmask, &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_LAYOUTSTATS); } +static int decode_clone(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_CLONE); +} + /* * Decode ALLOCATE request */ @@ -351,4 +411,39 @@ out: return status; } +/* + * Decode CLONE request + */ +static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_clone_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_clone(xdr); + if (status) + goto out; + status = decode_getfattr(xdr, res->dst_fattr, res->server); + +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 50cfc4ca7a02..4afdee420d25 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -183,10 +183,12 @@ struct nfs4_state { struct nfs4_exception { - long timeout; - int retry; struct nfs4_state *state; struct inode *inode; + long timeout; + unsigned char delay : 1, + recovering : 1, + retry : 1; }; struct nfs4_state_recovery_ops { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 223bedda64ae..10410e8b5853 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) return ret; idr_preload(GFP_KERNEL); spin_lock(&nn->nfs_client_lock); - ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); + ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT); if (ret >= 0) clp->cl_cb_ident = ret; spin_unlock(&nn->nfs_client_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index b0dbe0abed53..26f9a23e2b25 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -4,8 +4,10 @@ * Copyright (C) 1992 Rick Sladkey */ #include <linux/fs.h> +#include <linux/file.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> +#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */ #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -192,14 +194,72 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_deallocate(filep, offset, len); return nfs42_proc_allocate(filep, offset, len); } + +static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, u64 count) +{ + struct inode *dst_inode = file_inode(dst_file); + struct nfs_server *server = NFS_SERVER(dst_inode); + struct inode *src_inode = file_inode(src_file); + unsigned int bs = server->clone_blksize; + bool same_inode = false; + int ret; + + /* check alignment w.r.t. clone_blksize */ + ret = -EINVAL; + if (bs) { + if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs)) + goto out; + if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count)) + goto out; + } + + if (src_inode == dst_inode) + same_inode = true; + + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ + if (same_inode) { + mutex_lock(&src_inode->i_mutex); + } else if (dst_inode < src_inode) { + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + } + + /* flush all pending writes on both src and dst so that server + * has the latest data */ + ret = nfs_sync_inode(src_inode); + if (ret) + goto out_unlock; + ret = nfs_sync_inode(dst_inode); + if (ret) + goto out_unlock; + + ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count); + + /* truncate inode page cache of the dst range so that future reads can fetch + * new data from server */ + if (!ret) + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); + +out_unlock: + if (same_inode) { + mutex_unlock(&src_inode->i_mutex); + } else if (dst_inode < src_inode) { + mutex_unlock(&src_inode->i_mutex); + mutex_unlock(&dst_inode->i_mutex); + } else { + mutex_unlock(&dst_inode->i_mutex); + mutex_unlock(&src_inode->i_mutex); + } +out: + return ret; +} #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { -#ifdef CONFIG_NFS_V4_2 - .llseek = nfs4_file_llseek, -#else - .llseek = nfs_file_llseek, -#endif .read_iter = nfs_file_read, .write_iter = nfs_file_write, .mmap = nfs_file_mmap, @@ -211,9 +271,13 @@ const struct file_operations nfs4_file_operations = { .flock = nfs_flock, .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, -#ifdef CONFIG_NFS_V4_2 - .fallocate = nfs42_fallocate, -#endif /* CONFIG_NFS_V4_2 */ .check_flags = nfs_check_flags, .setlease = simple_nosetlease, +#ifdef CONFIG_NFS_V4_2 + .llseek = nfs4_file_llseek, + .fallocate = nfs42_fallocate, + .clone_file_range = nfs42_clone_file_range, +#else + .llseek = nfs_file_llseek, +#endif }; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 2e4902203c35..5ba22c6b0ffa 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, { const struct cred *saved_cred; struct key *rkey; - struct user_key_payload *payload; + const struct user_key_payload *payload; ssize_t ret; saved_cred = override_creds(id_resolver_cache); @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = rcu_dereference(rkey->payload.rcudata); + payload = user_key_payload(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5133bb18830e..4bfc33ad0563 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -78,7 +78,6 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); @@ -209,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + | FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_open_noattr_bitmap[3] = { @@ -239,6 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_FS_LAYOUT_TYPES, FATTR4_WORD2_LAYOUT_BLKSIZE + | FATTR4_WORD2_CLONE_BLKSIZE }; const u32 nfs4_fs_locations_bitmap[3] = { @@ -344,13 +347,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +static int nfs4_do_handle_exception(struct nfs_server *server, + int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; struct inode *inode = exception->inode; int ret = errorcode; + exception->delay = 0; + exception->recovering = 0; exception->retry = 0; switch(errorcode) { case 0: @@ -359,11 +365,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode && nfs4_have_delegation(inode, FMODE_READ)) { - nfs4_inode_return_delegation(inode); - exception->retry = 1; - return 0; - } + if (inode && nfs_async_inode_return_delegation(inode, + NULL) == 0) + goto wait_on_recovery; if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -409,11 +413,12 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ ret = -EBUSY; break; } - case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: - ret = nfs4_delay(server->client, &exception->timeout); - if (ret != 0) - break; + nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + exception->delay = 1; + return 0; + case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: exception->retry = 1; @@ -434,14 +439,85 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ /* We failed to handle the error */ return nfs4_map_errors(ret); wait_on_recovery: - ret = nfs4_wait_clnt_recover(clp); + exception->recovering = 1; + return 0; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. + */ +int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + int ret; + + ret = nfs4_do_handle_exception(server, errorcode, exception); + if (exception->delay) { + ret = nfs4_delay(server->client, &exception->timeout); + goto out_retry; + } + if (exception->recovering) { + ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; + goto out_retry; + } + return ret; +out_retry: + if (ret == 0) + exception->retry = 1; + return ret; +} + +static int +nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, + int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + int ret; + + ret = nfs4_do_handle_exception(server, errorcode, exception); + if (exception->delay) { + rpc_delay(task, nfs4_update_delay(&exception->timeout)); + goto out_retry; + } + if (exception->recovering) { + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + goto out_retry; + } if (test_bit(NFS_MIG_FAILED, &server->mig_status)) - return -EIO; + ret = -EIO; + return ret; +out_retry: if (ret == 0) exception->retry = 1; return ret; } +static int +nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server, + struct nfs4_state *state, long *timeout) +{ + struct nfs4_exception exception = { + .state = state, + }; + + if (task->tk_status >= 0) + return 0; + if (timeout) + exception.timeout = *timeout; + task->tk_status = nfs4_async_handle_exception(task, server, + task->tk_status, + &exception); + if (exception.delay && timeout) + *timeout = exception.timeout; + if (exception.retry) + return -EAGAIN; + return 0; +} + /* * Return 'true' if 'clp' is using an rpc_client that is integrity protected * or 'false' otherwise. @@ -1312,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ + spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { nfs4_stateid_copy(&state->stateid, deleg_stateid); @@ -1320,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s if (open_stateid != NULL) nfs_set_open_stateid_locked(state, open_stateid, fmode); write_sequnlock(&state->seqlock); - spin_lock(&state->owner->so_lock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } @@ -1525,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { state = nfs4_try_open_cached(data); + trace_nfs4_cached_open(data->state); goto out; } @@ -1942,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } return; unlock_no_action: + trace_nfs4_cached_open(data->state); rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -2630,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); + trace_nfs4_setattr(inode, &arg.stateid, status); return status; } @@ -2646,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, int err; do { err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); - trace_nfs4_setattr(inode, err); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -4530,7 +4608,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) #define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) static int buf_to_pages_noslab(const void *buf, size_t buflen, - struct page **pages, unsigned int *pgbase) + struct page **pages) { struct page *newpage, **spages; int rc = 0; @@ -4674,7 +4752,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu goto out_free; args.acl_len = npages * PAGE_SIZE; - args.acl_pgbase = 0; dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", __func__, buf, buflen, npages, args.acl_len); @@ -4766,7 +4843,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; - i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; nfs4_inode_return_delegation(inode); @@ -4955,79 +5032,6 @@ out: #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, - struct nfs4_state *state, long *timeout) -{ - struct nfs_client *clp = server->nfs_client; - - if (task->tk_status >= 0) - return 0; - switch(task->tk_status) { - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: - if (state == NULL) - break; - if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto recovery_failed; - goto wait_on_recovery; - case -NFS4ERR_EXPIRED: - if (state != NULL) { - if (nfs4_schedule_stateid_recovery(server, state) < 0) - goto recovery_failed; - } - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_STALE_CLIENTID: - nfs4_schedule_lease_recovery(clp); - goto wait_on_recovery; - case -NFS4ERR_MOVED: - if (nfs4_schedule_migration_recovery(server) < 0) - goto recovery_failed; - goto wait_on_recovery; - case -NFS4ERR_LEASE_MOVED: - nfs4_schedule_lease_moved_recovery(clp); - goto wait_on_recovery; -#if defined(CONFIG_NFS_V4_1) - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: - dprintk("%s ERROR %d, Reset session\n", __func__, - task->tk_status); - nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - goto wait_on_recovery; -#endif /* CONFIG_NFS_V4_1 */ - case -NFS4ERR_DELAY: - nfs_inc_server_stats(server, NFSIOS_DELAY); - rpc_delay(task, nfs4_update_delay(timeout)); - goto restart_call; - case -NFS4ERR_GRACE: - rpc_delay(task, NFS4_POLL_RETRY_MAX); - case -NFS4ERR_RETRY_UNCACHED_REP: - case -NFS4ERR_OLD_STATEID: - goto restart_call; - } - task->tk_status = nfs4_map_errors(task->tk_status); - return 0; -recovery_failed: - task->tk_status = -EIO; - return 0; -wait_on_recovery: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) - rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); - if (test_bit(NFS_MIG_FAILED, &server->mig_status)) - goto recovery_failed; -restart_call: - task->tk_status = 0; - return -EAGAIN; -} - static void nfs4_init_boot_verifier(const struct nfs_client *clp, nfs4_verifier *bootverf) { @@ -5049,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5077,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", clp->cl_ipaddr, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); @@ -5090,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) static int nfs4_init_uniquifier_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5110,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); @@ -5121,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) static int nfs4_init_uniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5146,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s", + scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; @@ -5385,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (data == NULL) return -ENOMEM; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + + nfs4_state_protect(server->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; @@ -5427,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); - trace_nfs4_delegreturn(inode, err); + trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -5513,18 +5519,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_inode_wait(inode, fl); - break; - case FL_FLOCK: - res = flock_lock_inode_wait(inode, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_inode_wait(inode, fl); } struct nfs4_unlockdata { @@ -5533,7 +5528,7 @@ struct nfs4_unlockdata { struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; struct file_lock fl; - const struct nfs_server *server; + struct nfs_server *server; unsigned long timestamp; }; @@ -5948,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->cancelled = 1; rpc_put_task(task); dprintk("%s: done, ret = %d!\n", __func__, ret); + trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret); return ret; } @@ -5964,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -5991,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); - trace_nfs4_lock_expired(request, state, F_SETLK, err); switch (err) { default: goto out; @@ -6099,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * do { err = _nfs4_proc_setlk(state, cmd, request); - trace_nfs4_set_lock(request, state, cmd, err); if (err == -NFS4ERR_DENIED) err = -EAGAIN; err = nfs4_handle_exception(NFS_SERVER(state->inode), @@ -6260,48 +6253,32 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" -static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, +static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, + struct dentry *dentry, const char *key, const void *buf, size_t buflen, - int flags, int type) + int flags) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); } -static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, - void *buf, size_t buflen, int type) +static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, + struct dentry *dentry, const char *key, + void *buf, size_t buflen) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } -static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len, int type) +static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - size_t len = sizeof(XATTR_NAME_NFSV4_ACL); - - if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) - return 0; - - if (list && len <= list_len) - memcpy(list, XATTR_NAME_NFSV4_ACL, len); - return len; + return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); } #ifdef CONFIG_NFS_V4_SECURITY_LABEL -static inline int nfs4_server_supports_labels(struct nfs_server *server) -{ - return server->caps & NFS_CAP_SECURITY_LABEL; -} -static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags, int type) +static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, + struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags) { if (security_ismaclabel(key)) return nfs4_set_security_label(dentry, buf, buflen); @@ -6309,36 +6286,43 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, return -EOPNOTSUPP; } -static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, - void *buf, size_t buflen, int type) +static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler, + struct dentry *dentry, const char *key, + void *buf, size_t buflen) { if (security_ismaclabel(key)) return nfs4_get_security_label(d_inode(dentry), buf, buflen); return -EOPNOTSUPP; } -static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len, int type) +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) { - size_t len = 0; + int len = 0; - if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(d_inode(dentry), NULL, 0); - if (list && len <= list_len) - security_inode_listsecurity(d_inode(dentry), list, len); + if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(inode, list, list_len); + if (list_len && len > list_len) + return -ERANGE; } return len; } static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = nfs4_xattr_list_nfs4_label, .get = nfs4_xattr_get_nfs4_label, .set = nfs4_xattr_set_nfs4_label, }; -#endif +#else + +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) +{ + return 0; +} + +#endif /* * nfs_fhget will use either the mounted_on_fileid or the fileid @@ -6868,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { }, .allow.u.words = { [0] = 1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN) | 1 << (OP_COMMIT), [1] = 1 << (OP_SECINFO - 32) | 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_LAYOUTRETURN - 32) | 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32) | 1 << (OP_WRITE - 32) @@ -6936,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, } if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) && + test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); } + if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { + dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &clp->cl_sp4_flags); + } + if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); @@ -7769,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); + int ret; dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -7779,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; - if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, &lgp->args.range, - lgp->args.ctx->state)) { - rpc_exit(task, NFS4_OK); - } + lgp->args.ctx->state); + if (ret < 0) + rpc_exit(task, ret); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -7804,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + + /* + * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs + * on the file. set tk_status to -ENODATA to tell upper layer to + * retry go inband. + */ + case -NFS4ERR_LAYOUTUNAVAILABLE: + task->tk_status = -ENODATA; + goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). @@ -7872,7 +7877,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) spin_unlock(&inode->i_lock); goto out_restart; } - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) goto out_restart; out: dprintk("<-- %s\n", __func__); @@ -8000,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, + &lgp->res.stateid, status); /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) @@ -8056,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); @@ -8092,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) }; int status = 0; + nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client, + NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &task_setup_data.rpc_client, &msg); + dprintk("--> %s\n", __func__); if (!sync) { lrp->inode = nfs_igrab_and_active(lrp->args.inode); @@ -8107,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutreturn(lrp->args.inode, status); + trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; @@ -8255,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutcommit(data->args.inode, status); + trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status); dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; @@ -8729,7 +8739,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_ALLOCATE | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK - | NFS_CAP_LAYOUTSTATS, + | NFS_CAP_LAYOUTSTATS + | NFS_CAP_CLONE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -8754,6 +8765,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; +ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t error, error2; + + error = generic_listxattr(dentry, list, size); + if (error < 0) + return error; + if (list) { + list += error; + size -= error; + } + + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + if (error2 < 0) + return error2; + return error + error2; +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -8770,7 +8799,7 @@ static const struct inode_operations nfs4_dir_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; @@ -8780,7 +8809,7 @@ static const struct inode_operations nfs4_file_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; @@ -8839,7 +8868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { - .prefix = XATTR_NAME_NFSV4_ACL, + .name = XATTR_NAME_NFSV4_ACL, .list = nfs4_xattr_list_nfs4_acl, .get = nfs4_xattr_get_nfs4_acl, .set = nfs4_xattr_set_nfs4_acl, diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0fbd3ab1be22..8693d77c45ea 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -12,7 +12,7 @@ #include "nfs4idmap.h" #include "callback.h" -static const int nfs_set_port_min = 0; +static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d774335cc8bc..2850bce19244 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -6,6 +6,7 @@ #include "internal.h" #include "nfs4session.h" #include "callback.h" +#include "pnfs.h" #define CREATE_TRACE_POINTS #include "nfs4trace.h" diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 671cf68fe56b..2c8d05dae5b1 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done, __entry->highest_slotid = res->sr_highest_slotid; __entry->target_highest_slotid = res->sr_target_highest_slotid; + __entry->status_flags = res->sr_status_flags; __entry->error = res->sr_status; ), TP_printk( @@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __field(u64, fileid) __field(u64, dir) __string(name, ctx->dentry->d_name.name) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, openstateid_seq) + __field(u32, openstateid_hash) ), TP_fast_assign( @@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; - if (!IS_ERR_OR_NULL(state)) + if (!IS_ERR_OR_NULL(state)) { inode = state->inode; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->openstateid_seq = + be32_to_cpu(state->open_stateid.seqid); + __entry->openstateid_hash = + nfs_stateid_hash(&state->open_stateid); + } else { + __entry->stateid_seq = 0; + __entry->stateid_hash = 0; + __entry->openstateid_seq = 0; + __entry->openstateid_hash = 0; + } if (inode != NULL) { __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_printk( "error=%d (%s) flags=%d (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " - "name=%02x:%02x:%llu/%s", + "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " + "openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->flags, @@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->stateid_seq, __entry->stateid_hash, + __entry->openstateid_seq, __entry->openstateid_hash ) ); @@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); +TRACE_EVENT(nfs4_cached_open, + TP_PROTO( + const struct nfs4_state *state + ), + TP_ARGS(state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x stateid=%d:0x%08x", + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + TRACE_EVENT(nfs4_close, TP_PROTO( const struct nfs4_state *state, @@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close, __field(u64, fileid) __field(unsigned int, fmode) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->stateid); ), TP_printk( "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " - "fhandle=0x%08x", + "fhandle=0x%08x openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->fmode ? show_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) cmd=%s:%s range=%lld:%lld " - "fileid=%02x:%02x:%llu fhandle=0x%08x", + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), @@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), \ TP_ARGS(request, state, cmd, error)) DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); +TRACE_EVENT(nfs4_set_lock, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + const nfs4_stateid *lockstateid, + int cmd, + int error + ), + + TP_ARGS(request, state, lockstateid, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, lockstateid_seq) + __field(u32, lockstateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->lockstateid_seq = + be32_to_cpu(lockstateid->seqid); + __entry->lockstateid_hash = + nfs_stateid_hash(lockstateid); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x lockstateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->lockstateid_seq, __entry->lockstateid_hash + ) +); + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, @@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit, __field(dev_t, dev) __field(u32, fhandle) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( __entry->dev = res->server->s_dev; __entry->fhandle = nfs_fhandle_hash(args->fhandle); __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(args->stateid); ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, ), \ TP_ARGS(inode, error)) -DEFINE_NFS4_INODE_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_EVENT(nfs4_access); DEFINE_NFS4_INODE_EVENT(nfs4_readlink); DEFINE_NFS4_INODE_EVENT(nfs4_readdir); @@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); -DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, + TP_PROTO( + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(inode, stateid, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + +#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(inode, stateid, error)) + +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), \ TP_ARGS(clp, fhandle, inode, error)) DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); -DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); +DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(clp, fhandle, inode, stateid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, stateid, error)) +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( @@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget, const struct nfs_open_context *ctx, const struct pnfs_layout_range *args, const struct pnfs_layout_range *res, + const nfs4_stateid *layout_stateid, int error ), - TP_ARGS(ctx, args, res, error), + TP_ARGS(ctx, args, res, layout_stateid, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget, __field(u64, offset) __field(u64, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = d_inode(ctx->dentry); + const struct nfs4_state *state = ctx->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget, __entry->offset = args->offset; __entry->count = args->length; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + if (!error) { + __entry->layoutstateid_seq = + be32_to_cpu(layout_stateid->seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(layout_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "iomode=%s offset=%llu count=%llu", + "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget, __entry->fhandle, show_pnfs_iomode(__entry->iomode), (unsigned long long)__entry->offset, - (unsigned long long)__entry->count + (unsigned long long)__entry->count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +#define show_pnfs_update_layout_reason(reason) \ + __print_symbolic(reason, \ + { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \ + { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \ + { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \ + { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \ + { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \ + { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \ + { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \ + { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ + { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ + { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + +TRACE_EVENT(pnfs_update_layout, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + enum pnfs_update_layout_reason reason + ), + TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(enum pnfs_update_layout_reason, reason) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + __entry->reason = reason; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x (%s)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + show_pnfs_update_layout_reason(__entry->reason) + ) +); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 788adf3897c7..4e4441216804 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1659,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun *p = cpu_to_be32(FATTR4_WORD0_ACL); p = reserve_space(xdr, 4); *p = cpu_to_be32(arg->acl_len); - xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } static void @@ -2491,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - args->acl_pages, args->acl_pgbase, args->acl_len); + args->acl_pages, 0, args->acl_len); encode_nops(&hdr); } @@ -3615,6 +3615,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = 0; if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) goto out; + bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS; status = -EIO; /* Ignore borken servers that return unrequested attrs */ if (unlikely(res == NULL)) @@ -4375,6 +4376,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) goto xdr_error; if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0) goto xdr_error; + + status = -EIO; + if (unlikely(bitmap[0])) + goto xdr_error; + if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0) goto xdr_error; if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0) @@ -4574,6 +4580,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = -EIO; + if (unlikely(bitmap[0])) + goto xdr_error; + status = decode_attr_mode(xdr, bitmap, &fmode); if (status < 0) goto xdr_error; @@ -4627,6 +4637,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = -EIO; + if (unlikely(bitmap[1])) + goto xdr_error; + status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold); if (status < 0) goto xdr_error; @@ -4764,6 +4778,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, return 0; } +/* + * The granularity of a CLONE operation. + */ +static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + *res = 0; + if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE; + } + return 0; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { unsigned int savep; @@ -4789,15 +4825,28 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; + + status = -EIO; + if (unlikely(bitmap[0])) + goto xdr_error; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); if (status != 0) goto xdr_error; + + status = -EIO; + if (unlikely(bitmap[1])) + goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); if (status) goto xdr_error; + status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize); + if (status) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -7465,6 +7514,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(ALLOCATE, enc_allocate, dec_allocate), PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC(CLONE, enc_clone, dec_clone), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 9bc9f04fb7f6..89a15dbe5efc 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -90,7 +90,7 @@ #define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" /* Parameters passed from the kernel command line */ -static char nfs_root_parms[256] __initdata = ""; +static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; /* Text-based mount options passed to super.c */ static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 59f838cdc009..9f80a086b612 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_COMMIT, "COMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5aaed363556a..9aebffb40505 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, retry_lookup: od = osduld_info_lookup(&odi); - if (unlikely(IS_ERR(od))) { + if (IS_ERR(od)) { err = PTR_ERR(od); dprintk("%s: osduld_info_lookup => %d\n", __func__, err); if (err == -ENODEV && retry_flag) { @@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) } unlock_page(page); } - if (PageDirty(page) || PageWriteback(page)) - *uptodate = true; - else - *uptodate = PageUptodate(page); + *uptodate = PageUptodate(page); dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); return page; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fe3ddd20ff89..8ce4f61cbaa5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } -static void -nfs_iocounter_inc(struct nfs_io_counter *c) -{ - atomic_inc(&c->io_count); -} - -static void -nfs_iocounter_dec(struct nfs_io_counter *c) -{ - if (atomic_dec_and_test(&c->io_count)) { - clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_atomic(); - wake_up_bit(&c->flags, NFS_IO_INPROGRESS); - } -} - -static int -__nfs_iocounter_wait(struct nfs_io_counter *c) -{ - wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); - DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); - int ret = 0; - - do { - prepare_to_wait(wq, &q.wait, TASK_KILLABLE); - set_bit(NFS_IO_INPROGRESS, &c->flags); - if (atomic_read(&c->io_count) == 0) - break; - ret = nfs_wait_bit_killable(&q.key); - } while (atomic_read(&c->io_count) != 0 && !ret); - finish_wait(wq, &q.wait); - return ret; -} - /** * nfs_iocounter_wait - wait for i/o to complete - * @c: nfs_io_counter to use + * @l_ctx: nfs_lock_context with io_counter to use * * returns -ERESTARTSYS if interrupted by a fatal signal. * Otherwise returns 0 once the io_count hits 0. */ int -nfs_iocounter_wait(struct nfs_io_counter *c) +nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - if (atomic_read(&c->io_count) == 0) - return 0; - return __nfs_iocounter_wait(c); + return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, + TASK_KILLABLE); } /* @@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; - nfs_iocounter_inc(&l_ctx->io_count); + atomic_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - nfs_iocounter_dec(&l_ctx->io_count); + if (atomic_dec_and_test(&l_ctx->io_count)) + wake_up_atomic_t(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); * @desc: IO descriptor * @hdr: pageio header */ -static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) +static void nfs_pgio_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_mirror *mirror; - u32 midx; - set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - /* TODO: Make sure it's right to clean up all mirrors here - * and not just hdr->pgio_mirror_idx */ - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - } - return -ENOMEM; } /** @@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, unsigned int pagecount, pageused; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) - return nfs_pgio_error(desc, hdr); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); pages = hdr->page_array.pagevec; @@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *pages++ = last_page = req->wb_page; } } - if (WARN_ON_ONCE(pageused != pagecount)) - return nfs_pgio_error(desc, hdr); + if (WARN_ON_ONCE(pageused != pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -EINVAL; + return desc->pg_error; + } if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) @@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; - mirror = nfs_pgio_current_mirror(desc); - hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - /* TODO: make sure this is right with mirroring - or - * should it back out all mirrors? */ - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); @@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (pgio->pg_error < 0) + return pgio->pg_error; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) return -EINVAL; @@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) pgio->pg_mirrors_dynamic = NULL; } -static bool nfs_match_open_context(const struct nfs_open_context *ctx1, - const struct nfs_open_context *ctx2) -{ - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; -} - static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { @@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); + if (desc->pg_error < 0) + return 0; mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) @@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, bytes = req->wb_bytes; nfs_pageio_setup_mirroring(desc, req); + if (desc->pg_error < 0) + goto out_failed; for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { @@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (IS_ERR(dupreq)) { nfs_page_group_unlock(req); - return 0; + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; } nfs_lock_request(dupreq); @@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - return 0; + goto out_failed; } return 1; + +out_failed: + /* + * We might have failed before sending any reqs over wire. + * Clean up rest of the reqs in mirror pg_list. + */ + if (desc->pg_error) { + struct nfs_pgio_mirror *mirror; + void (*func)(struct list_head *); + + /* remember fatal errors */ + if (nfs_error_is_fatal(desc->pg_error)) + mapping_set_error(desc->pg_inode->i_mapping, + desc->pg_error); + + func = desc->pg_completion_ops->error_cleanup; + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + func(&mirror->pg_list); + } + } + return 0; } /* @@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, nfs_pageio_complete(desc); if (!list_empty(&failed)) { list_move(&failed, &hdr->pages); - return -EIO; + return desc->pg_error < 0 ? desc->pg_error : -EIO; } return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8abe27165ad0..a3592cc34a20 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync); /* Return the registered pnfs layout driver module matching given id */ @@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, enum pnfs_iomode iomode; bool send; - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; - int invalid = 0, removed = 0; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - invalid++; - removed += mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; } - dprintk("%s:Return %i\n", __func__, invalid - removed); - return invalid - removed; + dprintk("%s:Return %i\n", __func__, remaining); + return remaining; } /* note free_me must contain lsegs from a single layout_hdr */ @@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_get_layout_hdr(lo); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); - pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, ret = -EAGAIN; spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); iput(inode); } @@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; @@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -872,44 +873,41 @@ send_layoutget(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; - - i_size = i_size_read(ino); - - lgp->args.minlength = PAGE_CACHE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; - } - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; - - /* Synchronously retrieve layout information from server and - * store in lseg. + /* + * Synchronously retrieve layout information from server and + * store in lseg. If we race with a concurrent seqid morphing + * op, then re-send the LAYOUTGET. */ - lseg = nfs4_proc_layoutget(lgp, gfp_flags); - if (IS_ERR(lseg)) { - switch (PTR_ERR(lseg)) { - case -ENOMEM: - case -ERESTARTSYS: - break; - default: - /* remember that LAYOUTGET failed and suspend trying */ - pnfs_layout_io_set_failed(lo, range->iomode); + do { + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; } - return NULL; - } else + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + pnfs_copy_range(&lgp->args.range, range); + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + } while (lseg == ERR_PTR(-EAGAIN)); + + if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) + lseg = NULL; + else pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(range->iomode)); @@ -940,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; @@ -957,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, goto out; } - lrp->args.stateid = stateid; + nfs4_stateid_copy(&lrp->args.stateid, stateid); lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; lrp->args.range.iomode = iomode; @@ -1000,7 +998,7 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - stateid = nfsi->layout->plh_stateid; + nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1028,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: @@ -1091,13 +1089,12 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo); - pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { @@ -1119,7 +1116,7 @@ out_noroc: pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); if (layoutreturn) - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); return roc; } @@ -1144,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; + pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1460,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } -/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) -{ - if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) - return 1; - return nfs_wait_bit_killable(key); -} - static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { - if (!pnfs_should_retry_layoutget(lo)) - return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference */ pnfs_layoutcommit_inode(lo->plh_inode, false); return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, - pnfs_layoutget_retry_bit_wait, + nfs_wait_bit_killable, TASK_UNINTERRUPTIBLE); } @@ -1515,14 +1503,23 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; bool first; - if (!pnfs_enabled_sb(NFS_SERVER(ino))) + if (!pnfs_enabled_sb(NFS_SERVER(ino))) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; + } - if (iomode == IOMODE_READ && i_size_read(ino) == 0) + if (iomode == IOMODE_READ && i_size_read(ino) == 0) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; + } - if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; + } lookup_again: first = false; @@ -1530,19 +1527,25 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode) && - !pnfs_should_retry_layoutget(lo)) + if (pnfs_layout_io_test_failed(lo, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; + } first = list_empty(&lo->plh_segs); if (first) { @@ -1562,8 +1565,11 @@ lookup_again: * already exists */ lseg = pnfs_find_lseg(lo, &arg); - if (lseg) + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; + } } /* @@ -1580,11 +1586,16 @@ lookup_again: dprintk("%s retrying\n", __func__); goto lookup_again; } + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo)) + if (pnfs_layoutgets_blocked(lo)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; + } atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1607,8 +1618,9 @@ lookup_again: arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1618,7 +1630,7 @@ out: "(%s, offset: %llu, length: %llu)\n", __func__, ino->i_sb->s_id, (unsigned long long)NFS_FILEID(ino), - lseg == NULL ? "not found" : "found", + IS_ERR_OR_NULL(lseg) ? "not found" : "found", iomode==IOMODE_RW ? "read/write" : "read-only", (unsigned long long)pos, (unsigned long long)count); @@ -1687,6 +1699,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); + status = -EAGAIN; goto out_forget_reply; } pnfs_set_layout_stateid(lo, &res->stateid, false); @@ -1724,16 +1737,29 @@ out_forget_reply: } static void +pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +{ + if (lo->plh_return_iomode == iomode) + return; + if (lo->plh_return_iomode != 0) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; +} + +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) - return; + return 0; + + assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(&lseg->pls_range, return_range)) { @@ -1743,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.offset, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - mark_lseg_invalid(lseg, tmp_list); + pnfs_set_plh_return_iomode(lo, return_range->iomode); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); } + return remaining; } void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg) { struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); struct pnfs_layout_range range = { .iomode = lseg->pls_range.iomode, .offset = 0, .length = NFS4_MAX_UINT64, }; LIST_HEAD(free_me); + bool return_now = false; spin_lock(&inode->i_lock); - /* set failure bit so that pnfs path will be retried later */ - pnfs_layout_set_fail_bit(lo, iomode); - if (lo->plh_return_iomode == 0) - lo->plh_return_iomode = range.iomode; - else if (lo->plh_return_iomode != range.iomode) - lo->plh_return_iomode = IOMODE_ANY; + pnfs_set_plh_return_iomode(lo, range.iomode); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - pnfs_mark_matching_lsegs_return(lo, &free_me, &range); - spin_unlock(&inode->i_lock); + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode = lo->plh_return_iomode; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + return_now = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (return_now) + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } else { + spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); + } pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -1796,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r rd_size, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) @@ -1808,13 +1848,19 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - if (pgio->pg_lseg == NULL) + if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), wb_size, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); @@ -1912,12 +1958,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { - trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); - if (!hdr->pnfs_error) { + if (likely(!hdr->pnfs_error)) { pnfs_set_layoutcommit(hdr->inode, hdr->lseg, hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } else + } + trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); + if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_write_error(hdr); hdr->mds_ops->rpc_release(hdr); } @@ -1981,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); @@ -2028,11 +2073,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (likely(!hdr->pnfs_error)) { __nfs4_read_done_cb(hdr); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } else + } + trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); + if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_read_error(hdr); hdr->mds_ops->rpc_release(hdr); } @@ -2111,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d1990e90e7a0..9f4e2a47f4aa 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -98,7 +98,6 @@ enum { NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ - NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ }; enum layoutdriver_policy_flags { @@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + const struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d) return d; } -static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); -} - -static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { - atomic_dec(&lo->plh_refcount); - /* wake up waiters for LAYOUTRETURN as that is not needed */ - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - } -} - -static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) return lseg; } +static inline bool +pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg) +{ + return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } +/** + * pnfs_mark_layout_returned_if_empty - marks the layout as returned + * @lo: layout header + * + * Note: Caller must hold inode->i_lock + */ +static inline void +pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) +{ + if (list_empty(&lo->plh_segs)) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); +} + +static inline void +pnfs_copy_range(struct pnfs_layout_range *dst, + const struct pnfs_layout_range *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 24655b807d44..81ac6480f9e7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } else { nfs_retry_commit(mds_pages, NULL, cinfo, 0); pnfs_generic_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); + if (nreq == 0) goto out; - } atomic_add(nreq, &cinfo->mds->rpcs_out); @@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { + if (!pnfs_is_valid_lseg(lseg)) { + spin_unlock(cinfo->lock); + cinfo->completion_ops->resched_write(cinfo, req); + return; + } /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 01b8cc8e8cfc..eb31e23e7def 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } + nfs_release_request(req); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - nfs_pageio_add_request(&pgio, new); + if (!nfs_pageio_add_request(&pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); + } nfs_pageio_complete(&pgio); /* It doesn't make sense to do mirrored reads! */ @@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, pgm = &pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - return 0; -} - -static void nfs_readpage_release(struct nfs_page *req) -{ - struct inode *inode = d_inode(req->wb_context->dentry); - - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); - - unlock_page(req->wb_page); - } - nfs_release_request(req); + return pgio.pg_error < 0 ? pgio.pg_error : 0; } static void nfs_page_group_set_uptodate(struct nfs_page *req) @@ -246,6 +249,13 @@ static void nfs_readpage_retry(struct rpc_task *task, nfs_set_pgio_error(hdr, -EIO, argp->offset); return; } + + /* For non rpc-based layout drivers, retry-through-MDS */ + if (!task->tk_ops) { + hdr->pnfs_error = -EAGAIN; + return; + } + /* Yes, so retry the read at the end of the hdr */ hdr->mds_offset += resp->count; argp->offset += resp->count; @@ -268,7 +278,7 @@ static void nfs_readpage_result(struct rpc_task *task, hdr->good_bytes = bound - hdr->io_start; } spin_unlock(&hdr->lock); - } else if (hdr->res.count != hdr->args.count) + } else if (hdr->res.count < hdr->args.count) nfs_readpage_retry(task, hdr); } @@ -354,6 +364,8 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); error = desc->pgio->pg_error; goto out_unlock; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 383a027de452..f1268280244e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2816,7 +2816,6 @@ out_invalid_transport_udp: * NFS client for backwards compatibility */ unsigned int nfs_callback_set_tcpport; -unsigned short nfs_callback_tcpport; /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600; /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ @@ -2827,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); -EXPORT_SYMBOL_GPL(nfs_callback_tcpport); EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index b6de433da5db..4fe3eead3868 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -42,21 +42,35 @@ error: return -EIO; } -static const char *nfs_follow_link(struct dentry *dentry, void **cookie) +static const char *nfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct page *page; void *err; - err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); - if (err) - return err; - page = read_cache_page(&inode->i_data, 0, - (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) - return ERR_CAST(page); - *cookie = page; - return kmap(page); + if (!dentry) { + err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); + if (err) + return err; + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + return err; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) + return ERR_CAST(page); + } + set_delayed_call(done, page_put_link, page); + return page_address(page); } /* @@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie) */ const struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = nfs_follow_link, - .put_link = page_put_link, + .get_link = nfs_get_link, .getattr = nfs_getattr, .setattr = nfs_setattr, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 75ab7622e0cc..ce43cd6d88c6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -21,6 +21,8 @@ #include <linux/nfs_page.h> #include <linux/backing-dev.h> #include <linux/export.h> +#include <linux/freezer.h> +#include <linux/wait.h> #include <asm/uaccess.h> @@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc) { int ret = 0; if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_STABLE; + return FLUSH_HIGHPRI | FLUSH_COND_STABLE; if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; - if (wbc->for_kupdate || wbc->for_background) - ret |= FLUSH_LOWPRI; return ret; } @@ -545,12 +545,22 @@ try_again: return head; } +static void nfs_write_error_remove_page(struct nfs_page *req) +{ + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); + generic_error_remove_page(page_file_mapping(req->wb_page), + req->wb_page); +} + /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page, bool nonblock, + bool launder) { struct nfs_page *req; int ret = 0; @@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = 0; if (!nfs_pageio_add_request(pgio, req)) { - nfs_redirty_request(req); ret = pgio->pg_error; + /* + * Remove the problematic req upon fatal errors + * in launder case, while other dirty pages can + * still be around until they get flushed. + */ + if (nfs_error_is_fatal(ret)) { + nfs_context_set_write_error(req->wb_context, ret); + if (launder) { + nfs_write_error_remove_page(req); + goto out; + } + } + nfs_redirty_request(req); + ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -576,12 +599,14 @@ out: return ret; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio, bool launder) { int ret; nfs_pageio_cond_complete(pgio, page_file_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, + launder); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st /* * Write an mmapped page to the server. */ -static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +static int nfs_writepage_locked(struct page *page, + struct writeback_control *wbc, + bool launder) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(page, wbc, false); unlock_page(page); return ret; } @@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(page, wbc, data, false); unlock_page(page); return ret; } @@ -1128,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || + !nfs_match_open_context(req->wb_context, ctx); /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; if (l_ctx && flctx && @@ -1326,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head) } } +static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + nfs_async_write_error(&hdr->pages); +} + static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .error_cleanup = nfs_async_write_error, .completion = nfs_write_completion, + .reschedule_io = nfs_async_write_reschedule_io, }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -1505,6 +1539,13 @@ static void nfs_writeback_result(struct rpc_task *task, task->tk_status = -EIO; return; } + + /* For non rpc-based layout drivers, retry-through-MDS */ + if (!task->tk_ops) { + hdr->pnfs_error = -EAGAIN; + return; + } + /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ @@ -1522,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task, } } - -static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - int ret; + return wait_on_atomic_t(&cinfo->rpcs_out, + nfs_wait_atomic_killable, TASK_KILLABLE); +} - if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) - return 1; - if (!may_wait) - return 0; - ret = out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - return (ret < 0) ? ret : 1; +static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +{ + atomic_inc(&cinfo->rpcs_out); } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { - clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_atomic(); - wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); + if (atomic_dec_and_test(&cinfo->rpcs_out)) + wake_up_atomic_t(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1659,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list, } EXPORT_SYMBOL_GPL(nfs_retry_commit); +static void +nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + /* * Commit dirty pages */ @@ -1680,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); out_bad: nfs_retry_commit(head, NULL, cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1742,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commit_end(cinfo.mds); } static void nfs_commit_release(void *calldata) @@ -1762,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, - .error_cleanup = nfs_commit_clear_lock, + .resched_write = nfs_commit_resched_write, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1781,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how) LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; + int error = 0; int res; - res = nfs_commit_set_lock(NFS_I(inode), may_wait); - if (res <= 0) - goto out_mark_dirty; nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_commit_begin(cinfo.mds); res = nfs_scan_commit(inode, &head, &cinfo); - if (res) { - int error; - + if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); - if (error < 0) - return error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_bit_action(&NFS_I(inode)->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - if (error < 0) - return error; - } else - nfs_commit_clear_lock(NFS_I(inode)); + nfs_commit_end(cinfo.mds); + if (error < 0) + goto out_error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_commit(cinfo.mds); + if (error < 0) + return error; return res; +out_error: + res = error; /* Note: If we exit without ensuring that the commit is complete, * we must mark the inode as dirty. Otherwise, future calls to * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure @@ -1814,6 +1849,7 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } +EXPORT_SYMBOL_GPL(nfs_commit_inode); int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -1904,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); @@ -1921,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + ret = nfs_writepage_locked(page, &wbc, launder); if (ret < 0) goto out_error; continue; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 77e7a5cca888..1a03bc3059e8 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -58,7 +58,7 @@ nlm_fclose(struct file *filp) fput(filp); } -static struct nlmsvc_binding nfsd_nlm_ops = { +static const struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ }; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d8b16c2568f3..5fbf3bbd00d0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -92,7 +92,7 @@ struct nfsd_net { struct file *rec_file; bool in_grace; - struct nfsd4_client_tracking_ops *client_tracking_ops; + const struct nfsd4_client_tracking_ops *client_tracking_ops; time_t nfsd4_lease; time_t nfsd4_grace; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f6e7cbabac5a..2246454dec76 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = fh_getattr(fhp, &fhp->fh_post_attr); fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; if (err) { - fhp->fh_post_saved = 0; + fhp->fh_post_saved = false; /* Grab the ctime anyway - set_change_info might use it */ fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; } else - fhp->fh_post_saved = 1; + fhp->fh_post_saved = true; } /* @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_one_len(name, dparent, namlen); + dchild = lookup_one_len_unlocked(name, dparent, namlen); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e7f50c4081d6..7389cb1d7409 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_DOWN; warn_no_callback_path(clp, reason); } static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_FAULT; warn_no_callback_path(clp, reason); } @@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work) } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) { cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ebf90e487c75..ce2d010d3b17 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -22,7 +22,7 @@ struct nfs4_layout { static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; -static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { @@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, INIT_LIST_HEAD(&ls->ls_perfile); spin_lock_init(&ls->ls_lock); INIT_LIST_HEAD(&ls->ls_layouts); + mutex_init(&ls->ls_mutex); ls->ls_layout_type = layout_type; nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); @@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, status = nfserr_jukebox; if (!ls) goto out; + mutex_lock(&ls->ls_mutex); } else { ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); status = nfserr_bad_stateid; + mutex_lock(&ls->ls_mutex); if (stateid->si_generation > stid->sc_stateid.si_generation) - goto out_put_stid; + goto out_unlock_stid; if (layout_type != ls->ls_layout_type) - goto out_put_stid; + goto out_unlock_stid; } *lsp = ls; return 0; +out_unlock_stid: + mutex_unlock(&ls->ls_mutex); out_put_stid: nfs4_put_stid(stid); out: @@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) trace_layout_recall(&ls->ls_stid.sc_stateid); atomic_inc(&ls->ls_stid.sc_count); - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); nfsd4_run_cb(&ls->ls_recall); out_unlock: @@ -406,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid); spin_unlock(&ls->ls_lock); out: spin_unlock(&fp->fi_lock); @@ -481,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, } } if (!list_empty(&ls->ls_layouts)) { - if (found) { - update_stateid(&ls->ls_stid.sc_stateid); - memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, - sizeof(stateid_t)); - } + if (found) + nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); lrp->lrs_present = 1; } else { trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); @@ -494,6 +493,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, } spin_unlock(&ls->ls_lock); + mutex_unlock(&ls->ls_mutex); nfs4_put_stid(&ls->ls_stid); nfsd4_free_layouts(&reaplist); return nfs_ok; @@ -608,29 +608,55 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) } } +static void +nfsd4_cb_layout_prepare(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + + mutex_lock(&ls->ls_mutex); + nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid); + mutex_unlock(&ls->ls_mutex); +} + static int nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfsd_net *nn; + ktime_t now, cutoff; LIST_HEAD(reaplist); + switch (task->tk_status) { case 0: - return 1; + case -NFS4ERR_DELAY: + /* + * Anything left? If not, then call it done. Note that we don't + * take the spinlock since this is an optimization and nothing + * should get added until the cb counter goes to zero. + */ + if (list_empty(&ls->ls_layouts)) + return 1; + + /* Poll the client until it's done with the layout */ + now = ktime_get(); + nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id); + + /* Client gets 2 lease periods to return it */ + cutoff = ktime_add_ns(task->tk_start, + nn->nfsd4_lease * NSEC_PER_SEC * 2); + + if (ktime_before(now, cutoff)) { + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + } + /* Fallthrough */ case -NFS4ERR_NOMATCHING_LAYOUT: trace_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; - case -NFS4ERR_DELAY: - /* Poll the client until it's done with the layout */ - /* FIXME: cap number of retries. - * The pnfs standard states that we need to only expire - * the client after at-least "lease time" .eg lease-time * 2 - * when failing to communicate a recall - */ - rpc_delay(task, HZ/100); /* 10 mili-seconds */ - return 0; default: /* * Unknown error or non-responding client, we'll need to fence. @@ -654,7 +680,8 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) nfs4_put_stid(&ls->ls_stid); } -static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .prepare = nfsd4_cb_layout_prepare, .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4ce6b97b31ad..819ad812c71b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -774,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ - status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid, - RD_STATE, &read->rd_filp, &read->rd_tmp_file); + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &read->rd_stateid, RD_STATE, + &read->rd_filp, &read->rd_tmp_file); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -921,7 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, - &setattr->sa_stateid, WR_STATE, NULL, NULL); + &cstate->current_fh, &setattr->sa_stateid, + WR_STATE, NULL, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -985,8 +987,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE, - &filp, NULL); + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + stateid, WR_STATE, &filp, NULL); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1010,13 +1012,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, + &clone->cl_src_stateid, RD_STATE, + &src, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + goto out; + } + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &clone->cl_dst_stateid, WR_STATE, + &dst, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + goto out_put_src; + } + + /* fix up for NFS-specific error code */ + if (!S_ISREG(file_inode(src)->i_mode) || + !S_ISREG(file_inode(dst)->i_mode)) { + status = nfserr_wrong_type; + goto out_put_dst; + } + + status = nfsd4_clone_file_range(src, clone->cl_src_pos, + dst, clone->cl_dst_pos, clone->cl_count); + +out_put_dst: + fput(dst); +out_put_src: + fput(src); +out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status = nfserr_notsupp; struct file *file; - status = nfs4_preprocess_stateid_op(rqstp, cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, WR_STATE, &file, NULL); if (status != nfs_ok) { @@ -1055,7 +1098,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct file *file; - status = nfs4_preprocess_stateid_op(rqstp, cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &file, NULL); if (status) { @@ -1309,6 +1352,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, nfserr = nfsd4_insert_layout(lgp, ls); out_put_stid: + mutex_unlock(&ls->ls_mutex); nfs4_put_stid(&ls->ls_stid); out: return nfserr; @@ -1362,6 +1406,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; } + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); + if (new_size > i_size_read(inode)) { lcp->lc_size_chg = 1; lcp->lc_newsize = new_size; @@ -2275,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_DEALLOCATE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_CLONE] = { + .op_func = (nfsd4op_func)nfsd4_clone, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_CLONE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e3d47091b191..79f0307a5ec8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) return -ENOENT; } -static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, @@ -1050,7 +1050,7 @@ out_err: printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } -static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, @@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) kfree(legacy); } -static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .init = nfsd4_umh_cltrack_init, .exit = NULL, .create = nfsd4_umh_cltrack_create, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0f1d5691b795..c484a2b6cd10 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); -static struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ atomic_set(&stid->sc_count, 1); + spin_lock_init(&stid->sc_lock); /* * It shouldn't be a problem to reuse an opaque stateid value. @@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s) put_nfs4_file(fp); } +void +nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) +{ + stateid_t *src = &stid->sc_stateid; + + spin_lock(&stid->sc_lock); + if (unlikely(++src->si_generation == 0)) + src->si_generation = 1; + memcpy(dst, src, sizeof(*dst)); + spin_unlock(&stid->sc_lock); +} + static void nfs4_put_deleg_lease(struct nfs4_file *fp) { struct file *filp = NULL; @@ -765,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s) s->sc_type = 0; } -static void +/** + * nfs4_get_existing_delegation - Discover if this delegation already exists + * @clp: a pointer to the nfs4_client we're granting a delegation to + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: NULL if an existing delegation was not found. + * + * On error: -EAGAIN if one was previously granted to this nfs4_client + * for this nfs4_file. + * + */ + +static int +nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_delegation *searchdp = NULL; + struct nfs4_client *searchclp = NULL; + + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { + searchclp = searchdp->dl_stid.sc_client; + if (clp == searchclp) { + return -EAGAIN; + } + } + return 0; +} + +/** + * hash_delegation_locked - Add a delegation to the appropriate lists + * @dp: a pointer to the nfs4_delegation we are adding. + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: NULL if the delegation was successfully hashed. + * + * On error: -EAGAIN if one was previously granted to this + * nfs4_client for this nfs4_file. Delegation is not hashed. + * + */ + +static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { + int status; + struct nfs4_client *clp = dp->dl_stid.sc_client; + lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + status = nfs4_get_existing_delegation(clp, fp); + if (status) + return status; + ++fp->fi_delegees; atomic_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); + return 0; } static bool @@ -1792,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -static int copy_cred(struct svc_cred *target, struct svc_cred *source) +int strdup_if_nonnull(char **target, char *source) { - if (source->cr_principal) { - target->cr_principal = - kstrdup(source->cr_principal, GFP_KERNEL); - if (target->cr_principal == NULL) + if (source) { + *target = kstrdup(source, GFP_KERNEL); + if (!*target) return -ENOMEM; } else - target->cr_principal = NULL; + *target = NULL; + return 0; +} + +static int copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + int ret; + + ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); + if (ret) + return ret; + ret = strdup_if_nonnull(&target->cr_raw_principal, + source->cr_raw_principal); + if (ret) + return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; @@ -1904,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) return false; if (!svc_rqst_integrity_protected(rqstp)) return false; + if (cl->cl_cred.cr_raw_principal) + return 0 == strcmp(cl->cl_cred.cr_raw_principal, + cr->cr_raw_principal); if (!cr->cr_principal) return false; return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); @@ -2175,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) - WARN("%s: sessions DRC could not cache compound\n", __func__); + WARN(1, "%s: sessions DRC could not cache compound\n", + __func__); return; } @@ -2256,15 +2338,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) clid->flags = new->cl_exchange_flags; } +static bool client_has_openowners(struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) { + if (!list_empty(&oo->oo_owner.so_stateids)) + return true; + } + return false; +} + static bool client_has_state(struct nfs4_client *clp) { - /* - * Note clp->cl_openowners check isn't quite right: there's no - * need to count owners without stateid's. - * - * Also note we should probably be using this in 4.0 case too. - */ - return !list_empty(&clp->cl_openowners) + return client_has_openowners(clp) #ifdef CONFIG_NFSD_PNFS || !list_empty(&clp->cl_lo_states) #endif @@ -2295,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + switch (exid->spa_how) { case SP4_MACH_CRED: - if (!svc_rqst_integrity_protected(rqstp)) - return nfserr_inval; + if (!svc_rqst_integrity_protected(rqstp)) { + status = nfserr_inval; + goto out_nolock; + } + /* + * Sometimes userspace doesn't give us a principal. + * Which is a bug, really. Anyway, we can't enforce + * MACH_CRED in that case, better to give up now: + */ + if (!new->cl_cred.cr_principal && + !new->cl_cred.cr_raw_principal) { + status = nfserr_serverfault; + goto out_nolock; + } + new->cl_mach_cred = true; case SP4_NONE: break; default: /* checked by xdr code */ @@ -2307,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, return nfserr_encr_alg_unsupp; } - new = create_client(exid->clname, rqstp, &verf); - if (new == NULL) - return nfserr_jukebox; - /* Cases below refer to rfc 5661 section 18.35.4: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); @@ -2372,7 +2472,6 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; - new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); gen_clid(new, nn); add_to_unconfirmed(new); @@ -2390,6 +2489,7 @@ out_copy: out: spin_unlock(&nn->client_lock); +out_nolock: if (new) expire_client(new); if (unconf) @@ -3049,7 +3149,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Cases below refer to rfc 3530 section 14.2.33: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); - if (conf) { + if (conf && client_has_state(conf)) { /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) @@ -3136,6 +3236,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } else { /* case 3: normal case; new or rebooted client */ old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { + status = nfserr_clid_inuse; + if (client_has_state(old) + && !same_creds(&unconf->cl_cred, + &old->cl_cred)) + goto out; status = mark_client_expired_locked(old); if (status) { old = NULL; @@ -3317,6 +3422,27 @@ static const struct nfs4_stateowner_operations openowner_ops = { .so_free = nfs4_free_openowner, }; +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *local, *ret = NULL; + struct nfs4_openowner *oo = open->op_openowner; + + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + if (local->st_stateowner == &oo->oo_owner) { + ret = local; + atomic_inc(&ret->st_stid.sc_count); + break; + } + } + return ret; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3348,9 +3474,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, return ret; } -static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { +static struct nfs4_ol_stateid * +init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, + struct nfsd4_open *open) +{ + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_ol_stateid *retstp = NULL; + spin_lock(&oo->oo_owner.so_client->cl_lock); + spin_lock(&fp->fi_lock); + + retstp = nfsd4_find_existing_open(fp, open); + if (retstp) + goto out_unlock; atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); @@ -3360,12 +3497,14 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; - spin_lock(&oo->oo_owner.so_client->cl_lock); + init_rwsem(&stp->st_rwsem); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); + +out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); + return retstp; } /* @@ -3539,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) nfs4_put_stid(&dp->dl_stid); } -static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, @@ -3776,27 +3915,6 @@ out: return nfs_ok; } -static struct nfs4_ol_stateid * -nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) -{ - struct nfs4_ol_stateid *local, *ret = NULL; - struct nfs4_openowner *oo = open->op_openowner; - - spin_lock(&fp->fi_lock); - list_for_each_entry(local, &fp->fi_stateids, st_perfile) { - /* ignore lock owners */ - if (local->st_stateowner->so_is_open_owner == 0) - continue; - if (local->st_stateowner == &oo->oo_owner) { - ret = local; - atomic_inc(&ret->st_stid.sc_count); - break; - } - } - spin_unlock(&fp->fi_lock); - return ret; -} - static inline int nfs4_access_to_access(u32 nfs4_access) { int flags = 0; @@ -3945,6 +4063,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) return fl; } +/** + * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer + * @dp: a pointer to the nfs4_delegation we're adding. + * + * Return: + * On success: Return code will be 0 on success. + * + * On error: -EAGAIN if there was an existing delegation. + * nonzero if there is an error in other cases. + * + */ + static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -3976,16 +4106,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp) goto out_unlock; /* Race breaker */ if (fp->fi_deleg_file) { - status = 0; - ++fp->fi_delegees; - hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); goto out_unlock; } fp->fi_deleg_file = filp; - fp->fi_delegees = 1; - hash_delegation_locked(dp, fp); + fp->fi_delegees = 0; + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); + if (status) { + /* Should never happen, this is a new fi_deleg_file */ + WARN_ON_ONCE(1); + goto out_fput; + } return 0; out_unlock: spin_unlock(&fp->fi_lock); @@ -4005,6 +4138,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + status = nfs4_get_existing_delegation(clp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) + return ERR_PTR(status); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -4023,9 +4165,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, status = -EAGAIN; goto out_unlock; } - ++fp->fi_delegees; - hash_delegation_locked(dp, fp); - status = 0; + status = hash_delegation_locked(dp, fp); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); @@ -4160,6 +4300,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; + struct nfs4_ol_stateid *swapstp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -4173,7 +4314,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + spin_lock(&fp->fi_lock); stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4187,15 +4330,32 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ + down_read(&stp->st_rwsem); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); - if (status) + if (status) { + up_read(&stp->st_rwsem); goto out; + } } else { stp = open->op_stp; open->op_stp = NULL; - init_open_stateid(stp, fp, open); + swapstp = init_open_stateid(stp, fp, open); + if (swapstp) { + nfs4_put_stid(&stp->st_stid); + stp = swapstp; + down_read(&stp->st_rwsem); + status = nfs4_upgrade_open(rqstp, fp, current_fh, + stp, open); + if (status) { + up_read(&stp->st_rwsem); + goto out; + } + goto upgrade_out; + } + down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { + up_read(&stp->st_rwsem); release_open_stateid(stp); goto out; } @@ -4205,8 +4365,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); +upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); + up_read(&stp->st_rwsem); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { @@ -4410,8 +4571,7 @@ static void laundromat_main(struct work_struct *laundry) { time_t t; - struct delayed_work *dwork = container_of(laundry, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); @@ -4666,10 +4826,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, - struct nfsd4_compound_state *cstate, stateid_t *stateid, - int flags, struct file **filpp, bool *tmp_file) + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) { - struct svc_fh *fhp = &cstate->current_fh; struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -4819,10 +4978,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * revoked delegations are kept only for free_stateid. */ return nfserr_bad_stateid; + down_write(&stp->st_rwsem); status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); - if (status) - return status; - return nfs4_check_fh(current_fh, &stp->st_stid); + if (status == nfs_ok) + status = nfs4_check_fh(current_fh, &stp->st_stid); + if (status != nfs_ok) + up_write(&stp->st_rwsem); + return status; } /* @@ -4869,6 +5031,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } @@ -4899,11 +5062,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; - if (oo->oo_flags & NFS4_OO_CONFIRMED) + if (oo->oo_flags & NFS4_OO_CONFIRMED) { + up_write(&stp->st_rwsem); goto put_stateid; + } oo->oo_flags |= NFS4_OO_CONFIRMED; - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); + up_write(&stp->st_rwsem); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -4975,13 +5140,11 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, goto put_stateid; } nfs4_stateid_downgrade(stp, od->od_share_access); - reset_union_bmap_deny(od->od_share_deny, stp); - - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5033,8 +5196,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); + up_write(&stp->st_rwsem); nfsd4_close_open_stateid(stp); @@ -5260,6 +5423,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; + init_rwsem(&stp->st_rwsem); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -5428,6 +5592,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &open_stp, nn); if (status) goto out; + up_write(&open_stp->st_rwsem); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, @@ -5435,6 +5600,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); + if (status == nfs_ok) + down_write(&lock_stp->st_rwsem); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5512,9 +5679,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); switch (-err) { case 0: /* success! */ - update_stateid(&lock_stp->st_stid.sc_stateid); - memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, - sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; case (EAGAIN): /* conflock holds conflicting lock */ @@ -5540,6 +5705,8 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; + up_write(&lock_stp->st_rwsem); + /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. @@ -5704,11 +5871,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } - update_stateid(&stp->st_stid.sc_stateid); - memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); fput: fput(filp); put_stateid: + up_write(&stp->st_rwsem); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51c9e9ca39a4..d6ef0955a979 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1675,6 +1675,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8); + p = xdr_decode_hyper(p, &clone->cl_src_pos); + p = xdr_decode_hyper(p, &clone->cl_dst_pos); + p = xdr_decode_hyper(p, &clone->cl_count); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, }; static inline bool @@ -2838,14 +2858,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); if (d_really_is_negative(dentry)) { /* - * nfsd_buffered_readdir drops the i_mutex between - * readdir and calling this callback, leaving a window - * where this directory entry could have gone away. + * we're not holding the i_mutex here, so there's + * a window where this directory entry could have gone + * away. */ dput(dentry); return nfserr_noent; @@ -4292,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, }; /* diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 46ec934f5dee..54cde9a5864e 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -63,7 +63,6 @@ static unsigned int longest_chain; static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); -static void cache_cleaner_func(struct work_struct *unused); static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, @@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = { }; /* - * locking for the reply cache: - * A cache entry is "single use" if c_state == RC_INPROG - * Otherwise, it when accessing _prev or _next, the lock must be held. - */ -static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); - -/* * Put a cap on the size of the DRC based on the amount of available * low memory in the machine. * @@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void) unsigned int i; unregister_shrinker(&nfsd_reply_cache_shrinker); - cancel_delayed_work_sync(&cache_cleaner); for (i = 0; i < drc_hashsize; i++) { struct list_head *head = &drc_hashtbl[i].lru_head; @@ -217,10 +208,8 @@ void nfsd_reply_cache_shutdown(void) drc_hashtbl = NULL; drc_hashsize = 0; - if (drc_slab) { - kmem_cache_destroy(drc_slab); - drc_slab = NULL; - } + kmem_cache_destroy(drc_slab); + drc_slab = NULL; } /* @@ -232,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); - schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } static long @@ -266,7 +254,6 @@ prune_cache_entries(void) { unsigned int i; long freed = 0; - bool cancel = true; for (i = 0; i < drc_hashsize; i++) { struct nfsd_drc_bucket *b = &drc_hashtbl[i]; @@ -275,26 +262,11 @@ prune_cache_entries(void) continue; spin_lock(&b->cache_lock); freed += prune_bucket(b); - if (!list_empty(&b->lru_head)) - cancel = false; spin_unlock(&b->cache_lock); } - - /* - * Conditionally rearm the job to run in RC_EXPIRE since we just - * ran the pruner. - */ - if (!cancel) - mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); return freed; } -static void -cache_cleaner_func(struct work_struct *unused) -{ - prune_cache_entries(); -} - static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 350041a40fe5..c1681ce894c5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp) fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); -#ifdef CONFIG_NFSD_V3 - fhp->fh_pre_saved = 0; - fhp->fh_post_saved = 0; -#endif + fh_clear_wcc(fhp); } fh_drop_write(fhp); if (exp) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 1e90dad4926b..0770bcb543c8 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NFSD_NFSFH_H #define _LINUX_NFSD_NFSFH_H +#include <linux/crc32.h> #include <linux/sunrpc/svc.h> #include <uapi/linux/nfsd/nfsfh.h> @@ -26,16 +27,16 @@ static inline ino_t u32_to_ino_t(__u32 uino) */ typedef struct svc_fh { struct knfsd_fh fh_handle; /* FH data */ + int fh_maxsize; /* max size for fh_handle */ struct dentry * fh_dentry; /* validated dentry */ struct svc_export * fh_export; /* export pointer */ - int fh_maxsize; /* max size for fh_handle */ - unsigned char fh_locked; /* inode locked by us */ - unsigned char fh_want_write; /* remount protection taken */ + bool fh_locked; /* inode locked by us */ + bool fh_want_write; /* remount protection taken */ #ifdef CONFIG_NFSD_V3 - unsigned char fh_post_saved; /* post-op attrs saved */ - unsigned char fh_pre_saved; /* pre-op attrs saved */ + bool fh_post_saved; /* post-op attrs saved */ + bool fh_pre_saved; /* pre-op attrs saved */ /* Pre-op attributes saved during fh_lock */ __u64 fh_pre_size; /* size before operation */ @@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } +#ifdef CONFIG_CRC32 +/** + * knfsd_fh_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ + +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); +} +#else +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return 0; +} +#endif + #ifdef CONFIG_NFSD_V3 /* * The wcc data stored in current_fh should be cleared @@ -213,8 +236,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) static inline void fh_clear_wcc(struct svc_fh *fhp) { - fhp->fh_post_saved = 0; - fhp->fh_pre_saved = 0; + fhp->fh_post_saved = false; + fhp->fh_pre_saved = false; } /* @@ -231,7 +254,7 @@ fill_pre_wcc(struct svc_fh *fhp) fhp->fh_pre_ctime = inode->i_ctime; fhp->fh_pre_size = inode->i_size; fhp->fh_pre_change = inode->i_version; - fhp->fh_pre_saved = 1; + fhp->fh_pre_saved = true; } } @@ -267,7 +290,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) inode = d_inode(dentry); mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); - fhp->fh_locked = 1; + fhp->fh_locked = true; } static inline void @@ -285,7 +308,7 @@ fh_unlock(struct svc_fh *fhp) if (fhp->fh_locked) { fill_post_wcc(fhp); mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); - fhp->fh_locked = 0; + fhp->fh_locked = false; } } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ad4e2377dd63..45007acaf364 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -14,9 +14,13 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" @@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inetaddr_notifier = { + .notifier_call = nfsd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int nfsd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct net_device *dev = ifa->idev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inet6addr_notifier = { + .notifier_call = nfsd_inet6addr_event, +}; +#endif + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_last_thread will be run before any of this - * other initialization has been done. + * other initialization has been done except the rpcb information. */ + svc_rpcb_cleanup(serv, net); if (!nn->nfsd_net_up) return; - nfsd_shutdown_net(net); - - svc_rpcb_cleanup(serv, net); + nfsd_shutdown_net(net); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(net); @@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); + register_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 583ffc13cae2..c050c53036a6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -65,7 +65,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; u32 cb_minorversion; struct rpc_message cb_msg; - struct nfsd4_callback_ops *cb_ops; + const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; @@ -84,7 +84,7 @@ struct nfsd4_callback_ops { * fields that are of general use to any stateid. */ struct nfs4_stid { - atomic_t sc_count; + atomic_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -94,11 +94,12 @@ struct nfs4_stid { #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 #define NFS4_LAYOUT_STID 64 - unsigned char sc_type; - stateid_t sc_stateid; - struct nfs4_client *sc_client; - struct nfs4_file *sc_file; - void (*sc_free)(struct nfs4_stid *); + unsigned char sc_type; + stateid_t sc_stateid; + spinlock_t sc_lock; + struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); }; /* @@ -364,15 +365,6 @@ struct nfs4_client_reclaim { char cr_recdir[HEXDIR_LEN]; /* recover dir */ }; -static inline void -update_stateid(stateid_t *stateid) -{ - stateid->si_generation++; - /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ - if (stateid->si_generation == 0) - stateid->si_generation = 1; -} - /* A reasonable value for REPLAY_ISIZE was estimated as follows: * The OPEN response, typically the largest, requires * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + @@ -534,15 +526,16 @@ struct nfs4_file { * Better suggestions welcome. */ struct nfs4_ol_stateid { - struct nfs4_stid st_stid; /* must be first field */ - struct list_head st_perfile; - struct list_head st_perstateowner; - struct list_head st_locks; - struct nfs4_stateowner * st_stateowner; - struct nfs4_clnt_odstate * st_clnt_odstate; - unsigned char st_access_bmap; - unsigned char st_deny_bmap; - struct nfs4_ol_stateid * st_openstp; + struct nfs4_stid st_stid; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_locks; + struct nfs4_stateowner *st_stateowner; + struct nfs4_clnt_odstate *st_clnt_odstate; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; + struct nfs4_ol_stateid *st_openstp; + struct rw_semaphore st_rwsem; }; static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) @@ -561,6 +554,7 @@ struct nfs4_layout_stateid { struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; + struct mutex ls_mutex; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) @@ -584,8 +578,8 @@ struct nfsd4_compound_state; struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, - struct nfsd4_compound_state *cstate, stateid_t *stateid, - int flags, struct file **filp, bool *tmp_file); + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); @@ -593,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); +void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, @@ -604,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c index 82f89070594c..90967466a1e5 100644 --- a/fs/nfsd/trace.c +++ b/fs/nfsd/trace.c @@ -1,5 +1,3 @@ -#include "state.h" - #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c668520c344b..3287041905da 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -8,6 +8,49 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include "nfsfh.h" + +DECLARE_EVENT_CLASS(nfsd_io_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int len), + TP_ARGS(rqstp, fhp, offset, len), + TP_STRUCT__entry( + __field(__be32, xid) + __field_struct(struct knfsd_fh, fh) + __field(loff_t, offset) + __field(int, len) + ), + TP_fast_assign( + __entry->xid = rqstp->rq_xid, + fh_copy_shallow(&__entry->fh, &fhp->fh_handle); + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d", + __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh), + __entry->offset, __entry->len) +) + +#define DEFINE_NFSD_IO_EVENT(name) \ +DEFINE_EVENT(nfsd_io_class, name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_IO_EVENT(read_start); +DEFINE_NFSD_IO_EVENT(read_opened); +DEFINE_NFSD_IO_EVENT(read_io_done); +DEFINE_NFSD_IO_EVENT(read_done); +DEFINE_NFSD_IO_EVENT(write_start); +DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_io_done); +DEFINE_NFSD_IO_EVENT(write_done); + +#include "state.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 45c04979e7b3..6739077f17fe 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -36,12 +36,14 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 +#include "../internal.h" #include "acl.h" #include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; - /* - * check if we have crossed a mount point ... - */ if (nfsd_mountpoint(dentry, exp)) { + /* + * We don't need the i_mutex after all. It's + * still possible we could open this (regular + * files can be mountpoints too), but the + * i_mutex is just there to prevent renames of + * something that we might be about to delegate, + * and a mountpoint won't be renamed: + */ + fh_unlock(fhp); if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { dput(dentry); goto out_nfserr; @@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) @@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; + trace_read_start(rqstp, fhp, offset, vlen); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; ra = nfsd_init_raparms(file); + + trace_read_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); + if (ra) nfsd_put_raparams(file, ra); fput(file); + trace_read_done(rqstp, fhp, offset, vlen); + return err; } @@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, { __be32 err = 0; + trace_write_start(rqstp, fhp, offset, vlen); + if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); } else { err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); if (cnt) err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); fput(file); } out: + trace_write_done(rqstp, fhp, offset, vlen); return err; } @@ -1631,7 +1660,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, /* cannot use fh_lock as we need deadlock protective ordering * so do it by hand */ trap = lock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = 1; + ffhp->fh_locked = tfhp->fh_locked = true; fill_pre_wcc(ffhp); fill_pre_wcc(tfhp); @@ -1681,7 +1710,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, fill_post_wcc(ffhp); fill_post_wcc(tfhp); unlock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = 0; + ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); out: @@ -1809,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, offset = *offsetp; while (1) { - struct inode *dir_inode = file_inode(file); unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -1828,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, if (!size) break; - /* - * Various filldir functions may end up calling back into - * lookup_one_len() and the file system's ->lookup() method. - * These expect i_mutex to be held, as it would within readdir. - */ - host_err = mutex_lock_killable(&dir_inode->i_mutex); - if (host_err) - break; - de = (struct buffered_dirent *)buf.dirent; while (size > 0) { offset = de->offset; @@ -1853,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, size -= reclen; de = (struct buffered_dirent *)((char *)de + reclen); } - mutex_unlock(&dir_inode->i_mutex); if (size > 0) /* We bailed out early */ break; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fee2451ae248..c11ba316f23f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); +__be32 nfsd4_clone_file_range(struct file *, u64, struct file *, + u64, u64); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, @@ -112,14 +114,14 @@ static inline int fh_want_write(struct svc_fh *fh) int ret = mnt_want_write(fh->fh_export->ex_path.mnt); if (!ret) - fh->fh_want_write = 1; + fh->fh_want_write = true; return ret; } static inline void fh_drop_write(struct svc_fh *fh) { if (fh->fh_want_write) { - fh->fh_want_write = 0; + fh->fh_want_write = false; mnt_drop_write(fh->fh_export->ex_path.mnt); } } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 9f991007a578..d9554813e58a 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -491,6 +491,15 @@ struct nfsd4_fallocate { u64 falloc_length; }; +struct nfsd4_clone { + /* request */ + stateid_t cl_src_stateid; + stateid_t cl_dst_stateid; + u64 cl_src_pos; + u64 cl_dst_pos; + u64 cl_count; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -555,6 +564,7 @@ struct nfsd4_op { /* NFSv4.2 */ struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; + struct nfsd4_clone clone; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; @@ -632,7 +642,7 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = fhp->fh_post_saved; + cinfo->atomic = (u32)fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); cinfo->before_change = fhp->fh_pre_change; diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 8df0f3b7839b..2ccbf5531554 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc */ static unsigned long -nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, - const struct nilfs_palloc_group_desc *desc) +nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, + spinlock_t *lock) { unsigned long nfree; - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); - spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries - * @inode: inode of metadata file using this allocator - * @group: group number * @desc: pointer to descriptor structure for the group + * @lock: spin lock protecting @desc * @n: delta to be added */ -static void -nilfs_palloc_group_desc_add_entries(struct inode *inode, - unsigned long group, - struct nilfs_palloc_group_desc *desc, - u32 n) +static u32 +nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, + spinlock_t *lock, u32 n) { - spin_lock(nilfs_mdt_bgl_lock(inode, group)); + u32 nfree; + + spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); - spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(lock); + return nfree; } /** @@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, } /** + * nilfs_palloc_delete_block - delete a block on the persistent allocator file + * @inode: inode of metadata file using this allocator + * @blkoff: block offset + * @prev: nilfs_bh_assoc struct of the last used buffer + * @lock: spin lock protecting @prev + */ +static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + brelse(prev->bh); + prev->bh = NULL; + } + spin_unlock(lock); + return nilfs_mdt_delete_block(inode, blkoff); +} + +/** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number @@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, } /** + * nilfs_palloc_delete_bitmap_block - delete a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + */ +static int nilfs_palloc_delete_bitmap_block(struct inode *inode, + unsigned long group) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_bitmap_blkoff(inode, + group), + &cache->prev_bitmap, &cache->lock); +} + +/** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) @@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, } /** + * nilfs_palloc_delete_entry_block - delete an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry + */ +static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + &cache->prev_entry, &cache->lock); +} + +/** * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor * @inode: inode of metadata file using this allocator * @group: group number @@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, /** * nilfs_palloc_find_available_slot - find available slot in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @target: offset number of an entry in the group (start point) * @bitmap: bitmap of the group + * @target: offset number of an entry in the group (start point) * @bsize: size in bits + * @lock: spin lock protecting @bitmap */ -static int nilfs_palloc_find_available_slot(struct inode *inode, - unsigned long group, +static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned char *bitmap, - int bsize) -{ - int curr, pos, end, i; - - if (target > 0) { - end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); - if (end > bsize) - end = bsize; - pos = nilfs_find_next_zero_bit(bitmap, end, target); - if (pos < end && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) - return pos; - } else - end = 0; - - for (i = 0, curr = end; - i < bsize; - i += BITS_PER_LONG, curr += BITS_PER_LONG) { - /* wrap around */ - if (curr >= bsize) - curr = 0; - while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) - != ~0UL) { - end = curr + BITS_PER_LONG; - if (end > bsize) - end = bsize; - pos = nilfs_find_next_zero_bit(bitmap, end, curr); - if ((pos < end) && - !nilfs_set_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), pos, - bitmap)) + unsigned bsize, + spinlock_t *lock) +{ + int pos, end = bsize; + + if (likely(target < bsize)) { + pos = target; + do { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; - } + } while (++pos < end); + + end = target; + } + + /* wrap around */ + for (pos = 0; pos < end; pos++) { + pos = nilfs_find_next_zero_bit(bitmap, end, pos); + if (pos >= end) + break; + if (!nilfs_set_bit_atomic(lock, pos, bitmap)) + return pos; } + return -ENOSPC; } @@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; - unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long n, entries_per_group; unsigned long i, j; + spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); - groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups) { @@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, desc++, group++) { - if (nilfs_palloc_group_desc_nfrees(inode, group, desc) - > 0) { + lock = nilfs_mdt_bgl_lock(inode, group); + if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { ret = nilfs_palloc_get_bitmap_block( inode, group, 1, &bitmap_bh); if (ret < 0) @@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - inode, group, group_offset, bitmap, - entries_per_group); + bitmap, group_offset, + entries_per_group, lock); if (pos >= 0) { /* found a free entry */ nilfs_palloc_group_desc_add_entries( - inode, group, desc, -1); + desc, lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap(desc_bh->b_page); @@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, unsigned long group, group_offset; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + lock = nilfs_mdt_bgl_lock(inode, group); - if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, void *desc_kaddr, *bitmap_kaddr; unsigned char *bitmap; unsigned long group, group_offset; + spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap(req->pr_desc_bh->b_page); @@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); - if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) - printk(KERN_WARNING "%s: entry number %llu already freed\n", - __func__, (unsigned long long)req->pr_entry_nr); + lock = nilfs_mdt_bgl_lock(inode, group); + + if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)req->pr_entry_nr, + (unsigned long)inode->i_ino); else - nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); @@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, } /** - * nilfs_palloc_group_is_in - judge if an entry is in a group - * @inode: inode of metadata file using this allocator - * @group: group number - * @nr: serial number of the entry (e.g. inode number) - */ -static int -nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) -{ - __u64 first, last; - - first = group * nilfs_palloc_entries_per_group(inode); - last = first + nilfs_palloc_entries_per_group(inode) - 1; - return (nr >= first) && (nr <= last); -} - -/** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated @@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; - int i, j, n, ret; + __u64 group_min_nr, last_nrs[8]; + const unsigned long epg = nilfs_palloc_entries_per_group(inode); + const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned entry_start, end, pos; + spinlock_t *lock; + int i, j, k, ret; + u32 nfree; for (i = 0; i < nitems; i = j) { + int change_group = false; + int nempties = 0, n = 0; + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) @@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) brelse(desc_bh); return ret; } - desc_kaddr = kmap(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + + /* Get the first entry number of the group */ + group_min_nr = (__u64)group * epg; + bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); - for (j = i, n = 0; - (j < nitems) && nilfs_palloc_group_is_in(inode, group, - entry_nrs[j]); - j++) { - nilfs_palloc_group(inode, entry_nrs[j], &group_offset); - if (!nilfs_clear_bit_atomic( - nilfs_mdt_bgl_lock(inode, group), - group_offset, bitmap)) { - printk(KERN_WARNING - "%s: entry number %llu already freed\n", - __func__, - (unsigned long long)entry_nrs[j]); + lock = nilfs_mdt_bgl_lock(inode, group); + + j = i; + entry_start = rounddown(group_offset, epb); + do { + if (!nilfs_clear_bit_atomic(lock, group_offset, + bitmap)) { + nilfs_warning(inode->i_sb, __func__, + "entry number %llu already freed: ino=%lu\n", + (unsigned long long)entry_nrs[j], + (unsigned long)inode->i_ino); } else { n++; } - } - nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + j++; + if (j >= nitems || entry_nrs[j] < group_min_nr || + entry_nrs[j] >= group_min_nr + epg) { + change_group = true; + } else { + group_offset = entry_nrs[j] - group_min_nr; + if (group_offset >= entry_start && + group_offset < entry_start + epb) { + /* This entry is in the same block */ + continue; + } + } + + /* Test if the entry block is empty or not */ + end = entry_start + epb; + pos = nilfs_find_next_bit(bitmap, end, entry_start); + if (pos >= end) { + last_nrs[nempties++] = entry_nrs[j - 1]; + if (nempties >= ARRAY_SIZE(last_nrs)) + break; + } + + if (change_group) + break; + + /* Go on to the next entry block */ + entry_start = rounddown(group_offset, epb); + } while (true); kunmap(bitmap_bh->b_page); - kunmap(desc_bh->b_page); + mark_buffer_dirty(bitmap_bh); + brelse(bitmap_bh); + for (k = 0; k < nempties; k++) { + ret = nilfs_palloc_delete_entry_block(inode, + last_nrs[k]); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete block of entry %llu: ino=%lu, err=%d\n", + (unsigned long long)last_nrs[k], + (unsigned long)inode->i_ino, ret); + } + } + + desc_kaddr = kmap_atomic(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); + kunmap_atomic(desc_kaddr); mark_buffer_dirty(desc_bh); - mark_buffer_dirty(bitmap_bh); nilfs_mdt_mark_dirty(inode); - - brelse(bitmap_bh); brelse(desc_bh); + + if (nfree == nilfs_palloc_entries_per_group(inode)) { + ret = nilfs_palloc_delete_bitmap_block(inode, group); + if (ret && ret != -ENOENT) { + nilfs_warning(inode->i_sb, __func__, + "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n", + group, + (unsigned long)inode->i_ino, ret); + } + } } return 0; } diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 4bd6451b5703..6e6f49aa53df 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_set_bit_atomic ext2_set_bit_atomic #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit find_next_zero_bit_le +#define nilfs_find_next_bit find_next_bit_le /** * struct nilfs_bh_assoc - block offset and buffer head association diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 919fd5bb14a8..3a3821b00486 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - __u64 newkey; - __u64 newptr; int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); @@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, if (!buffer_dirty(path[level].bp_sib_bh)) mark_buffer_dirty(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(right, 0); - newptr = path[level].bp_newreq.bpr_ptr; - if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(right, path[level].bp_index, @@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { - struct buffer_head *bh; + struct buffer_head *bh = NULL; union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; struct nilfs_bmap_stats stats; int ret; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0d5fada91191..7dc23f100e57 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - __u64 start; sector_t blocknr; void *kaddr; int ret; @@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) kaddr = kmap_atomic(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); - start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_atomic(kaddr); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 54575e3cc1a2..088ba001c6ef 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; file_update_time(vma->vm_file); - ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); goto out; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 4a73d6dffabf..10b22527a617 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) goto failed; mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); @@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; } else { inode->i_op = &nilfs_special_inode_operations; @@ -522,7 +523,7 @@ static int __nilfs_read_inode(struct super_block *sb, up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); return 0; failed_unmap: diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index dee34d990281..1125f40233ff 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -33,6 +33,7 @@ #include "page.h" #include "mdt.h" +#include <trace/events/nilfs2.h> #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) @@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_uptodate(bh); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); + + trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block); + return 0; } @@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, get_bh(bh); submit_bh(mode, bh); ret = 0; + + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); out: get_bh(bh); *out_bh = bh; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index fe529a87a208..03246cac3338 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) } /* Default GFP flags using highmem */ -#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) +#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 37dd6b05b1b5..7ccdb961eea9 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) struct nilfs_transaction_info ti; int err; - if (!new_valid_dev(rdev)) - return -EINVAL; - err = nilfs_transaction_begin(dir->i_sb, &ti, 1); if (err) return err; @@ -164,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) @@ -571,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = { const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .permission = nilfs_permission, }; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ff00a0b7acb9..9b4f205d1173 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; @@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; - struct nilfs_segment_summary *sum; + struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c6abbad9b8e3..3b65adaae7e4 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -77,6 +77,36 @@ enum { NILFS_ST_DONE, }; +#define CREATE_TRACE_POINTS +#include <trace/events/nilfs2.h> + +/* + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are + * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of + * the variable must use them because transition of stage count must involve + * trace events (trace_nilfs2_collection_stage_transition). + * + * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't + * produce tracepoint events. It is provided just for making the intention + * clear. + */ +static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci) +{ + sci->sc_stage.scnt++; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt) +{ + sci->sc_stage.scnt = next_scnt; + trace_nilfs2_collection_stage_transition(sci); +} + +static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci) +{ + return sci->sc_stage.scnt; +} + /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ @@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb, { struct the_nilfs *nilfs; int ret = nilfs_prepare_segment_lock(ti); + struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) return ret; - if (ret > 0) + if (ret > 0) { + trace_ti = current->journal_info; + + trace_nilfs2_transaction_transition(sb, trace_ti, + trace_ti->ti_count, trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; + } sb_start_intwrite(sb); @@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb, ret = -ENOSPC; goto failed; } + + trace_ti = current->journal_info; + trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count, + trace_ti->ti_flags, + TRACE_NILFS2_TRANSACTION_BEGIN); return 0; failed: @@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb) ti->ti_flags |= NILFS_TI_COMMIT; if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); return 0; } if (nilfs->ns_writer) { @@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb) nilfs_segctor_do_flush(sci, 0); } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT); + current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_SYNC) @@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb) BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); if (ti->ti_count > 0) { ti->ti_count--; + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); return; } up_read(&nilfs->ns_segctor_sem); + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT); + current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); @@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb, current->journal_info = ti; for (;;) { + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK); + down_write(&nilfs->ns_segctor_sem); if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) break; @@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb, } if (gcflag) ti->ti_flags |= NILFS_TI_GC; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK); } static void nilfs_transaction_unlock(struct super_block *sb) @@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb) up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; + + trace_nilfs2_transaction_transition(sb, ti, ti->ti_count, + ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, @@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) size_t ndone; int err = 0; - switch (sci->sc_stage.scnt) { + switch (nilfs_sc_cstage_get(sci)) { case NILFS_ST_INIT: /* Pre-processes */ sci->sc_stage.flags = 0; @@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_nblk_inc = 0; sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN; if (mode == SC_LSEG_DSYNC) { - sci->sc_stage.scnt = NILFS_ST_DSYNC; + nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC); goto dsync_mode; } } @@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) sci->sc_stage.dirty_file_ptr = NULL; sci->sc_stage.gc_inode_ptr = NULL; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DAT; + nilfs_sc_cstage_set(sci, NILFS_ST_DAT); goto dat_stage; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_GC: if (nilfs_doing_gc()) { head = &sci->sc_gc_inodes; @@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.gc_inode_ptr = NULL; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_FILE: head = &sci->sc_dirty_files; ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head, @@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } sci->sc_stage.dirty_file_ptr = NULL; if (mode == SC_FLUSH_FILE) { - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED; /* Fall through */ case NILFS_ST_IFILE: @@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; + nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ err = nilfs_segctor_create_checkpoint(sci); if (unlikely(err)) @@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SUFILE: err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, &ndone); @@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) &nilfs_sc_file_ops); if (unlikely(err)) break; - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_DAT: dat_stage: err = nilfs_segctor_scan_file(sci, nilfs->ns_dat, @@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; if (mode == SC_FLUSH_DAT) { - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; } - sci->sc_stage.scnt++; /* Fall through */ + nilfs_sc_cstage_inc(sci); /* Fall through */ case NILFS_ST_SR: if (mode == SC_LSEG_SR) { /* Appending a super root */ @@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) } /* End of a logical segment */ sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DSYNC: dsync_mode: @@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) break; sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND; - sci->sc_stage.scnt = NILFS_ST_DONE; + nilfs_sc_cstage_set(sci, NILFS_ST_DONE); return 0; case NILFS_ST_DONE: return 0; @@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, goto failed; /* The current segment is filled up */ - if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) + if (mode != SC_LSEG_SR || + nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE) break; nilfs_clear_logs(&sci->sc_segbufs); @@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; - sci->sc_stage.scnt = NILFS_ST_INIT; + nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; err = nilfs_segctor_collect_dirty_files(sci, nilfs); @@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto failed; /* Avoid empty segment */ - if (sci->sc_stage.scnt == NILFS_ST_DONE && + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE && nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; @@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_fill_in_file_bmap(sci); if (mode == SC_LSEG_SR && - sci->sc_stage.scnt >= NILFS_ST_CPFILE) { + nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) goto failed_to_write; @@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) goto failed_to_write; - if (sci->sc_stage.scnt == NILFS_ST_DONE || + if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE || nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { /* * At this point, we avoid double buffering @@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (err) goto failed_to_write; } - } while (sci->sc_stage.scnt != NILFS_ST_DONE); + } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE); out: nilfs_segctor_drop_written_files(sci, nilfs); @@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) { int mode = 0; - int err; spin_lock(&sci->sc_state_lock); mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? @@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) spin_unlock(&sci->sc_state_lock); if (mode) { - err = nilfs_segctor_do_construct(sci, mode); + nilfs_segctor_do_construct(sci, mode); spin_lock(&sci->sc_state_lock); sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index a48d6de1e02c..0408b9b2814b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -67,7 +67,8 @@ struct nilfs_recovery_info { /** * struct nilfs_cstage - Context of collection stage - * @scnt: Stage count + * @scnt: Stage count, must be accessed via wrappers: + * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() * @flags: State flags * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file * @gc_inode_ptr: Pointer on the list of gc-inodes diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 2a869c35c362..52821ffc11f4 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -30,6 +30,8 @@ #include "mdt.h" #include "sufile.h" +#include <trace/events/nilfs2.h> + /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file @@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; void *kaddr; - unsigned long nsegments, ncleansegs, nsus, cnt; + unsigned long nsegments, nsus, cnt; int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_sem; kaddr = kmap_atomic(header_bh->b_page); header = kaddr + bh_offset(header_bh); - ncleansegs = le64_to_cpu(header->sh_ncleansegs); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_atomic(kaddr); @@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) break; /* never happens */ } } + trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); if (ret < 0) @@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; + + trace_nilfs2_segment_usage_allocated(sufile, segnum); + goto out_header; } @@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); + + trace_nilfs2_segment_usage_freed(sufile, segnum); } /** diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f47585bfeb01..7f5d3d9f1c37 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) struct nilfs_super_block *nsbp; sector_t blocknr, newblocknr; unsigned long offset; - int sb2i = -1; /* array index of the secondary superblock */ + int sb2i; /* array index of the secondary superblock */ int ret = 0; /* nilfs->ns_sem must be locked by the caller. */ @@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 0; blocknr = nilfs->ns_sbh[0]->b_blocknr; + } else { + sb2i = -1; + blocknr = 0; } if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) goto out; /* super block location is unchanged */ @@ -1313,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s->s_root) { - char b[BDEVNAME_SIZE]; - - s_new = true; + s_new = true; /* New superblock instance created */ s->s_mode = mode; - strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); @@ -1405,21 +1406,18 @@ static void nilfs_destroy_cachep(void) */ rcu_barrier(); - if (nilfs_inode_cachep) - kmem_cache_destroy(nilfs_inode_cachep); - if (nilfs_transaction_cachep) - kmem_cache_destroy(nilfs_transaction_cachep); - if (nilfs_segbuf_cachep) - kmem_cache_destroy(nilfs_segbuf_cachep); - if (nilfs_btree_path_cache) - kmem_cache_destroy(nilfs_btree_path_cache); + kmem_cache_destroy(nilfs_inode_cachep); + kmem_cache_destroy(nilfs_transaction_cachep); + kmem_cache_destroy(nilfs_segbuf_cachep); + kmem_cache_destroy(nilfs_btree_path_cache); } static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6b6f0d472ae8..fd98e5100cab 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(mark->inode); if (inode) { + /* + * IN_ALL_EVENTS represents all of the mask bits + * that we expose to userspace. There is at + * least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ + u32 mask = mark->mask & IN_ALL_EVENTS; seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); + mask, mark->ignored_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e785fd954c30..741077deef3b 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) break; } spin_unlock(&next_i->i_lock); - next_i = list_entry(next_i->i_sb_list.next, - struct inode, i_sb_list); + next_i = list_next_entry(next_i, i_sb_list); } /* diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5b1e2a497e51..b8d08d0d0a4d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; - /* don't allow invalid bits: we don't want flags set */ + /* + * We share a lot of code with fs/dnotify. We also share + * the bit layout between inotify's IN_* and the fsnotify + * FS_*. This check ensures that only the inotify IN_* + * bits get passed in and set in watches/events. + */ + if (unlikely(mask & ~ALL_INOTIFY_BITS)) + return -EINVAL; + /* + * Require at least one valid bit set in the mask. + * Without _something_ set, we would have no events to + * watch for. + */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc0df4442f7b..cfcbf114676e 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,9 +92,6 @@ #include "fsnotify.h" struct srcu_struct fsnotify_mark_srcu; -static DEFINE_SPINLOCK(destroy_lock); -static LIST_HEAD(destroy_list); -static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) atomic_dec(&group->num_marks); } +static void +fsnotify_mark_free_rcu(struct rcu_head *rcu) +{ + struct fsnotify_mark *mark; + + mark = container_of(rcu, struct fsnotify_mark, g_rcu); + fsnotify_put_mark(mark); +} + /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Free fsnotify mark. The freeing is actually happening from a call_srcu + * callback. Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); /* * Some groups like to know that marks are being freed. This is a @@ -385,11 +388,7 @@ err: spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); - + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); return ret; } @@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } - -static int fsnotify_mark_destroy(void *ignored) -{ - struct fsnotify_mark *mark, *next; - struct list_head private_destroy_list; - - for (;;) { - spin_lock(&destroy_lock); - /* exchange the list head */ - list_replace_init(&destroy_list, &private_destroy_list); - spin_unlock(&destroy_lock); - - synchronize_srcu(&fsnotify_mark_srcu); - - list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { - list_del_init(&mark->g_list); - fsnotify_put_mark(mark); - } - - wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); - } - - return 0; -} - -static int __init fsnotify_mark_init(void) -{ - struct task_struct *thread; - - thread = kthread_run(fsnotify_mark_destroy, NULL, - "fsnotify_mark"); - if (IS_ERR(thread)) - panic("unable to start fsnotify mark destruction thread."); - - return 0; -} -device_initcall(fsnotify_mark_init); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 262561fea923..9d383e5eff0e 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, } } err = add_to_page_cache_lru(*cached_page, mapping, - index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(err)) { if (err == -EEXIST) continue; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585b53..2f77f8dfb861 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d6526dc..a3ded88718c9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db162..f3dc1b0dfffc 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b11d90eca6..7f604727f487 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = -EIO; goto bail; } + set_buffer_new(bh_result); up_write(&OCFS2_I(inode)->ip_alloc_sem); } @@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, is_overwrite = ocfs2_is_overwrite(osb, inode, offset); if (is_overwrite < 0) { mlog_errno(is_overwrite); + ret = is_overwrite; ocfs2_inode_unlock(inode, 1); goto clean_orphan; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index fa15debcc02b..a3cc6d2fc896 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -219,7 +219,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data) set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1473,16 +1480,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg, return 0; } -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", reg->hr_block_bytes); + return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes); } -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); int status; unsigned long block_bytes; unsigned int block_bits; @@ -1501,16 +1509,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_show(struct config_item *item, char *page) { - return sprintf(page, "%llu\n", reg->hr_start_block); + return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block); } -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; @@ -1526,16 +1535,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", reg->hr_blocks); + return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks); } -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, +static ssize_t o2hb_region_blocks_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; @@ -1554,13 +1563,12 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (reg->hr_bdev) - ret = sprintf(page, "%s\n", reg->hr_dev_name); + if (to_o2hb_region(item)->hr_bdev) + ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name); return ret; } @@ -1670,10 +1678,11 @@ out: } /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, +static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); struct task_struct *hb_task; long fd; int sectsize; @@ -1771,8 +1780,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1787,7 +1796,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(®->hr_steady_iterations) == 0); + atomic_read(®->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1798,6 +1808,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; @@ -1828,9 +1843,9 @@ out: return ret; } -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page) { + struct o2hb_region *reg = to_o2hb_region(item); pid_t pid = 0; spin_lock(&o2hb_live_lock); @@ -1844,92 +1859,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, return sprintf(page, "%u\n", pid); } -struct o2hb_region_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_region *, char *); - ssize_t (*store)(struct o2hb_region *, const char *, size_t); -}; - -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "block_bytes", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_block_bytes_read, - .store = o2hb_region_block_bytes_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_start_block = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "start_block", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_start_block_read, - .store = o2hb_region_start_block_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_blocks = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "blocks", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_blocks_read, - .store = o2hb_region_blocks_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_dev = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dev", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_dev_read, - .store = o2hb_region_dev_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_pid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "pid", - .ca_mode = S_IRUGO | S_IRUSR }, - .show = o2hb_region_pid_read, -}; +CONFIGFS_ATTR(o2hb_region_, block_bytes); +CONFIGFS_ATTR(o2hb_region_, start_block); +CONFIGFS_ATTR(o2hb_region_, blocks); +CONFIGFS_ATTR(o2hb_region_, dev); +CONFIGFS_ATTR_RO(o2hb_region_, pid); static struct configfs_attribute *o2hb_region_attrs[] = { - &o2hb_region_attr_block_bytes.attr, - &o2hb_region_attr_start_block.attr, - &o2hb_region_attr_blocks.attr, - &o2hb_region_attr_dev.attr, - &o2hb_region_attr_pid.attr, + &o2hb_region_attr_block_bytes, + &o2hb_region_attr_start_block, + &o2hb_region_attr_blocks, + &o2hb_region_attr_dev, + &o2hb_region_attr_pid, NULL, }; -static ssize_t o2hb_region_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = 0; - - if (o2hb_region_attr->show) - ret = o2hb_region_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_region_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_region_attr->store) - ret = o2hb_region_attr->store(reg, page, count); - return ret; -} - static struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, - .show_attribute = o2hb_region_show, - .store_attribute = o2hb_region_store, }; static struct config_item_type o2hb_region_type = { @@ -2124,49 +2070,14 @@ unlock: spin_unlock(&o2hb_live_lock); } -struct o2hb_heartbeat_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_heartbeat_group *, char *); - ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); -}; - -static ssize_t o2hb_heartbeat_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = 0; - - if (o2hb_heartbeat_group_attr->show) - ret = o2hb_heartbeat_group_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_heartbeat_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_heartbeat_group_attr->store) - ret = o2hb_heartbeat_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, + char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, - const char *page, - size_t count) +static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, + const char *page, size_t count) { unsigned long tmp; char *p = (char *)page; @@ -2181,17 +2092,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } -static -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item, + char *page) { return sprintf(page, "%s\n", o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); } -static -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, - const char *page, size_t count) +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, + const char *page, size_t count) { unsigned int i; int ret; @@ -2216,33 +2125,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, } -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_threshold_show, - .store = o2hb_heartbeat_group_threshold_store, -}; - -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_mode_show, - .store = o2hb_heartbeat_group_mode_store, -}; +CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold.attr, - &o2hb_heartbeat_group_attr_mode.attr, + &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_mode, NULL, }; -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { - .show_attribute = o2hb_heartbeat_group_show, - .store_attribute = o2hb_heartbeat_group_store, -}; - static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .make_item = o2hb_heartbeat_group_make_item, .drop_item = o2hb_heartbeat_group_drop_item, @@ -2250,7 +2141,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { static struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 441c84e169e6..ebe543894db0 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item) kfree(node); } -static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_num_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_num); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num); } static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) @@ -188,15 +188,16 @@ enum { O2NM_NODE_ATTR_NUM = 0, O2NM_NODE_ATTR_PORT, O2NM_NODE_ATTR_ADDRESS, - O2NM_NODE_ATTR_LOCAL, }; -static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); unsigned long tmp; char *p = (char *)page; + int ret = 0; tmp = simple_strtoul(p, &p, 0); if (!p || (*p && (*p != '\n'))) @@ -215,26 +216,30 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) - p = NULL; + ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_NUM, + &node->nd_set_attributes)) + ret = -EBUSY; else { cluster->cl_nodes[tmp] = node; node->nd_num = tmp; set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; + if (ret) + return ret; return count; } -static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); + return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port)); } -static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_port_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); unsigned long tmp; char *p = (char *)page; @@ -247,20 +252,23 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, if (tmp >= (u16)-1) return -ERANGE; + if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EBUSY; node->nd_ipv4_port = htons(tmp); return count; } -static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page) { - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); + return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address); } -static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); int ret, i; struct rb_node **p, *parent; @@ -282,6 +290,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS, + &node->nd_set_attributes)) + ret = -EBUSY; else { rb_link_node(&node->nd_ip_node, parent, p); rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); @@ -295,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, return count; } -static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_local_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_local); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local); } -static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); unsigned long tmp; char *p = (char *)page; @@ -349,108 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, return count; } -struct o2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_node *, char *); - ssize_t (*store)(struct o2nm_node *, const char *, size_t); -}; - -static struct o2nm_node_attribute o2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_num_read, - .store = o2nm_node_num_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_port_read, - .store = o2nm_node_ipv4_port_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_address_read, - .store = o2nm_node_ipv4_address_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_local_read, - .store = o2nm_node_local_write, -}; +CONFIGFS_ATTR(o2nm_node_, num); +CONFIGFS_ATTR(o2nm_node_, ipv4_port); +CONFIGFS_ATTR(o2nm_node_, ipv4_address); +CONFIGFS_ATTR(o2nm_node_, local); static struct configfs_attribute *o2nm_node_attrs[] = { - [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, - [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, - [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, - [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + &o2nm_node_attr_num, + &o2nm_node_attr_ipv4_port, + &o2nm_node_attr_ipv4_address, + &o2nm_node_attr_local, NULL, }; -static int o2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { - if (attr == o2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t o2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret = 0; - - if (o2nm_node_attr->show) - ret = o2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t o2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret; - int attr_index = o2nm_attr_index(attr); - - if (o2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = o2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - static struct configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, - .show_attribute = o2nm_node_show, - .store_attribute = o2nm_node_store, }; static struct config_item_type o2nm_node_type = { @@ -475,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) } #endif -struct o2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_cluster *, char *); - ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); -}; - static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, unsigned int *val) { @@ -501,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, return count; } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item, + char *page) { - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); + return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms); } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item, + const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -536,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_keepalive_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_keepalive_delay_ms); } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_keepalive_delay_ms_store( + struct config_item *item, const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -571,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_reconnect_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_reconnect_delay_ms_store( + struct config_item *item, const char *page, size_t count) { return o2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); + &to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_fence_method_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_fence_method_show( + struct config_item *item, char *page) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret = 0; if (cluster) @@ -595,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read( return ret; } -static ssize_t o2nm_cluster_attr_fence_method_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_fence_method_store( + struct config_item *item, const char *page, size_t count) { unsigned int i; @@ -608,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write( continue; if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) continue; - if (cluster->cl_fence_method != i) { + if (to_o2nm_cluster(item)->cl_fence_method != i) { printk(KERN_INFO "ocfs2: Changing fence method to %s\n", o2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; + to_o2nm_cluster(item)->cl_fence_method = i; } return count; } @@ -620,79 +544,18 @@ bail: return -EINVAL; } -static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_idle_timeout_ms_read, - .store = o2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_keepalive_delay_ms_read, - .store = o2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_reconnect_delay_ms_read, - .store = o2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_fence_method_read, - .store = o2nm_cluster_attr_fence_method_write, -}; +CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms); +CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, fence_method); static struct configfs_attribute *o2nm_cluster_attrs[] = { - &o2nm_cluster_attr_idle_timeout_ms.attr, - &o2nm_cluster_attr_keepalive_delay_ms.attr, - &o2nm_cluster_attr_reconnect_delay_ms.attr, - &o2nm_cluster_attr_fence_method.attr, + &o2nm_cluster_attr_idle_timeout_ms, + &o2nm_cluster_attr_keepalive_delay_ms, + &o2nm_cluster_attr_reconnect_delay_ms, + &o2nm_cluster_attr_fence_method, NULL, }; -static ssize_t o2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (o2nm_cluster_attr->show) - ret = o2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t o2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret; - - if (o2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = o2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) @@ -773,8 +636,6 @@ static void o2nm_cluster_release(struct config_item *item) static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, - .show_attribute = o2nm_cluster_show, - .store_attribute = o2nm_cluster_store, }; static struct config_item_type o2nm_cluster_type = { @@ -896,7 +757,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c83ff..68c607e63ff6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -376,17 +376,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6918f30d02cd..2ee7fe747cea 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) int status; unsigned int backoff; unsigned int total_backoff = 0; + char wq_name[O2NM_MAX_NAME_LEN]; BUG_ON(!dlm); @@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); + dlm->dlm_worker = create_singlethread_workqueue(wq_name); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ce38b4ccc9ab..9477d6e1de37 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } @@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + */ + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -2554,6 +2559,7 @@ fail: if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); + dlm_put_mle_inuse(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); mle = NULL; @@ -2571,17 +2577,6 @@ fail: * ensure that all assert_master work is flushed. */ flush_workqueue(dlm->dlm_worker); - /* get an extra reference on the mle. - * otherwise the assert_master from the new - * master will destroy this. - * also, make sure that all callers of dlm_get_mle - * take both dlm->spinlock and dlm->master_lock */ - spin_lock(&dlm->spinlock); - spin_lock(&dlm->master_lock); - dlm_get_mle_inuse(mle); - spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); - /* notify new node and send all lock state */ /* call send_one_lockres with migration flag. * this serves as notice to the target node that a @@ -2843,6 +2838,8 @@ again: res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; if (!ret) BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + else + res->migration_pending = 0; spin_unlock(&res->spinlock); /* @@ -3048,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3139,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " @@ -3310,6 +3308,15 @@ top: mle->new_master != dead_node) continue; + if (mle->new_master == dead_node && mle->inuse) { + mlog(ML_NOTICE, "%s: target %u died during " + "migration from %u, the MLE is " + "still keep used, ignore it!\n", + dlm->name, dead_node, + mle->master); + continue; + } + /* If we have reached this point, this mle needs to be * removed from the list and freed. */ dlm_clean_migration_mle(dlm, mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 58eaa5c0d387..c5bdf02c213b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) mlog(0, "starting dlm recovery thread...\n"); dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, - "dlm_reco_thread"); + "dlm_reco-%s", dlm->name); if (IS_ERR(dlm->dlm_reco_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); dlm->dlm_reco_thread_task = NULL; @@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; @@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ @@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2e5e6d5fffe8..c5f6c241ecd7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) { mlog(0, "Starting dlm_thread...\n"); - dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", + dlm->name); if (IS_ERR(dlm->dlm_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_thread_task)); dlm->dlm_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 2e3c9dbab68c..1082b2c3014b 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..03768bb3aab1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c91103c1333..f92612e4b9d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2432,12 +2432,6 @@ bail: * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, @@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) - ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } @@ -2998,7 +2990,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) } /* launch downconvert thread */ - osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", + osb->uuid_str); if (IS_ERR(osb->dc_task)) { status = PTR_ERR(osb->dc_task); osb->dc_task = NULL; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f92e..d63127932509 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt, } generic_fillattr(inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8f87e05ee25d..97a563bab9a8 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..aac8b86f312e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -112,6 +112,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 +/* Entry in orphan dir with 'dio-' prefix */ +#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccce60..16b0bb482ea7 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -606,9 +606,7 @@ bail: if (gb_inode) mutex_unlock(&gb_inode->i_mutex); - if (gb_inode) - iput(gb_inode); - + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff82b28462a6..3772a2dbb980 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1090,7 +1089,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) /* Launch the commit thread */ if (!local) { osb->commit_task = kthread_run(ocfs2_commit_thread, osb, - "ocfs2cmt"); + "ocfs2cmt-%s", osb->uuid_str); if (IS_ERR(osb->commit_task)) { status = PTR_ERR(osb->commit_task); osb->commit_task = NULL; @@ -1507,7 +1506,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) goto out; osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, - "ocfs2rec"); + "ocfs2rec-%s", osb->uuid_str); if (IS_ERR(osb->recovery_thread_task)) { mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); osb->recovery_thread_task = NULL; @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } @@ -2021,6 +2017,7 @@ struct ocfs2_orphan_filldir_priv { struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; + enum ocfs2_orphan_reco_type orphan_reco_type; }; static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, @@ -2036,12 +2033,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (name_len == 2 && !strncmp("..", name, 2)) return 0; + /* do not include dio entry in case of orphan scan */ + if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && + (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN))) + return 0; + /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) return 0; + if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN)) + OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; + /* Skip inodes which are already added to recover list, since dio may * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { @@ -2060,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, static int ocfs2_queue_orphans(struct ocfs2_super *osb, int slot, - struct inode **head) + struct inode **head, + enum ocfs2_orphan_reco_type orphan_reco_type) { int status; struct inode *orphan_dir_inode = NULL; struct ocfs2_orphan_filldir_priv priv = { .ctx.actor = ocfs2_orphan_filldir, .osb = osb, - .head = *head + .head = *head, + .orphan_reco_type = orphan_reco_type }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, @@ -2170,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); - ret = ocfs2_queue_orphans(osb, slot, &inode); + ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); ocfs2_clear_recovering_orphan_dir(osb, slot); /* Error here should be noted, but we want to continue with as @@ -2186,25 +2195,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; - mutex_lock(&inode->i_mutex); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto next; - } - /* - * We need to take and drop the inode lock to - * force read inode from disk. - */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); - goto unlock_rw; - } + if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { + mutex_lock(&inode->i_mutex); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto unlock_mutex; + } + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto unlock_rw; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; - di = (struct ocfs2_dinode *)di_bh->b_data; + if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto unlock_inode; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, + di_bh, 0, 0); + if (ret) + mlog_errno(ret); + } +unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; +unlock_rw: + ocfs2_rw_unlock(inode, 1); +unlock_mutex: + mutex_unlock(&inode->i_mutex); - if (inode->i_nlink == 0) { + /* clear dio flag in ocfs2_inode_info */ + oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; + } else { spin_lock(&oi->ip_lock); /* Set the proper information to get us going into * ocfs2_delete_inode. */ @@ -2212,28 +2247,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, spin_unlock(&oi->ip_lock); } - if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && - (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size_read(inode)); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto unlock_inode; - } - - ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); - if (ret) - mlog_errno(ret); - } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ -unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; -unlock_rw: - ocfs2_rw_unlock(inode, 1); -next: - mutex_unlock(&inode->i_mutex); iput(inode); inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..e9c99e35f5ea 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -473,8 +472,7 @@ out_mutex: iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 6b6d092b0998..d56f0079b858 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -66,8 +66,11 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, * level. */ - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + locks_lock_file_wait(file, + &(struct file_lock) { + .fl_type = F_UNLCK, + .fl_flags = FL_FLOCK + }); ocfs2_file_unlock(file); } @@ -81,7 +84,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, goto out; } - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); if (ret) ocfs2_file_unlock(file); @@ -98,7 +101,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->fp_mutex); ocfs2_file_unlock(file); - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); mutex_unlock(&fp->fp_mutex); return ret; @@ -119,7 +122,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) - return flock_lock_file_wait(file, fl); + return locks_lock_file_wait(file, fl); if (fl->fl_type == F_UNLCK) return ocfs2_do_funlock(file, cmd, fl); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b7dfac226b1e..ab42c38031b1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) -#define OCFS2_DIO_ORPHAN_PREFIX "dio-" -#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -369,7 +367,7 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &mode, &default_acl, &acl); + status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (status) { mlog_errno(status); goto leave; @@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); + if (status < 0) { + u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); + int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, + inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); + if (tmp) + mlog_errno(tmp); + } + + return status; } static int ocfs2_mkdir(struct inode *dir, @@ -1676,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -1951,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; @@ -2365,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, name, strlen(name)); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); @@ -2380,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e173329eb830..1155918d6784 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -26,6 +26,9 @@ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index b6d51333ad02..d153e6e31529 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -82,7 +82,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c93d67220887..fde9ef18cff3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e5d57cd32505..252119860e6c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to, readahead_pages; + unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); - readahead_pages = - (ocfs2_cow_contig_clusters(sb) << - OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d5da6f624142..79b8021302b3 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -54,11 +54,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, u16 cl_cpg, + u16 old_bg_clusters, int set) { int i; u16 backups = 0; - u32 cluster; + u32 cluster, lgd_cluster; u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { @@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, else if (gd_blkno > lgd_blkno) break; + /* check if already done backup super */ + lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno); + lgd_cluster += old_bg_clusters; + if (lgd_cluster >= cluster) + continue; + if (set) ocfs2_set_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); @@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 chain, num_bits, backups = 0; u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + u16 old_bg_clusters; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, group = (struct ocfs2_group_desc *)group_bh->b_data; + old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc; /* update the group first. */ num_bits = new_clusters * cl_bpc; le16_add_cpu(&group->bg_bits, num_bits); @@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 1); + cl_cpg, old_bg_clusters, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -163,7 +172,7 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 0); + cl_cpg, old_bg_clusters, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d44c8..1e09592148ad 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d83d2602cf2b..fc6d25f6d444 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { - hint = ocfs2_group_from_res(res); + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) + hint = res->sr_bg_blkno; + else + hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a9340c..faa1365097bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb, int status, user_stack = 0; char *p; u32 tmp; + int token, option; + substring_t args[MAX_OPT_ARGS]; trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); @@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb, } while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - if (!*p) continue; @@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = option; break; case Opt_slot: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = (s16)option; break; case Opt_commit: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->commit_interval = HZ * option; break; case Opt_localalloc: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 66edce7ecfd7..6c2a3e3c521c 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ebfdea78659b..f0e241ffd94f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index) if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; - - return handler ? handler->prefix : NULL; + return handler ? xattr_prefix(handler) : NULL; } static u32 ocfs2_xattr_name_hash(struct inode *inode, @@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, return ret; } -static int ocfs2_xattr_list_entry(char *buffer, size_t size, - size_t *result, const char *prefix, +static int ocfs2_xattr_list_entry(struct super_block *sb, + char *buffer, size_t size, + size_t *result, int type, const char *name, int name_len) { char *p = buffer + *result; - int prefix_len = strlen(prefix); - int total_len = prefix_len + name_len + 1; + const char *prefix; + int prefix_len; + int total_len; + switch(type) { + case OCFS2_XATTR_INDEX_USER: + if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + break; + + case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: + case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: + if (!(sb->s_flags & MS_POSIXACL)) + return 0; + break; + + case OCFS2_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return 0; + break; + } + + prefix = ocfs2_xattr_prefix(type); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + total_len = prefix_len + name_len + 1; *result += total_len; /* we are just looking for how big our buffer needs to be */ @@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode, { size_t result = 0; int i, type, ret; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - - if (prefix) { - name = (const char *)header + - le16_to_cpu(entry->xe_name_offset); + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); - ret = ocfs2_xattr_list_entry(buffer, buffer_size, - &result, prefix, name, - entry->xe_name_len); - if (ret) - return ret; - } + ret = ocfs2_xattr_list_entry(inode->i_sb, + buffer, buffer_size, + &result, type, name, + entry->xe_name_len); + if (ret) + return ret; } return result; @@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int ret = 0, type; struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; int i, block_off, new_offset; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, - bucket_xh(bucket), - i, - &block_off, - &new_offset); - if (ret) - break; + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; - name = (const char *)bucket_block(bucket, block_off) + - new_offset; - ret = ocfs2_xattr_list_entry(xl->buffer, - xl->buffer_size, - &xl->result, - prefix, name, - entry->xe_name_len); - if (ret) - break; - } + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(inode->i_sb, + xl->buffer, + xl->buffer_size, + &xl->result, + type, name, + entry->xe_name_len); + if (ret) + break; } return ret; @@ -7226,39 +7245,22 @@ int ocfs2_init_security_and_acl(struct inode *dir, leave: return ret; } + /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_security_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } -static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_security_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7311,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle, const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, .set = ocfs2_xattr_security_set, }; @@ -7319,46 +7320,24 @@ const struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, .set = ocfs2_xattr_trusted_set, }; @@ -7366,45 +7345,24 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int ocfs2_xattr_user_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int ocfs2_xattr_user_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; @@ -7414,7 +7372,6 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, .set = ocfs2_xattr_user_set, }; diff --git a/fs/open.c b/fs/open.c index b6f1e96a7c0b..b25b1542c530 100644 --- a/fs/open.c +++ b/fs/open.c @@ -887,7 +887,7 @@ EXPORT_SYMBOL(dentry_open); static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; - int acc_mode; + int acc_mode = ACC_MODE(flags); if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; @@ -909,7 +909,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; - acc_mode = MAY_OPEN | ACC_MODE(flags); if (!(acc_mode & MAY_WRITE)) return -EINVAL; } else if (flags & O_PATH) { @@ -919,8 +918,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o */ flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; acc_mode = 0; - } else { - acc_mode = MAY_OPEN | ACC_MODE(flags); } op->open_flag = flags; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cda3e..b61b883c8ff8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 871fcb67be97..0a8983492d91 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -195,8 +195,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *dentry, struct path *lowerpath, - struct kstat *stat, struct iattr *attr, - const char *link) + struct kstat *stat, const char *link) { struct inode *wdir = workdir->d_inode; struct inode *udir = upperdir->d_inode; @@ -240,8 +239,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, mutex_lock(&newdentry->d_inode->i_mutex); err = ovl_set_attr(newdentry, stat); - if (!err && attr) - err = notify_change(newdentry, attr, NULL); mutex_unlock(&newdentry->d_inode->i_mutex); if (err) goto out_cleanup; @@ -286,8 +283,7 @@ out_cleanup: * that point the file will have already been copied up anyway. */ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat, - struct iattr *attr) + struct path *lowerpath, struct kstat *stat) { struct dentry *workdir = ovl_workdir(dentry); int err; @@ -345,26 +341,19 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } upperdentry = ovl_dentry_upper(dentry); if (upperdentry) { - unlock_rename(workdir, upperdir); + /* Raced with another copy-up? Nothing to do, then... */ err = 0; - /* Raced with another copy-up? Do the setattr here */ - if (attr) { - mutex_lock(&upperdentry->d_inode->i_mutex); - err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); - } - goto out_put_cred; + goto out_unlock; } err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, - stat, attr, link); + stat, link); if (!err) { /* Restore timestamps on parent (best effort) */ ovl_set_timestamps(upperdir, &pstat); } out_unlock: unlock_rename(workdir, upperdir); -out_put_cred: revert_creds(old_cred); put_cred(override_cred); @@ -406,7 +395,7 @@ int ovl_copy_up(struct dentry *dentry) ovl_path_lower(next, &lowerpath); err = vfs_getattr(&lowerpath, &stat); if (!err) - err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + err = ovl_copy_up_one(parent, next, &lowerpath, &stat); dput(parent); dput(next); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ec0c2a050043..964a60fa7afc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -12,8 +12,7 @@ #include <linux/xattr.h> #include "overlayfs.h" -static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, - bool no_data) +static int ovl_copy_up_truncate(struct dentry *dentry) { int err; struct dentry *parent; @@ -30,10 +29,8 @@ static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, if (err) goto out_dput_parent; - if (no_data) - stat.size = 0; - - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + stat.size = 0; + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); out_dput_parent: dput(parent); @@ -49,13 +46,13 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; - upperdentry = ovl_dentry_upper(dentry); - if (upperdentry) { + err = ovl_copy_up(dentry); + if (!err) { + upperdentry = ovl_dentry_upper(dentry); + mutex_lock(&upperdentry->d_inode->i_mutex); err = notify_change(upperdentry, attr, NULL); mutex_unlock(&upperdentry->d_inode->i_mutex); - } else { - err = ovl_copy_up_last(dentry, attr, false); } ovl_drop_write(dentry); out: @@ -134,57 +131,23 @@ out_dput: return err; } - -struct ovl_link_data { - struct dentry *realdentry; - void *cookie; -}; - -static const char *ovl_follow_link(struct dentry *dentry, void **cookie) +static const char *ovl_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct dentry *realdentry; struct inode *realinode; - struct ovl_link_data *data = NULL; - const char *ret; + + if (!dentry) + return ERR_PTR(-ECHILD); realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; - if (WARN_ON(!realinode->i_op->follow_link)) + if (WARN_ON(!realinode->i_op->get_link)) return ERR_PTR(-EPERM); - if (realinode->i_op->put_link) { - data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); - data->realdentry = realdentry; - } - - ret = realinode->i_op->follow_link(realdentry, cookie); - if (IS_ERR_OR_NULL(ret)) { - kfree(data); - return ret; - } - - if (data) - data->cookie = *cookie; - - *cookie = data; - - return ret; -} - -static void ovl_put_link(struct inode *unused, void *c) -{ - struct inode *realinode; - struct ovl_link_data *data = c; - - if (!data) - return; - - realinode = data->realdentry->d_inode; - realinode->i_op->put_link(realinode, data->cookie); - kfree(data); + return realinode->i_op->get_link(realdentry, realinode, done); } static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) @@ -353,7 +316,7 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) return ERR_PTR(err); if (file_flags & O_TRUNC) - err = ovl_copy_up_last(dentry, NULL, true); + err = ovl_copy_up_truncate(dentry); else err = ovl_copy_up(dentry); ovl_drop_write(dentry); @@ -381,8 +344,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, - .follow_link = ovl_follow_link, - .put_link = ovl_put_link, + .get_link = ovl_get_link, .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = ovl_setxattr, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ea5a40b06e3a..e17154aeaae4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -194,7 +194,6 @@ void ovl_cleanup(struct inode *dir, struct dentry *dentry); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat, - struct iattr *attr); + struct path *lowerpath, struct kstat *stat); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/pipe.c b/fs/pipe.c index 8865f7963700..42cf8ddf0e55 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -366,18 +366,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { - int error = ops->confirm(pipe, buf); - if (error) + ret = ops->confirm(pipe, buf); + if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { - error = -EFAULT; + ret = -EFAULT; goto out; } do_wakeup = 1; - buf->len += chars; - ret = chars; + buf->len += ret; if (!iov_iter_count(from)) goto out; } @@ -693,17 +692,20 @@ int create_pipe_files(struct file **res, int flags) d_instantiate(path.dentry, inode); - err = -ENFILE; f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); - if (IS_ERR(f)) + if (IS_ERR(f)) { + err = PTR_ERR(f); goto err_dentry; + } f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->private_data = inode->i_pipe; res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); - if (IS_ERR(res[0])) + if (IS_ERR(res[0])) { + err = PTR_ERR(res[0]); goto err_file; + } path_get(&path); res[0]->private_data = inode->i_pipe; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4fb17ded7d47..711dd5170376 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -762,8 +762,9 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, EXPORT_SYMBOL (posix_acl_to_xattr); static int -posix_acl_xattr_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) +posix_acl_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *value, size_t size) { struct posix_acl *acl; int error; @@ -773,7 +774,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, if (d_is_symlink(dentry)) return -EOPNOTSUPP; - acl = get_acl(d_backing_inode(dentry), type); + acl = get_acl(d_backing_inode(dentry), handler->flags); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -786,8 +787,9 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, } static int -posix_acl_xattr_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; @@ -798,7 +800,7 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name, if (!inode->i_op->set_acl) return -EOPNOTSUPP; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return value ? -EACCES : 0; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -815,37 +817,20 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name, } } - ret = inode->i_op->set_acl(inode, acl, type); + ret = inode->i_op->set_acl(inode, acl, handler->flags); out: posix_acl_release(acl); return ret; } -static size_t -posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) +static bool +posix_acl_xattr_list(struct dentry *dentry) { - const char *xname; - size_t size; - - if (!IS_POSIXACL(d_backing_inode(dentry))) - return -EOPNOTSUPP; - if (d_is_symlink(dentry)) - return -EOPNOTSUPP; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - else - xname = POSIX_ACL_XATTR_DEFAULT; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return IS_POSIXACL(d_backing_inode(dentry)); } const struct xattr_handler posix_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, @@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = { EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); const struct xattr_handler posix_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, diff --git a/fs/proc/array.c b/fs/proc/array.c index f60f0121e331..d73291f5f0fc 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,18 +91,18 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) { char *buf; + size_t size; char tcomm[sizeof(p->comm)]; + int ret; get_task_comm(tcomm, p); seq_puts(m, "Name:\t"); - buf = m->buf + m->count; - /* Ignore error for now */ - buf += string_escape_str(tcomm, buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + size = seq_get_buf(m, &buf); + ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + seq_commit(m, ret < size ? ret : -1); - m->count = buf - m->buf; seq_putc(m, '\n'); } @@ -375,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { - unsigned long vsize, eip, esp, wchan = ~0UL; + unsigned long vsize, eip, esp, wchan = 0; int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; @@ -507,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', wchan); + + /* + * We used to output the absolute kernel address, but that's an + * information leak - so instead we show a 0/1 flag here, to signal + * to user-space whether there's a wchan field in /proc/PID/wchan. + * + * This works with older implementations of procps as well. + */ + if (wchan) + seq_puts(m, " 1"); + else + seq_puts(m, " 0"); + seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ll(m, ' ', task->exit_signal); diff --git a/fs/proc/base.c b/fs/proc/base.c index b25eee4cead5..2cf5d7e37375 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) { - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return 0; - seq_printf(m, "%lu", wchan); - } else { + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); - } + else + seq_putc(m, '0'); return 0; } @@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, return simple_read_from_buffer(buf, count, ppos, buffer, len); } +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -1557,12 +1564,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_pid_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; + if (!dentry) + return ERR_PTR(-ECHILD); + /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; @@ -1623,7 +1634,7 @@ out: const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, + .get_link = proc_pid_get_link, .setattr = proc_setattr, }; @@ -1888,7 +1899,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -1938,20 +1949,22 @@ struct map_files_info { * path to the file in question. */ static const char * -proc_map_files_follow_link(struct dentry *dentry, void **cookie) +proc_map_files_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - return proc_pid_follow_link(dentry, NULL); + return proc_pid_get_link(dentry, inode, done); } /* - * Identical to proc_pid_link_inode_operations except for follow_link() + * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_map_files_follow_link, + .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; @@ -1968,7 +1981,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, return -ENOENT; ei = PROC_I(inode); - ei->op.proc_get_link = proc_map_files_get_link; + ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; @@ -2352,7 +2365,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page; + void *page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2367,14 +2380,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (*ppos != 0) goto out; - length = -ENOMEM; - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) + page = memdup_user(buf, count); + if (IS_ERR(page)) { + length = PTR_ERR(page); goto out; - - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free; + } /* Guard against adverse ptrace interaction */ length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); @@ -2383,10 +2393,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, - (void*)page, count); + page, count); mutex_unlock(&task->signal->cred_guard_mutex); out_free: - free_page((unsigned long) page); + kfree(page); out: put_task_struct(task); out_no_task: @@ -2487,6 +2497,7 @@ static ssize_t proc_coredump_filter_write(struct file *file, mm = get_task_mm(task); if (!mm) goto out_no_mm; + ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6e5fcd00733e..56afa5ef08f2 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, name, len, instantiate, p, (void *)(unsigned long)fd)) goto out_fd_loop; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); @@ -291,11 +292,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, */ int proc_fd_permission(struct inode *inode, int mask) { - int rv = generic_permission(inode, mask); + struct task_struct *p; + int rv; + + rv = generic_permission(inode, mask); if (rv == 0) - return 0; - if (task_tgid(current) == proc_pid(inode)) + return rv; + + rcu_read_lock(); + p = pid_task(proc_pid(inode), PIDTYPE_PID); + if (p && same_thread_group(p, current)) rv = 0; + rcu_read_unlock(); + return rv; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bd95b9fdebb0..42305ddcbaa0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } @@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static const char *proc_follow_link(struct dentry *dentry, void **cookie) +static void proc_put_link(void *p) { - struct proc_dir_entry *pde = PDE(d_inode(dentry)); - if (unlikely(!use_pde(pde))) - return ERR_PTR(-EINVAL); - *cookie = pde; - return pde->data; + unuse_pde(p); } -static void proc_put_link(struct inode *unused, void *p) +static const char *proc_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - unuse_pde(p); + struct proc_dir_entry *pde = PDE(inode); + if (unlikely(!use_pde(pde))) + return ERR_PTR(-EINVAL); + set_delayed_call(done, proc_put_link, pde); + return pde->data; } const struct inode_operations proc_link_inode_operations = { .readlink = generic_readlink, - .follow_link = proc_follow_link, - .put_link = proc_put_link, + .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d3ebf2e61853..df4661abadc4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; - struct vmalloc_info vmi; long cached; long available; unsigned long pagecache; @@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) if (cached < 0) cached = 0; - get_vmalloc_info(&vmi); - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); @@ -60,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) /* * Estimate the amount of memory available for userspace allocations, * without causing swapping. - * - * Free memory cannot be taken below the low watermark, before the - * system starts swapping. */ - available = i.freeram - wmark_low; + available = i.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -191,8 +185,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 + 0ul, // used to be vmalloc 'used' + 0ul // used to be vmalloc 'largest_chunk' #ifdef CONFIG_MEMORY_FAILURE , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f6e8354b8cea..1dece8781f91 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,14 +30,18 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_ns_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); + if (!dentry) + return ERR_PTR(-ECHILD); + task = get_proc_task(inode); if (!task) return error; @@ -74,7 +78,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, - .follow_link = proc_ns_follow_link, + .get_link = proc_ns_get_link, .setattr = proc_setattr, }; diff --git a/fs/proc/page.c b/fs/proc/page.c index 93484034a03d..b2855eea5405 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. + * simple test in page_mapcount() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (!PageSlab(page) && page_mapcount(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fdda62e6115e..fe5b6e6c4671 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, found: subdir->header.nreg++; failed: - if (unlikely(IS_ERR(subdir))) { + if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("/%*.*s %ld\n", diff --git a/fs/proc/self.c b/fs/proc/self.c index 113b8d061fc0..67e8db442cf0 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d", tgid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = kfree_put_link, + .get_link = proc_self_get_link, }; static unsigned self_inum; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2d46adb54b4..65a1b6c69c11 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -22,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any @@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -65,11 +72,15 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, swap << (PAGE_SHIFT-10)); + hugetlb_report_usage(m, mm); } unsigned long task_vsize(struct mm_struct *mm) @@ -81,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } @@ -446,14 +458,18 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - unsigned long size, bool young, bool dirty) + bool compound, bool young, bool dirty) { - int mapcount; + int i, nr = compound ? HPAGE_PMD_NR : 1; + unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) mss->anonymous += size; @@ -462,25 +478,52 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - u64 pss_delta; - if (dirty || PageDirty(page)) - mss->shared_dirty += size; - else - mss->shared_clean += size; - pss_delta = (u64)size << PSS_SHIFT; - do_div(pss_delta, mapcount); - mss->pss += pss_delta; - } else { + /* + * page_count(page) == 1 guarantees the page is mapped exactly once. + * If any subpage of the compound page mapped with PTE it would elevate + * page_count(). + */ + if (page_count(page) == 1) { if (dirty || PageDirty(page)) mss->private_dirty += size; else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + return; } + + for (i = 0; i < nr; i++, page++) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) { + if (dirty || PageDirty(page)) + mss->shared_dirty += PAGE_SIZE; + else + mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += PAGE_SIZE; + else + mss->private_clean += PAGE_SIZE; + mss->pss += PAGE_SIZE << PSS_SHIFT; + } + } +} + +#ifdef CONFIG_SHMEM +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); + + return 0; } +#endif static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) @@ -509,11 +552,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) return; - smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); + + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -529,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, if (IS_ERR_OR_NULL(page)) return; mss->anonymous_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, HPAGE_PMD_SIZE, - pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -546,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -625,17 +681,74 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) seq_putc(m, '\n'); } +#ifdef CONFIG_HUGETLB_PAGE +static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } + if (page) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) + mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } + return 0; +} +#endif /* HUGETLB_PAGE */ + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, +#ifdef CONFIG_HUGETLB_PAGE + .hugetlb_entry = smaps_hugetlb_range, +#endif .mm = vma->vm_mm, .private = &mss, }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); @@ -652,6 +765,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" @@ -667,6 +782,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shared_hugetlb >> 10, + mss.private_hugetlb >> 10, mss.swap >> 10, (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, @@ -753,36 +870,34 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { + ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } - - set_pte_at(vma->vm_mm, addr, pte, ptent); } +#else +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} +#endif +#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); - - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; + pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } - #else - -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} - static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -798,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1072,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1404,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 947b0f4fd0a1..9eacd59e0360 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_thread_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, + dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d/task/%d", tgid, pid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_thread_self_inode_operations = { .readlink = proc_thread_self_readlink, - .follow_link = proc_thread_self_follow_link, - .put_link = kfree_put_link, + .get_link = proc_thread_self_get_link, }; static unsigned thread_self_inum; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334085..2256e7e23e67 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); - int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; + int err; if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); @@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; + int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) + if (sb->s_op->show_path) { err = sb->s_op->show_path(m, mnt->mnt_root); - else + if (err) + goto out; + } else { seq_dentry(m, mnt->mnt_root, " \t\n\\"); - if (err) - goto out; + } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); - if (sb->s_op->show_devname) + if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt->mnt_root); - else + if (err) + goto out; + } else { mangle(m, r->mnt_devname ? r->mnt_devname : "none"); - if (err) - goto out; + } seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) @@ -191,7 +193,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; - int err = 0; + int err; /* device */ if (sb->s_op->show_devname) { @@ -220,8 +222,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) /* optional statistics */ if (sb->s_op->show_stats) { seq_putc(m, ' '); - if (!err) - err = sb->s_op->show_stats(m, mnt_path.dentry); + err = sb->s_op->show_stats(m, mnt_path.dentry); } seq_putc(m, '\n'); diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 916b8e23d968..360ae43f590c 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,5 +1,5 @@ config PSTORE - bool "Persistent store support" + tristate "Persistent store support" default n select ZLIB_DEFLATE select ZLIB_INFLATE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index e647d8e81712..b8803cc07fce 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -2,12 +2,12 @@ # Makefile for the linux pstorefs routines. # -obj-y += pstore.o +obj-$(CONFIG_PSTORE) += pstore.o pstore-objs += inode.o platform.o -obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o +pstore-$(CONFIG_PSTORE_FTRACE) += ftrace.o -obj-$(CONFIG_PSTORE_PMSG) += pmsg.o +pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 76a4eeb92982..d4887705bb61 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = { .write = pstore_ftrace_knob_write, }; +static struct dentry *pstore_ftrace_dir; + void pstore_register_ftrace(void) { - struct dentry *dir; struct dentry *file; if (!psinfo->write_buf) return; - dir = debugfs_create_dir("pstore", NULL); - if (!dir) { + pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); + if (!pstore_ftrace_dir) { pr_err("%s: unable to create pstore directory\n", __func__); return; } - file = debugfs_create_file("record_ftrace", 0600, dir, NULL, - &pstore_knob_fops); + file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, + NULL, &pstore_knob_fops); if (!file) { pr_err("%s: unable to create record_ftrace file\n", __func__); goto err_file; @@ -127,5 +128,17 @@ void pstore_register_ftrace(void) return; err_file: - debugfs_remove(dir); + debugfs_remove(pstore_ftrace_dir); +} + +void pstore_unregister_ftrace(void) +{ + mutex_lock(&pstore_ftrace_lock); + if (pstore_ftrace_enabled) { + unregister_ftrace_function(&pstore_ftrace_ops); + pstore_ftrace_enabled = 0; + } + mutex_unlock(&pstore_ftrace_lock); + + debugfs_remove_recursive(pstore_ftrace_dir); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 3adcc4669fac..d8c439d813ce 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) } static const struct file_operations pstore_file_operations = { + .owner = THIS_MODULE, .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, @@ -287,7 +288,7 @@ static const struct super_operations pstore_ops = { static struct super_block *pstore_sb; -int pstore_is_mounted(void) +bool pstore_is_mounted(void) { return pstore_sb != NULL; } @@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb) } static struct file_system_type pstore_fs_type = { + .owner = THIS_MODULE, .name = "pstore", .mount = pstore_mount, .kill_sb = pstore_kill_sb, @@ -479,5 +481,12 @@ out: } module_init(init_pstore_fs) +static void __exit exit_pstore_fs(void) +{ + unregister_filesystem(&pstore_fs_type); + sysfs_remove_mount_point(fs_kobj, "pstore"); +} +module_exit(exit_pstore_fs) + MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c36ba2cd0b5d..e38a22b31282 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); +extern void pstore_unregister_ftrace(void); #else static inline void pstore_register_ftrace(void) {} +static inline void pstore_unregister_ftrace(void) {} #endif #ifdef CONFIG_PSTORE_PMSG extern void pstore_register_pmsg(void); +extern void pstore_unregister_pmsg(void); #else static inline void pstore_register_pmsg(void) {} +static inline void pstore_unregister_pmsg(void) {} #endif extern struct pstore_info *psinfo; @@ -59,6 +63,6 @@ extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, int count, char *data, bool compressed, size_t size, struct timespec time, struct pstore_info *psi); -extern int pstore_is_mounted(void); +extern bool pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 791743deedf1..588461bb2dd4 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void) } +static void free_buf_for_compression(void) +{ + kfree(stream.workspace); + stream.workspace = NULL; + kfree(big_oops_buf); + big_oops_buf = NULL; +} + /* * Called when compression fails, since the printk buffer * would be fetched for compression calling it again when @@ -353,6 +361,19 @@ static struct kmsg_dumper pstore_dumper = { .dump = pstore_dump, }; +/* + * Register with kmsg_dump to save last part of console log on panic. + */ +static void pstore_register_kmsg(void) +{ + kmsg_dump_register(&pstore_dumper); +} + +static void pstore_unregister_kmsg(void) +{ + kmsg_dump_unregister(&pstore_dumper); +} + #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { @@ -390,8 +411,14 @@ static void pstore_register_console(void) { register_console(&pstore_console); } + +static void pstore_unregister_console(void) +{ + unregister_console(&pstore_console); +} #else static void pstore_register_console(void) {} +static void pstore_unregister_console(void) {} #endif static int pstore_write_compat(enum pstore_type_id type, @@ -410,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type, * read function right away to populate the file system. If not * then the pstore mount code will call us later to fill out * the file system. - * - * Register with kmsg_dump to save last part of console log on panic. */ int pstore_register(struct pstore_info *psi) { @@ -442,7 +467,7 @@ int pstore_register(struct pstore_info *psi) if (pstore_is_mounted()) pstore_get_records(0); - kmsg_dump_register(&pstore_dumper); + pstore_register_kmsg(); if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { pstore_register_console(); @@ -462,12 +487,28 @@ int pstore_register(struct pstore_info *psi) */ backend = psi->name; + module_put(owner); + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; } EXPORT_SYMBOL_GPL(pstore_register); +void pstore_unregister(struct pstore_info *psi) +{ + pstore_unregister_pmsg(); + pstore_unregister_ftrace(); + pstore_unregister_console(); + pstore_unregister_kmsg(); + + free_buf_for_compression(); + + psinfo = NULL; + backend = NULL; +} +EXPORT_SYMBOL_GPL(pstore_unregister); + /* * Read all the records from the persistent store. Create * files in our filesystem. Don't warn about -EEXIST errors diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index feb5dd2948b4..7de20cd3797f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; buffer = vmalloc(buffer_size); + if (!buffer) + return -ENOMEM; mutex_lock(&pmsg_lock); for (i = 0; i < count; ) { @@ -112,3 +114,10 @@ err_class: err: return; } + +void pstore_unregister_pmsg(void) +{ + device_destroy(pmsg_class, MKDEV(pmsg_major, 0)); + class_destroy(pmsg_class); + unregister_chrdev(pmsg_major, PMSG_NAME); +} diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6c26c4daaec9..319c3a60cfa5 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -578,30 +578,27 @@ fail_out: return err; } -static int __exit ramoops_remove(struct platform_device *pdev) +static int ramoops_remove(struct platform_device *pdev) { -#if 0 - /* TODO(kees): We cannot unload ramoops since pstore doesn't support - * unregistering yet. - */ struct ramoops_context *cxt = &oops_cxt; - iounmap(cxt->virt_addr); - release_mem_region(cxt->phys_addr, cxt->size); + pstore_unregister(&cxt->pstore); cxt->max_dump_cnt = 0; - /* TODO(kees): When pstore supports unregistering, call it here. */ kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; + persistent_ram_free(cxt->mprz); + persistent_ram_free(cxt->fprz); + persistent_ram_free(cxt->cprz); + ramoops_free_przs(cxt); + return 0; -#endif - return -EBUSY; } static struct platform_driver ramoops_driver = { .probe = ramoops_probe, - .remove = __exit_p(ramoops_remove), + .remove = ramoops_remove, .driver = { .name = "ramoops", }, diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index c4bcb778886e..3a67cfb142d8 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &qnx4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else { @@ -364,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 32d2e1a9774c..47bb1de07155 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); @@ -624,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a6d9..fbd70af98820 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2924,4 +2924,4 @@ static int __init dquot_init(void) return 0; } -module_init(dquot_init); +fs_initcall(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index bb2869f5dfd8..d07a2f91d858 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -1,7 +1,5 @@ - #include <linux/cred.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> @@ -105,5 +103,4 @@ static int __init quota_init(void) "VFS: Failed to create quota netlink interface.\n"); return 0; }; - -module_init(quota_init); +fs_initcall(quota_init); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 2aa012a68e90..ed85d4f35c04 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); static int v2r1_is_id(void *dp, struct dquot *dquot); -static struct qtree_fmt_operations v2r0_qtree_ops = { +static const struct qtree_fmt_operations v2r0_qtree_ops = { .mem2disk_dqblk = v2r0_mem2diskdqb, .disk2mem_dqblk = v2r0_disk2memdqb, .is_id = v2r0_is_id, }; -static struct qtree_fmt_operations v2r1_qtree_ops = { +static const struct qtree_fmt_operations v2r1_qtree_ops = { .mem2disk_dqblk = v2r1_mem2diskdqb, .disk2mem_dqblk = v2r1_disk2memdqb, .is_id = v2r1_is_id, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 889d558b4e05..38981b037524 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } } diff --git a/fs/read_write.c b/fs/read_write.c index 819ef3faf1bb..06b07d5a08fe 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> +#include <linux/mount.h> #include "internal.h" #include <asm/uaccess.h> @@ -171,6 +172,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si EXPORT_SYMBOL(fixed_size_llseek); /** + * no_seek_end_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * + */ +loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: + return generic_file_llseek_size(file, offset, whence, + ~0ULL, 0); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(no_seek_end_llseek); + +/** + * no_seek_end_llseek_size - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: maximal offset allowed + * + */ +loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: + return generic_file_llseek_size(file, offset, whence, + size, 0); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(no_seek_end_llseek_size); + +/** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to @@ -395,9 +435,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area( - read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, - inode, file, pos, count); + retval = locks_mandatory_area(inode, file, pos, pos + count - 1, + read_write == READ ? F_RDLCK : F_WRLCK); if (retval < 0) return retval; } @@ -1327,3 +1366,299 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif + +/* + * copy_file_range() differs from regular file read and write in that it + * specifically allows return partial success. When it does so is up to + * the copy_file_range method. + */ +ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + ssize_t ret; + + if (flags != 0) + return -EINVAL; + + /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */ + ret = rw_verify_area(READ, file_in, &pos_in, len); + if (ret >= 0) + ret = rw_verify_area(WRITE, file_out, &pos_out, len); + if (ret < 0) + return ret; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + /* this could be relaxed once a method supports cross-fs copies */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + + if (len == 0) + return 0; + + ret = mnt_want_write_file(file_out); + if (ret) + return ret; + + ret = -EOPNOTSUPP; + if (file_out->f_op->copy_file_range) + ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, + pos_out, len, flags); + if (ret == -EOPNOTSUPP) + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + + if (ret > 0) { + fsnotify_access(file_in); + add_rchar(current, ret); + fsnotify_modify(file_out); + add_wchar(current, ret); + } + inc_syscr(current); + inc_syscw(current); + + mnt_drop_write_file(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_copy_file_range); + +SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) +{ + loff_t pos_in; + loff_t pos_out; + struct fd f_in; + struct fd f_out; + ssize_t ret = -EBADF; + + f_in = fdget(fd_in); + if (!f_in.file) + goto out2; + + f_out = fdget(fd_out); + if (!f_out.file) + goto out1; + + ret = -EFAULT; + if (off_in) { + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) + goto out; + } else { + pos_in = f_in.file->f_pos; + } + + if (off_out) { + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) + goto out; + } else { + pos_out = f_out.file->f_pos; + } + + ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, + flags); + if (ret > 0) { + pos_in += ret; + pos_out += ret; + + if (off_in) { + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_in.file->f_pos = pos_in; + } + + if (off_out) { + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_out.file->f_pos = pos_out; + } + } + +out: + fdput(f_out); +out1: + fdput(f_in); +out2: + return ret; +} + +static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return -EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + int ret; + + if (inode_in->i_sb != inode_out->i_sb || + file_in->f_path.mnt != file_out->f_path.mnt) + return -EXDEV; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND) || + !file_in->f_op->clone_file_range) + return -EBADF; + + ret = clone_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = clone_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + if (pos_in + len > i_size_read(inode_in)) + return -EINVAL; + + ret = mnt_want_write_file(file_out); + if (ret) + return ret; + + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (!ret) { + fsnotify_access(file_in); + fsnotify_modify(file_out); + } + + mnt_drop_write_file(file_out); + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + bool is_admin = capable(CAP_SYS_ADMIN); + u16 count = same->dest_count; + struct file *dst_file; + loff_t dst_off; + ssize_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; + + ret = -EINVAL; + if (!S_ISREG(src->i_mode)) + goto out; + + ret = clone_verify_area(file, off, len, false); + if (ret < 0) + goto out; + ret = 0; + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_fd = fdget(info->dest_fd); + + dst_file = dst_fd.file; + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + dst = file_inode(dst_file); + + ret = mnt_want_write_file(dst_file); + if (ret) { + info->status = ret; + goto next_loop; + } + + dst_off = info->dest_offset; + ret = clone_verify_area(dst_file, dst_off, len, true); + if (ret < 0) { + info->status = ret; + goto next_file; + } + ret = 0; + + if (info->reserved) { + info->status = -EINVAL; + } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + } else if (file->f_path.mnt != dst_file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + } else if (dst_file->f_op->dedupe_file_range == NULL) { + info->status = -EINVAL; + } else { + deduped = dst_file->f_op->dedupe_file_range(file, off, + len, dst_file, + info->dest_offset); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped += deduped; + } + +next_file: + mnt_drop_write_file(dst_file); +next_loop: + fdput(dst_fd); + } + +out: + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3d8e7e671d5b..ae9e5b308cf9 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_fop = &reiserfs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; } else { inode->i_blocks = 0; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9d6486d416a3..44c2bdced1c8 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh) static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (buffer_journaled(bh)) { reiserfs_warning(NULL, "clm-2084", - "pinned buffer %lu:%s sent to disk", - bh->b_blocknr, bdevname(bh->b_bdev, b)); + "pinned buffer %lu:%pg sent to disk", + bh->b_blocknr, bh->b_bdev); } if (uptodate) set_buffer_uptodate(bh); @@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb) int replay_count = 0; int continue_replay = 1; int ret; - char b[BDEVNAME_SIZE]; cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); - reiserfs_info(sb, "checking transaction log (%s)\n", - bdevname(journal->j_dev_bd, b)); + reiserfs_info(sb, "checking transaction log (%pg)\n", + journal->j_dev_bd); start = get_seconds(); /* @@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super, set_blocksize(journal->j_dev_bd, super->s_blocksize); reiserfs_info(super, - "journal_init_dev: journal device: %s\n", - bdevname(journal->j_dev_bd, b)); + "journal_init_dev: journal device: %pg\n", + journal->j_dev_bd); return 0; } @@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name, struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; - char b[BDEVNAME_SIZE]; int ret; journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); @@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { reiserfs_warning(sb, "sh-460", - "journal header magic %x (device %s) does " + "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - bdevname(journal->j_dev_bd, b), + journal->j_dev_bd, sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_max_trans_age = commit_max_age; } - reiserfs_info(sb, "journal params: device %s, size %u, " + reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - bdevname(journal->j_dev_bd, b), + journal->j_dev_bd, SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 5f1c9c29eb8c..2a12d46d7fb4 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -712,9 +712,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - if (!new_valid_dev(rdev)) - return -EINVAL; - retval = dquot_initialize(dir); if (retval) return retval; @@ -1173,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir, reiserfs_update_inode_transaction(parent_dir); inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, @@ -1667,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { */ const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, .getxattr = reiserfs_getxattr, diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index ae1dc841db3a..4f3f928076f3 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh) static void sprintf_buffer_head(char *buf, struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - sprintf(buf, - "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bdevname(bh->b_bdev, b), bh->b_size, + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bh->b_bdev, bh->b_size, (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), bh->b_state, bh->b_page, buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", @@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh) (struct reiserfs_super_block *)(bh->b_data); int skipped, data_blocks; char *version; - char b[BDEVNAME_SIZE]; if (is_reiserfs_3_5(rs)) { version = "3.5"; @@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh) return 1; } - printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + printk("%pg\'s super block is in block %llu\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); printk("Reiserfs version %s\n", version); printk("Block count %u\n", sb_block_count(rs)); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 621b9f381fe1..fe999157dd97 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused) struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; - char b[BDEVNAME_SIZE]; seq_printf(m, /* on-disk fields */ "jp_journal_1st_block: \t%i\n" - "jp_journal_dev: \t%s[%x]\n" + "jp_journal_dev: \t%pg[%x]\n" "jp_journal_size: \t%i\n" "jp_journal_trans_max: \t%i\n" "jp_journal_magic: \t%i\n" @@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + SB_JOURNAL(sb)->j_dev_bd, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..05db7473bcb5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e87f9b52bf06..e5ddb4e5ea94 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers, return NULL; for_each_xattr_handler(handlers, xah) { - if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + const char *prefix = xattr_prefix(xah); + if (strncmp(prefix, name, strlen(prefix)) == 0) break; } @@ -778,7 +779,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->get(dentry, name, buffer, size, handler->flags); + return handler->get(handler, dentry, name, buffer, size); } /* @@ -797,7 +798,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(dentry, name, value, size, flags, handler->flags); + return handler->set(handler, dentry, name, value, size, flags); } /* @@ -814,7 +815,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); + return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); } struct listxattr_buf { @@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); - if (!handler) /* Unsupported xattr name */ + if (!handler /* Unsupported xattr name */ || + (handler->list && !handler->list(b->dentry))) return 0; + size = namelen + 1; if (b->buf) { - size = handler->list(b->dentry, b->buf + b->pos, - b->size, name, namelen, - handler->flags); if (size > b->size) return -ERANGE; - } else { - size = handler->list(b->dentry, NULL, 0, name, - namelen, handler->flags); + memcpy(b->buf + b->pos, name, namelen); + b->buf[b->pos + namelen] = 0; } - b->pos += size; } return 0; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9dc03dd..558a16beaacb 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) @@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 9a3b0616f283..ab0217d32039 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -9,8 +9,8 @@ #include <linux/uaccess.h> static int -security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, - int handler_flags) +security_get(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, void *buffer, size_t size) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; @@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, } static int -security_set(struct dentry *dentry, const char *name, const void *buffer, - size_t size, int flags, int handler_flags) +security_set(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, const void *buffer, size_t size, int flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; @@ -34,20 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t security_list(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t namelen, int handler_flags) +static bool security_list(struct dentry *dentry) { - const size_t len = namelen + 1; - - if (IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_len) { - memcpy(list, name, namelen); - list[namelen] = '\0'; - } - - return len; + return !IS_PRIVATE(d_inode(dentry)); } /* Initializes the security context for a new inode and returns the number diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index e4f1343714e0..64b67aa643a9 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -8,8 +8,8 @@ #include <linux/uaccess.h> static int -trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, - int handler_flags) +trusted_get(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, void *buffer, size_t size) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; @@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, } static int -trusted_set(struct dentry *dentry, const char *name, const void *buffer, - size_t size, int flags, int handler_flags) +trusted_set(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, const void *buffer, size_t size, int flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; @@ -33,19 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int handler_flags) +static bool trusted_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_size) { - memcpy(list, name, name_len); - list[name_len] = '\0'; - } - return len; + return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry)); } const struct xattr_handler reiserfs_xattr_trusted_handler = { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index d0b08d3e5689..12e6306f562a 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -7,8 +7,8 @@ #include <linux/uaccess.h> static int -user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, - int handler_flags) +user_get(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, void *buffer, size_t size) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) @@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, } static int -user_set(struct dentry *dentry, const char *name, const void *buffer, - size_t size, int flags, int handler_flags) +user_set(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, const void *buffer, size_t size, int flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; @@ -30,18 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int handler_flags) +static bool user_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!reiserfs_xattrs_user(dentry->d_sb)) - return 0; - if (list && len <= list_size) { - memcpy(list, name, name_len); - list[name_len] = '\0'; - } - return len; + return reiserfs_xattrs_user(dentry->d_sb); } const struct xattr_handler reiserfs_xattr_user_handler = { diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 268733cda397..6b00ca357c58 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) break; case ROMFH_SYM: i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &romfs_aops; mode |= S_IRWXUGO; break; @@ -618,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/select.c b/fs/select.c index 015547330e88..79d0d4953cad 100644 --- a/fs/select.c +++ b/fs/select.c @@ -778,8 +778,8 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, return mask; } -static int do_poll(unsigned int nfds, struct poll_list *list, - struct poll_wqueues *wait, struct timespec *end_time) +static int do_poll(struct poll_list *list, struct poll_wqueues *wait, + struct timespec *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, } poll_initwait(&table); - fdcount = do_poll(nfds, head, &table, end_time); + fdcount = do_poll(head, &table, end_time); poll_freewait(&table); for (walk = head; walk; walk = walk->next) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 225586e141ca..e85664b7c7d9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/mm.h> #include <linux/printk.h> +#include <linux/string_helpers.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { void *buf; + gfp_t gfp = GFP_KERNEL; /* - * __GFP_NORETRY to avoid oom-killings with high-order allocations - - * it's better to fall back to vmalloc() than to kill things. + * For high order allocations, use __GFP_NORETRY to avoid oom-killing - + * it's better to fall back to vmalloc() than to kill things. For small + * allocations, just use GFP_KERNEL which will oom kill, thus no need + * for vmalloc fallback. */ - buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + if (size > PAGE_SIZE) + gfp |= __GFP_NORETRY | __GFP_NOWARN; + buf = kmalloc(size, gfp); if (!buf && size > PAGE_SIZE) buf = vmalloc(size); return buf; @@ -377,26 +383,12 @@ EXPORT_SYMBOL(seq_release); */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { - char *end = m->buf + m->size; - char *p; - char c; + char *buf; + size_t size = seq_get_buf(m, &buf); + int ret; - for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { - if (!strchr(esc, c)) { - *p++ = c; - continue; - } - if (p + 3 < end) { - *p++ = '\\'; - *p++ = '0' + ((c & 0300) >> 6); - *p++ = '0' + ((c & 070) >> 3); - *p++ = '0' + (c & 07); - continue; - } - seq_set_overflow(m); - return; - } - m->count = p - m->buf; + ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc); + seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape); @@ -773,6 +765,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, { const u8 *ptr = buf; int i, linelen, remaining = len; + char *buffer; + size_t size; int ret; if (rowsize != 16 && rowsize != 32) @@ -794,15 +788,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, break; } + size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, - m->buf + m->count, m->size - m->count, - ascii); - if (ret >= m->size - m->count) { - seq_set_overflow(m); - } else { - m->count += ret; - seq_putc(m, '\n'); - } + buffer, size, ascii); + seq_commit(m, ret < size ? ret : -1); + + seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); diff --git a/fs/splice.c b/fs/splice.c index 5fc1e50a7f30..82bc0d64fc38 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -415,6 +415,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ if (!page->mapping) { unlock_page(page); +retry_lookup: page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); @@ -439,13 +440,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { /* - * We really should re-lookup the page here, - * but it complicates things a lot. Instead - * lets just do what we already stored, and - * we'll get it the next time we are called. + * Re-lookup the page */ if (error == AOP_TRUNCATED_PAGE) - error = 0; + goto retry_lookup; break; } @@ -809,6 +807,13 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des */ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { + /* + * Check for signal early to make process killable when there are + * always buffers available + */ + if (signal_pending(current)) + return -ERESTARTSYS; + while (!pipe->nrbufs) { if (!pipe->writers) return 0; @@ -884,6 +889,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_from_pipe_begin(sd); do { + cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index a1ce5ce60632..0927b1e80ab6 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -41,6 +41,7 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> +#include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; + inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5056babe00df..5e79bfa4f260 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) { struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; - char b[BDEVNAME_SIZE]; struct inode *root; long long root_inode; unsigned short flags; @@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!silent) - ERROR("Can't find a SQUASHFS superblock on %s\n", - bdevname(sb->s_bdev, b)); + ERROR("Can't find a SQUASHFS superblock on %pg\n", + sb->s_bdev); goto failed_mount; } @@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->inodes = le32_to_cpu(sblk->inodes); flags = le16_to_cpu(sblk->flags); - TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Found valid superblock on %pg\n", sb->s_bdev); TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) ? "un" : ""); TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) @@ -420,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 12806dffb345..dbcc2f54bad4 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = { const struct inode_operations squashfs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index e5e0ddf5b143..1e9de96288d8 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, struct squashfs_xattr_entry entry; struct squashfs_xattr_val val; const struct xattr_handler *handler; - int name_size, prefix_size = 0; + int name_size; err = squashfs_read_metadata(sb, &entry, &start, &offset, sizeof(entry)); @@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, name_size = le16_to_cpu(entry.size); handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); - if (handler) - prefix_size = handler->list(d, buffer, rest, NULL, - name_size, handler->flags); - if (prefix_size) { + if (handler && (!handler->list || handler->list(d))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_size = strlen(prefix); + if (buffer) { if (prefix_size + name_size + 1 > rest) { err = -ERANGE; goto failed; } + memcpy(buffer, prefix, prefix_size); buffer += prefix_size; } err = squashfs_read_metadata(sb, buffer, &start, @@ -212,88 +213,45 @@ failed: } -/* - * User namespace support - */ -static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - if (list && XATTR_USER_PREFIX_LEN <= list_size) - memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); - return XATTR_USER_PREFIX_LEN; -} - -static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, - size_t size, int type) +static int squashfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *d, const char *name, + void *buffer, size_t size) { - if (name[0] == '\0') - return -EINVAL; - - return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name, + return squashfs_xattr_get(d_inode(d), handler->flags, name, buffer, size); } +/* + * User namespace support + */ static const struct xattr_handler squashfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = squashfs_user_list, - .get = squashfs_user_get + .flags = SQUASHFS_XATTR_USER, + .get = squashfs_xattr_handler_get }; /* * Trusted namespace support */ -static size_t squashfs_trusted_list(struct dentry *d, char *list, - size_t list_size, const char *name, size_t name_len, int type) +static bool squashfs_trusted_xattr_handler_list(struct dentry *d) { - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) - memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); - return XATTR_TRUSTED_PREFIX_LEN; -} - -static int squashfs_trusted_get(struct dentry *d, const char *name, - void *buffer, size_t size, int type) -{ - if (name[0] == '\0') - return -EINVAL; - - return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name, - buffer, size); + return capable(CAP_SYS_ADMIN); } static const struct xattr_handler squashfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = squashfs_trusted_list, - .get = squashfs_trusted_get + .flags = SQUASHFS_XATTR_TRUSTED, + .list = squashfs_trusted_xattr_handler_list, + .get = squashfs_xattr_handler_get }; /* * Security namespace support */ -static size_t squashfs_security_list(struct dentry *d, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) - memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); - return XATTR_SECURITY_PREFIX_LEN; -} - -static int squashfs_security_get(struct dentry *d, const char *name, - void *buffer, size_t size, int type) -{ - if (name[0] == '\0') - return -EINVAL; - - return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name, - buffer, size); -} - static const struct xattr_handler squashfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = squashfs_security_list, - .get = squashfs_security_get + .flags = SQUASHFS_XATTR_SECURITY, + .get = squashfs_xattr_handler_get }; static const struct xattr_handler *squashfs_xattr_handler(int type) diff --git a/fs/stat.c b/fs/stat.c index cccc1aab9a8b..bc045c7994e1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) +#define valid_dev(x) choose_32_64(old_valid_dev(x),true) #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) #ifndef INIT_STRUCT_STAT_PADDING @@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ - if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) - return -EOVERFLOW; tmp.st_dev = new_encode_dev(stat->dev); tmp.st_rdev = new_encode_dev(stat->rdev); #else diff --git a/fs/super.c b/fs/super.c index 954aeb80e202..1182af8fd5ff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1012,10 +1012,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, blkdev_put(bdev, mode); down_write(&s->s_umount); } else { - char b[BDEVNAME_SIZE]; - s->s_mode = mode; - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { @@ -1199,7 +1197,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) else ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - WARN_ON(force_trylock & !ret); + WARN_ON(force_trylock && !ret); return ret; } EXPORT_SYMBOL(__sb_start_write); diff --git a/fs/sync.c b/fs/sync.c index fbc98ee62044..dd5d1711c7ac 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) static void fdatawait_one_bdev(struct block_device *bdev, void *arg) { - filemap_fdatawait(bdev->bd_inode->i_mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); } /* @@ -343,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - ret = filemap_fdatawrite_range(mapping, offset, endbyte); + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); if (ret < 0) goto out_put; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6c95628ea377..f35523d4fa3a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; + size_t len; /* * If buf != of->prealloc_buf, we don't know how @@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, */ if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; - return ops->show(kobj, of->kn->priv, buf); + len = ops->show(kobj, of->kn->priv, buf); + return min(count, len); } /* kernfs write callback for regular sysfs files */ diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 39a019936768..dc1358b5ec95 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -73,13 +73,26 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, } if (grp->bin_attrs) { - for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { + umode_t mode = (*bin_attr)->attr.mode; + if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); + if (grp->is_bin_visible) { + mode = grp->is_bin_visible(kobj, *bin_attr, i); + if (!mode) + continue; + } + + WARN(mode & ~(SYSFS_PREALLOC | 0664), + "Attribute %s: Invalid permissions 0%o\n", + (*bin_attr)->attr.name, mode); + + mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, &(*bin_attr)->attr, true, - (*bin_attr)->attr.mode, NULL); + mode, NULL); if (error) break; } @@ -352,3 +365,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); + +/** + * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing + * to a group or an attribute + * @kobj: The kobject containing the group. + * @target_kobj: The target kobject. + * @target_name: The name of the target group or attribute. + */ +int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, + struct kobject *target_kobj, + const char *target_name) +{ + struct kernfs_node *target; + struct kernfs_node *entry; + struct kernfs_node *link; + + /* + * We don't own @target_kobj and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() + * for details. + */ + spin_lock(&sysfs_symlink_target_lock); + target = target_kobj->sd; + if (target) + kernfs_get(target); + spin_unlock(&sysfs_symlink_target_lock); + if (!target) + return -ENOENT; + + entry = kernfs_find_and_get(target_kobj->sd, target_name); + if (!entry) { + kernfs_put(target); + return -ENOENT; + } + + link = kernfs_create_link(kobj->sd, target_name, entry); + if (IS_ERR(link) && PTR_ERR(link) == -EEXIST) + sysfs_warn_dup(kobj->sd, target_name); + + kernfs_put(entry); + kernfs_put(target); + return IS_ERR(link) ? PTR_ERR(link) : 0; +} +EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 590ad9206e3f..d62c423a5a2d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi, static const struct inode_operations sysv_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = sysv_getattr, }; @@ -162,15 +161,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_fop = &sysv_dir_operations; inode->i_mapping->a_ops = &sysv_aops; } else if (S_ISLNK(inode->i_mode)) { - if (inode->i_blocks) { - inode->i_op = &sysv_symlink_inode_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else { - inode->i_op = &simple_symlink_inode_operations; - inode->i_link = (char *)SYSV_I(inode)->i_data; - nd_terminate_link(inode->i_link, inode->i_size, - sizeof(SYSV_I(inode)->i_data) - 1); - } + inode->i_op = &sysv_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &sysv_aops; } else init_special_inode(inode, inode->i_mode, rdev); } @@ -353,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cbc8d5d2755a..c66f2423e1f5 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) dput(dentry); dentry = ERR_PTR(-EEXIST); } - if (IS_ERR(dentry)) + + if (IS_ERR(dentry)) { mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + } + return dentry; } diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index ba66d508006a..7ff7712f284e 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB default y help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. + +config UBIFS_ATIME_SUPPORT + bool "Access time support" if UBIFS_FS + depends on UBIFS_FS + default n + help + Originally UBIFS did not support atime, because it looked like a bad idea due + increased flash wear. This option adds atime support and it is disabled by default + to preserve the old behavior. If you enable this option, UBIFS starts updating atime, + which means that file-system read operations will cause writes (inode atime + updates). This may affect file-system performance and increase flash device wear, + so be careful. How often atime is updated depends on the selected strategy: + strictatime is the "heavy", relatime is "lighter", etc. + + If unsure, say 'N' diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4c46a9865fa7..595ca0debe11 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, { int err, failing; - if (c->dbg->pc_happened) + if (dbg_is_power_cut(c)) return -EROFS; failing = power_cut_emulated(c, lnum, 1); @@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, { int err; - if (c->dbg->pc_happened) + if (dbg_is_power_cut(c)) return -EROFS; if (power_cut_emulated(c, lnum, 1)) return -EROFS; @@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum) { int err; - if (c->dbg->pc_happened) + if (dbg_is_power_cut(c)) return -EROFS; if (power_cut_emulated(c, lnum, 0)) return -EROFS; @@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum) { int err; - if (c->dbg->pc_happened) + if (dbg_is_power_cut(c)) return -EROFS; if (power_cut_emulated(c, lnum, 0)) return -EROFS; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5c27c66c224a..e49bd2808bf3 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) } out: + kfree(file->private_data); + file->private_data = NULL; + if (err != -ENOENT) { ubifs_err(c, "cannot find next direntry, error %d", err); return err; } - kfree(file->private_data); - file->private_data = NULL; /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; return 0; @@ -787,9 +788,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino); - if (!new_valid_dev(rdev)) - return -EINVAL; - if (S_ISBLK(mode) || S_ISCHR(mode)) { dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) @@ -1188,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = { .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, +#ifdef CONFIG_UBIFS_ATIME_SUPPORT + .update_time = ubifs_update_time, +#endif }; const struct file_operations ubifs_dir_operations = { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a3dfe2ae79f2..eff62801acbf 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode, return 0; } +#ifdef CONFIG_UBIFS_ATIME_SUPPORT +/** + * ubifs_update_time - update time of inode. + * @inode: inode to update + * + * This function updates time of the inode. + */ +int ubifs_update_time(struct inode *inode, struct timespec *time, + int flags) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + int iflags = I_DIRTY_TIME; + int err, release; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + + if (!(inode->i_sb->s_flags & MS_LAZYTIME)) + iflags |= I_DIRTY_SYNC; + + release = ui->dirty; + __mark_inode_dirty(inode, iflags); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + return 0; +} +#endif + /** * update_ctime - update mtime and ctime of an inode. * @inode: inode to update @@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) if (err) return err; vma->vm_ops = &ubifs_file_vm_ops; +#ifdef CONFIG_UBIFS_ATIME_SUPPORT + file_accessed(file); +#endif return 0; } @@ -1557,17 +1601,23 @@ const struct inode_operations ubifs_file_inode_operations = { .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, +#ifdef CONFIG_UBIFS_ATIME_SUPPORT + .update_time = ubifs_update_time, +#endif }; const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, +#ifdef CONFIG_UBIFS_ATIME_SUPPORT + .update_time = ubifs_update_time, +#endif }; const struct file_operations ubifs_file_operations = { diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 92a8491a8f8c..c0a95e393347 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -34,6 +34,12 @@ * node. We use "r5" hash borrowed from reiserfs. */ +/* + * Lot's of the key helpers require a struct ubifs_info *c as the first parameter. + * But we are not using it at all currently. That's designed for future extensions of + * different c->key_format. But right now, there is only one key type, UBIFS_SIMPLE_KEY_FMT. + */ + #ifndef __UBIFS_KEY_H__ #define __UBIFS_KEY_H__ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index dc9f27e9d61b..9a517109da0f 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, } /* nnode is being committed, so copy it */ - n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS); if (unlikely(!n)) return ERR_PTR(-ENOMEM); - memcpy(n, nnode, sizeof(struct ubifs_nnode)); n->cnext = NULL; __set_bit(DIRTY_CNODE, &n->flags); __clear_bit(COW_CNODE, &n->flags); @@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, } /* pnode is being committed, so copy it */ - p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS); if (unlikely(!p)) return ERR_PTR(-ENOMEM); - memcpy(p, pnode, sizeof(struct ubifs_pnode)); p->cnext = NULL; __set_bit(DIRTY_CNODE, &p->flags); __clear_bit(COW_CNODE, &p->flags); diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index ee7cb5ebb6e8..8ece6ca58c0b 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) */ static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) { - if (new_valid_dev(rdev)) { - dev->new = cpu_to_le32(new_encode_dev(rdev)); - return sizeof(dev->new); - } else { - dev->huge = cpu_to_le64(huge_encode_dev(rdev)); - return sizeof(dev->huge); - } + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); } /** diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 695fc71d5244..586d59347fff 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, corrupted_rescan: /* Re-scan the corrupted data with verbose messages */ ubifs_err(c, "corruption %d", ret); - ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + ubifs_scan_a_node(c, buf, len, lnum, offs, 0); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); err = -EUCLEAN; @@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c) struct size_entry *e, *n; rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { - if (e->inode) - iput(e->inode); + iput(e->inode); kfree(e); } @@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c) err = fix_size_in_place(c, e); if (err) return err; - if (e->inode) - iput(e->inode); + iput(e->inode); } } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9547a27868ad..a233ba913be4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_ino; - inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_flags |= S_NOCMTIME; +#ifndef CONFIG_UBIFS_ATIME_SUPPORT + inode->i_flags |= S_NOATIME; +#endif set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); @@ -2037,7 +2040,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; - sb->s_xattr = ubifs_xattr_handlers; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); @@ -2139,7 +2141,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, if (err) goto out_deact; /* We do not support atime */ - sb->s_flags |= MS_ACTIVE | MS_NOATIME; + sb->s_flags |= MS_ACTIVE; +#ifndef CONFIG_UBIFS_ATIME_SUPPORT + sb->s_flags |= MS_NOATIME; +#else + ubifs_msg(c, "full atime support is enabled."); +#endif } /* 'fill_super()' opens ubi again so we must close it here */ @@ -2241,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 957f5757f374..fa9a20cc60d6 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, { struct ubifs_znode *zn; - zn = kmalloc(c->max_znode_sz, GFP_NOFS); + zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS); if (unlikely(!zn)) return ERR_PTR(-ENOMEM); - memcpy(zn, znode, c->max_znode_sz); zn->cnext = NULL; __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index de759022f3d6..a5697de763f5 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -858,9 +858,9 @@ struct ubifs_compressor { * @mod_dent: non-zero if the operation removes or modifies an existing * directory entry * @new_ino: non-zero if the operation adds a new inode - * @new_ino_d: now much data newly created inode contains + * @new_ino_d: how much data newly created inode contains * @dirtied_ino: how many inodes the operation makes dirty - * @dirtied_ino_d: now much data dirtied inode contains + * @dirtied_ino_d: how much data dirtied inode contains * @idx_growth: how much the index will supposedly grow * @data_growth: how much new data the operation will supposedly add * @dd_growth: how much data that makes other data dirty the operation will @@ -1470,7 +1470,6 @@ extern spinlock_t ubifs_infos_lock; extern atomic_long_t ubifs_clean_zn_cnt; extern struct kmem_cache *ubifs_inode_slab; extern const struct super_operations ubifs_super_operations; -extern const struct xattr_handler *ubifs_xattr_handlers[]; extern const struct address_space_operations ubifs_file_address_operations; extern const struct file_operations ubifs_file_operations; extern const struct inode_operations ubifs_file_inode_operations; @@ -1746,6 +1745,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); +#ifdef CONFIG_UBIFS_ATIME_SUPPORT +int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); +#endif /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index fd65b3f1923c..e53292d0c21b 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -200,6 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, int err; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); + void *buf = NULL; struct ubifs_budget_req req = { .dirtied_ino = 2, .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; @@ -208,14 +209,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - kfree(ui->data); - ui->data = kmemdup(value, size, GFP_NOFS); - if (!ui->data) { + buf = kmemdup(value, size, GFP_NOFS); + if (!buf) { err = -ENOMEM; goto out_free; } + mutex_lock(&ui->ui_mutex); + kfree(ui->data); + ui->data = buf; inode->i_size = ui->ui_size = size; ui->data_len = size; + mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); @@ -263,7 +267,7 @@ static int check_namespace(const struct qstr *nm) if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { - if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0') return -EINVAL; type = TRUSTED_XATTR; } else if (!strncmp(nm->name, XATTR_USER_PREFIX, @@ -273,7 +277,7 @@ static int check_namespace(const struct qstr *nm) type = USER_XATTR; } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0') return -EINVAL; type = SECURITY_XATTR; } else @@ -409,6 +413,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, ubifs_assert(inode->i_size == ui->data_len); ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + mutex_lock(&ui->ui_mutex); if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { @@ -423,6 +428,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, err = ui->data_len; out_iput: + mutex_unlock(&ui->ui_mutex); iput(inode); out_unlock: kfree(xent); @@ -582,46 +588,6 @@ out_free: return err; } -static size_t security_listxattr(struct dentry *d, char *list, size_t list_size, - const char *name, size_t name_len, int flags) -{ - const int prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - - return total_len; -} - -static int security_getxattr(struct dentry *d, const char *name, void *buffer, - size_t size, int flags) -{ - return ubifs_getxattr(d, name, buffer, size); -} - -static int security_setxattr(struct dentry *d, const char *name, - const void *value, size_t size, int flags, - int handler_flags) -{ - return ubifs_setxattr(d, name, value, size, flags); -} - -static const struct xattr_handler ubifs_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = security_listxattr, - .get = security_getxattr, - .set = security_setxattr, -}; - -const struct xattr_handler *ubifs_xattr_handlers[] = { - &ubifs_xattr_security_handler, - NULL, -}; - static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6d6a96b4e73f..e0fd65fe73e8 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; - struct allocExtDesc *aed; eloc.logicalBlockNum = start; elen = EXT_RECORDED_ALLOCATED | @@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.offset + (2 * adsize) > sb->s_blocksize) { - unsigned char *sptr, *dptr; - int loffset; - - brelse(oepos.bh); - oepos = epos; - /* Steal a block from the extent being free'd */ - epos.block.logicalBlockNum = eloc.logicalBlockNum; + udf_setup_indirect_aext(table, eloc.logicalBlockNum, + &epos); + eloc.logicalBlockNum++; elen -= sb->s_blocksize; - - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &epos.block, 0)); - if (!epos.bh) { - brelse(oepos.bh); - goto error_return; - } - aed = (struct allocExtDesc *)(epos.bh->b_data); - aed->previousAllocExtLocation = - cpu_to_le32(oepos.block.logicalBlockNum); - if (epos.offset + adsize > sb->s_blocksize) { - loffset = epos.offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = iinfo->i_ext.i_data + epos.offset - - adsize; - dptr = epos.bh->b_data + - sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos.offset = sizeof(struct allocExtDesc) + - adsize; - } else { - loffset = epos.offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - if (oepos.bh) { - sptr = oepos.bh->b_data + epos.offset; - aed = (struct allocExtDesc *) - oepos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, - adsize); - } else { - sptr = iinfo->i_ext.i_data + - epos.offset; - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } - epos.offset = sizeof(struct allocExtDesc); - } - if (sbi->s_udfrev >= 0x0200) - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 3, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - else - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 2, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos.block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - lad->extLocation = - cpu_to_lelb(epos.block); - break; - } - if (oepos.bh) { - udf_update_tag(oepos.bh->b_data, loffset); - mark_buffer_dirty(oepos.bh); - } else { - mark_inode_dirty(table); - } } /* It's possible that stealing the block emptied the extent */ - if (elen) { - udf_write_aext(table, &epos, &eloc, elen, 1); - - if (!epos.bh) { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } else { - aed = (struct allocExtDesc *)epos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - udf_update_tag(epos.bh->b_data, epos.offset); - mark_buffer_dirty(epos.bh); - } - } + if (elen) + __udf_add_aext(table, &epos, &eloc, elen, 1); } brelse(epos.bh); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8d0b3ade0ff0..87dc16d15572 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode, udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; - } else + } else { + struct kernel_lb_addr tmploc; + uint32_t tmplen; + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); + /* + * We've rewritten the last extent but there may be empty + * indirect extent after it - enter it. + */ + udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); + } /* Managed to do everything necessary? */ if (!blocks) @@ -1540,7 +1549,8 @@ reread: break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; break; case ICBTAG_FILE_TYPE_MAIN: @@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, return inode; } -int udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos) { - int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; struct allocExtDesc *aed; - uint8_t *ptr; - struct udf_inode_info *iinfo = UDF_I(inode); + struct extent_position nepos; + struct kernel_lb_addr neloc; + int ver, adsize; - if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - - udf_file_entry_alloc_offset(inode) + - iinfo->i_lenEAttr; + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); else - ptr = epos->bh->b_data + epos->offset; + return -EIO; + + neloc.logicalBlockNum = block; + neloc.partitionReferenceNum = epos->block.partitionReferenceNum; + + bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + if (!bh) + return -EIO; + lock_buffer(bh); + memset(bh->b_data, 0x00, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + + aed = (struct allocExtDesc *)(bh->b_data); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { + aed->previousAllocExtLocation = + cpu_to_le32(epos->block.logicalBlockNum); + } + aed->lengthAllocDescs = cpu_to_le32(0); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + ver = 3; + else + ver = 2; + udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, + sizeof(struct tag)); + + nepos.block = neloc; + nepos.offset = sizeof(struct allocExtDesc); + nepos.bh = bh; + + /* + * Do we have to copy current last extent to make space for indirect + * one? + */ + if (epos->offset + adsize > sb->s_blocksize) { + struct kernel_lb_addr cp_loc; + uint32_t cp_len; + int cp_type; + + epos->offset -= adsize; + cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0); + cp_len |= ((uint32_t)cp_type) << 30; + + __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); + udf_write_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } else { + __udf_add_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } + + brelse(epos->bh); + *epos = nepos; + + return 0; +} + +/* + * Append extent at the given position - should be the first free one in inode + * / indirect extent. This function assumes there is enough space in the inode + * or indirect extent. Use udf_add_aext() if you didn't check for this before. + */ +int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + struct allocExtDesc *aed; + int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, else return -EIO; - if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { - unsigned char *sptr, *dptr; - struct buffer_head *nbh; - int err, loffset; - struct kernel_lb_addr obloc = epos->block; - - epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, - obloc.partitionReferenceNum, - obloc.logicalBlockNum, &err); - if (!epos->block.logicalBlockNum) - return -ENOSPC; - nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - &epos->block, - 0)); - if (!nbh) - return -EIO; - lock_buffer(nbh); - memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(nbh); - unlock_buffer(nbh); - mark_buffer_dirty_inode(nbh, inode); - - aed = (struct allocExtDesc *)(nbh->b_data); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - aed->previousAllocExtLocation = - cpu_to_le32(obloc.logicalBlockNum); - if (epos->offset + adsize > inode->i_sb->s_blocksize) { - loffset = epos->offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = ptr - adsize; - dptr = nbh->b_data + sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos->offset = sizeof(struct allocExtDesc) + adsize; - } else { - loffset = epos->offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - sptr = ptr; - epos->offset = sizeof(struct allocExtDesc); - - if (epos->bh) { - aed = (struct allocExtDesc *)epos->bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - } else { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(inode); - } - } - if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - else - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos->block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - lad->extLocation = cpu_to_lelb(epos->block); - memset(lad->impUse, 0x00, sizeof(lad->impUse)); - break; - } - if (epos->bh) { - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos->bh->b_data, loffset); - else - udf_update_tag(epos->bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos->bh, inode); - brelse(epos->bh); - } else { - mark_inode_dirty(inode); - } - epos->bh = nbh; + if (!epos->bh) { + WARN_ON(iinfo->i_lenAlloc != + epos->offset - udf_file_entry_alloc_offset(inode)); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != + epos->offset - sizeof(struct allocExtDesc)); + WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); @@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, return 0; } +/* + * Append extent at given position - should be the first free one in inode + * / indirect extent. Takes care of allocating and linking indirect blocks. + */ +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct super_block *sb = inode->i_sb; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > sb->s_blocksize) { + int err; + int new_block; + + new_block = udf_new_block(sb, NULL, + epos->block.partitionReferenceNum, + epos->block.logicalBlockNum, &err); + if (!new_block) + return -ENOSPC; + + err = udf_setup_indirect_aext(inode, new_block, epos); + if (err) + return err; + } + + return __udf_add_aext(inode, epos, eloc, elen, inc); +} + void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { @@ -2047,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos, epos->offset += adsize; } +/* + * Only 1 indirect extent in a row really makes sense but allow upto 16 in case + * someone does some weird stuff. + */ +#define UDF_MAX_INDIR_EXTS 16 + int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; + unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { int block; + + if (++indirections > UDF_MAX_INDIR_EXTS) { + udf_err(inode->i_sb, + "too many indirect extents in inode %lu\n", + inode->i_ino); + return -1; + } + epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c97b5a8d1e24..42eafb91f7ff 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -921,7 +921,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; @@ -1344,8 +1345,3 @@ const struct inode_operations udf_dir_inode_operations = { .rename = udf_rename, .tmpfile = udf_tmpfile, }; -const struct inode_operations udf_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b445b..0fbb4c7c72e8 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; @@ -1586,6 +1587,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* + * Maximum number of Terminating Descriptor redirections. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_TD_NESTING 64 + +/* * Process a main/reserve volume descriptor sequence. * @block First block of first extent of the sequence. * @lastblock Lastblock of first extent of the sequence. @@ -1609,6 +1617,7 @@ static noinline int udf_process_sequence( uint16_t ident; long next_s = 0, next_e = 0; int ret; + unsigned int indirections = 0; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1679,6 +1688,12 @@ static noinline int udf_process_sequence( } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; + } + vds[VDS_POS_TERMINATING_DESC].block = block; if (next_e) { block = next_s; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 862535b3ba58..8d619773056b 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct buffer_head *bh = NULL; unsigned char *symlink; int err; - unsigned char *p = kmap(page); + unsigned char *p = page_address(page); struct udf_inode_info *iinfo; uint32_t pos; @@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page) up_read(&iinfo->i_data_sem); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -149,7 +148,6 @@ out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); out_unmap: - kunmap(page); unlock_page(page); return err; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 47bb3f5ca360..fa0044b6b81d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; -extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; @@ -159,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos); +extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc); extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index ab478e62baae..e788a05aab83 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) if (c < 0x80U) utf_o->u_name[utf_o->u_len++] = (uint8_t)c; else if (c < 0x800U) { + if (utf_o->u_len > (UDF_NAME_LEN - 4)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); } else { + if (utf_o->u_len > (UDF_NAME_LEN - 5)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); utf_o->u_name[utf_o->u_len++] = @@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) { unsigned c, i, max_val, utf_char; - int utf_cnt, u_len; + int utf_cnt, u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; utf_char = 0U; utf_cnt = 0U; for (i = 0U; i < utf->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; + c = (uint8_t)utf->u_name[i]; /* Complete a multi-byte UTF-8 character */ @@ -225,6 +234,7 @@ try_again: if (max_val == 0xffU) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } goto error_out; @@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, c = (c << 8) | ocu[i++]; len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], - UDF_NAME_LEN - utf_o->u_len); + UDF_NAME_LEN - 2 - utf_o->u_len); /* Valid character? */ if (len >= 0) utf_o->u_len += len; @@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int len; unsigned i, max_val; uint16_t uni_char; - int u_len; + int u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; for (i = 0U; i < uni->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); if (!len) continue; @@ -316,6 +330,7 @@ try_again: if (uni_char > max_val) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 392db25c0b56..ec4a6b49fa13 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o util.o + namei.o super.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a064cf44b143..d897e169ab9c 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -528,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode) inode->i_mapping->a_ops = &ufs_aops; } else if (S_ISLNK(inode->i_mode)) { if (!inode->i_blocks) { - inode->i_op = &ufs_fast_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + inode->i_op = &simple_symlink_inode_operations; } else { - inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); } } else init_special_inode(inode, inode->i_mode, diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 47966554317c..acf4a3b61b81 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ - inode->i_op = &ufs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &ufs_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; } else { /* fast symlink */ - inode->i_op = &ufs_fast_symlink_inode_operations; + inode->i_op = &simple_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; memcpy(inode->i_link, symname, l); inode->i_size = l-1; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..442fd52ebffe 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c deleted file mode 100644 index 874480bb43e9..000000000000 --- a/fs/ufs/symlink.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * linux/fs/ufs/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. AV, 1999 - * - * Copyright (C) 1998 - * Daniel Pirkl <daniel.pirkl@emai.cz> - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/symlink.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext2 symlink handling code - */ - -#include "ufs_fs.h" -#include "ufs.h" - -const struct inode_operations ufs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = simple_follow_link, - .setattr = ufs_setattr, -}; - -const struct inode_operations ufs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 7da4aca868c0..c87f4c3fa9dd 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -136,10 +136,6 @@ extern __printf(3, 4) void ufs_panic(struct super_block *, const char *, const char *, ...); void ufs_mark_sb_dirty(struct super_block *sb); -/* symlink.c */ -extern const struct inode_operations ufs_fast_symlink_inode_operations; -extern const struct inode_operations ufs_symlink_inode_operations; - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/xattr.c b/fs/xattr.c index 072fee1258dd..d5dd6c8b82a7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -208,25 +208,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, return error; } -/* Compare an extended attribute value with the given value */ -int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, - const char *value, size_t size, gfp_t flags) -{ - char *xattr_value = NULL; - int rc; - - rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); - if (rc < 0) - return rc; - - if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) - rc = -EINVAL; - else - rc = 0; - kfree(xattr_value); - return rc; -} - ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { @@ -324,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, { int error; void *kvalue = NULL; - void *vvalue = NULL; /* If non-NULL, we used vmalloc() */ char kname[XATTR_NAME_MAX + 1]; if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) @@ -341,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, return -E2BIG; kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - vvalue = vmalloc(size); - if (!vvalue) + kvalue = vmalloc(size); + if (!kvalue) return -ENOMEM; - kvalue = vvalue; } if (copy_from_user(kvalue, value, size)) { error = -EFAULT; @@ -357,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, error = vfs_setxattr(d, kname, kvalue, size, flags); out: - if (vvalue) - vfree(vvalue); - else - kfree(kvalue); + kvfree(kvalue); + return error; } @@ -428,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, { ssize_t error; void *kvalue = NULL; - void *vvalue = NULL; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); @@ -442,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, size = XATTR_SIZE_MAX; kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - vvalue = vmalloc(size); - if (!vvalue) + kvalue = vmalloc(size); + if (!kvalue) return -ENOMEM; - kvalue = vvalue; } } @@ -461,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - if (vvalue) - vfree(vvalue); - else - kfree(kvalue); + + kvfree(kvalue); + return error; } @@ -521,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; - char *vlist = NULL; /* If non-NULL, we used vmalloc() */ if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); if (!klist) { - vlist = vmalloc(size); - if (!vlist) + klist = vmalloc(size); + if (!klist) return -ENOMEM; - klist = vlist; } } @@ -544,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } - if (vlist) - vfree(vlist); - else - kfree(klist); + + kvfree(klist); + return error; } @@ -700,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) return NULL; for_each_xattr_handler(handlers, handler) { - const char *n = strcmp_prefix(*name, handler->prefix); + const char *n; + + n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { + if (!handler->prefix ^ !*n) { + if (*n) + continue; + return ERR_PTR(-EINVAL); + } *name = n; - break; + return handler; } } - return handler; + return ERR_PTR(-EOPNOTSUPP); } /* @@ -718,9 +696,9 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; - return handler->get(dentry, name, buffer, size, handler->flags); + if (IS_ERR(handler)) + return PTR_ERR(handler); + return handler->get(handler, dentry, name, buffer, size); } /* @@ -735,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!buffer) { for_each_xattr_handler(handlers, handler) { - size += handler->list(dentry, NULL, 0, NULL, 0, - handler->flags); + if (!handler->name || + (handler->list && !handler->list(dentry))) + continue; + size += strlen(handler->name) + 1; } } else { char *buf = buffer; + size_t len; for_each_xattr_handler(handlers, handler) { - size = handler->list(dentry, buf, buffer_size, - NULL, 0, handler->flags); - if (size > buffer_size) + if (!handler->name || + (handler->list && !handler->list(dentry))) + continue; + len = strlen(handler->name); + if (len + 1 > buffer_size) return -ERANGE; - buf += size; - buffer_size -= size; + memcpy(buf, handler->name, len + 1); + buf += len + 1; + buffer_size -= len + 1; } size = buf - buffer; } @@ -765,9 +749,9 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz if (size == 0) value = ""; /* empty EA, do not remove */ handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; - return handler->set(dentry, name, value, size, flags, handler->flags); + if (IS_ERR(handler)) + return PTR_ERR(handler); + return handler->set(handler, dentry, name, value, size, flags); } /* @@ -780,10 +764,9 @@ generic_removexattr(struct dentry *dentry, const char *name) const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; - return handler->set(dentry, name, NULL, 0, - XATTR_REPLACE, handler->flags); + if (IS_ERR(handler)) + return PTR_ERR(handler); + return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); @@ -791,6 +774,30 @@ EXPORT_SYMBOL(generic_listxattr); EXPORT_SYMBOL(generic_setxattr); EXPORT_SYMBOL(generic_removexattr); +/** + * xattr_full_name - Compute full attribute name from suffix + * + * @handler: handler of the xattr_handler operation + * @name: name passed to the xattr_handler operation + * + * The get and set xattr handler operations are called with the remainder of + * the attribute name after skipping the handler's prefix: for example, "foo" + * is passed to the get operation of a handler with prefix "user." to get + * attribute "user.foo". The full name is still "there" in the name though. + * + * Note: the list xattr handler operation when called from the vfs is passed a + * NULL name; some file systems use this operation internally, with varying + * semantics. + */ +const char *xattr_full_name(const struct xattr_handler *handler, + const char *name) +{ + size_t prefix_len = strlen(xattr_prefix(handler)); + + return name - prefix_len; +} +EXPORT_SYMBOL(xattr_full_name); + /* * Allocate new xattr and copy in the value; but leave the name to callers. */ @@ -840,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, return ret; } -static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the extended attribute + * @value: value of the xattr. If %NULL, will remove the attribute. + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -891,73 +912,64 @@ out: } -/** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the new extended attribute - * @value: value of the new xattr. If %NULL, will remove the attribute - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. - * - * Returns 0 on success, -errno on failure. - */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) -{ - if (size == 0) - value = ""; /* empty EA, do not remove */ - return __simple_xattr_set(xattrs, name, value, size, flags); -} - -/* - * xattr REMOVE operation for in-memory/pseudo filesystems - */ -int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) +static bool xattr_is_trusted(const char *name) { - return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static bool xattr_is_trusted(const char *name) +static int xattr_list_one(char **buffer, ssize_t *remaining_size, + const char *name) { - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + size_t len = strlen(name) + 1; + if (*buffer) { + if (*remaining_size < len) + return -ERANGE; + memcpy(*buffer, name, len); + *buffer += len; + } + *remaining_size -= len; + return 0; } /* * xattr LIST operation for in-memory/pseudo filesystems */ -ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, - size_t size) +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, + char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; - size_t used = 0; + ssize_t remaining_size = size; + int err; + +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } +#endif spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { - size_t len; - /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } + err = xattr_list_one(&buffer, &remaining_size, xattr->name); + if (err) + return err; } spin_unlock(&xattrs->lock); - return used; + return size - remaining_size; } /* diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a096841bd06c..f64639176670 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ xfs_sysfs.o \ @@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o -xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a7a3a63bb360..686ba6fb20dd 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) return ptr; if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", - __func__, lflags); + "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", + current->comm, current->pid, + (unsigned int)size, __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } @@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) return ptr; if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", - __func__, lflags); + "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + current->comm, current->pid, + __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..d1c66e465ca5 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ffad7f20342f..a708e38b494c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -482,7 +482,9 @@ xfs_agfl_verify( be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) return false; } - return true; + + return xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)); } static void @@ -533,6 +535,7 @@ xfs_agfl_write_verify( } const struct xfs_buf_ops xfs_agfl_buf_ops = { + .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, }; @@ -651,8 +654,8 @@ xfs_alloc_ag_vextent( -((long)(args->len))); } - XFS_STATS_INC(xs_allocx); - XFS_STATS_ADD(xs_allocb, args->len); + XFS_STATS_INC(args->mp, xs_allocx); + XFS_STATS_ADD(args->mp, xs_allocb, args->len); return error; } @@ -1808,8 +1811,8 @@ xfs_free_ag_extent( if (!isfl) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); - XFS_STATS_INC(xs_freex); - XFS_STATS_ADD(xs_freeb, len); + XFS_STATS_INC(mp, xs_freex); + XFS_STATS_ADD(mp, xs_freeb, len); trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); @@ -1924,7 +1927,7 @@ xfs_alloc_space_available( * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ -STATIC int /* error */ +int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ int flags) /* XFS_ALLOC_FLAG_... */ @@ -2259,9 +2262,13 @@ xfs_agf_verify( { struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (!xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) return false; + } if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && @@ -2333,6 +2340,7 @@ xfs_agf_write_verify( } const struct xfs_buf_ops xfs_agf_buf_ops = { + .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, }; @@ -2503,7 +2511,7 @@ xfs_alloc_vextent( * Try near allocation first, then anywhere-in-ag after * the first a.g. fails. */ - if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && (mp->m_flags & XFS_MOUNT_32BITINODES)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % @@ -2634,6 +2642,14 @@ xfs_alloc_vextent( XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); #endif + + /* Zero the extent if we were asked to do so */ + if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { + error = xfs_zero_extent(args->ip, args->fsbno, args->len); + if (error) + goto error0; + } + } xfs_perag_put(args->pag); return 0; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index ca1c8168373a..135eb3d24db7 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg { struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for a.g. freelist header */ struct xfs_perag *pag; /* per-ag struct for this agno */ + struct xfs_inode *ip; /* for userdata zeroing method */ xfs_fsblock_t fsbno; /* file system block number */ xfs_agnumber_t agno; /* allocation group number */ xfs_agblock_t agbno; /* allocation group-relative block # */ @@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg { char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ char isfl; /* set if is freelist blocks - !acctg */ - char userdata; /* set if this is user data */ + char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ } xfs_alloc_arg_t; /* * Defines for userdata */ -#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ -#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need); @@ -233,5 +235,6 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 90de071dd4c2..444626ddbd1b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -293,14 +293,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): @@ -311,14 +304,7 @@ xfs_allocbt_verify( return false; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): @@ -332,21 +318,7 @@ xfs_allocbt_verify( return false; } - /* numrecs verification */ - if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - - return true; + return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -379,6 +351,7 @@ xfs_allocbt_write_verify( } const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ff065578969f..fa3b948ef9c2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -125,7 +125,7 @@ xfs_attr_get( uint lock_mode; int error; - XFS_STATS_INC(xs_attr_get); + XFS_STATS_INC(ip->i_mount, xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; @@ -207,9 +207,9 @@ xfs_attr_set( struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, committed, local; + int error, err2, local; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(mp, xs_attr_set); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; @@ -334,25 +334,15 @@ xfs_attr_set( */ xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) { - error = xfs_bmap_finish(&args.trans, args.flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args.trans, args.flist, dp); if (error) { - ASSERT(committed); args.trans = NULL; xfs_bmap_cancel(&flist); goto out; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args.trans, dp, 0); - - /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ @@ -412,7 +402,7 @@ xfs_attr_remove( xfs_fsblock_t firstblock; int error; - XFS_STATS_INC(xs_attr_remove); + XFS_STATS_INC(mp, xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; @@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_leaf_addname(args); @@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the current trans (including the inode) and start * a new one. */ @@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int error, committed, forkoff; + int error, forkoff; trace_xfs_attr_leaf_removename(args); @@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } return 0; } @@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; xfs_mount_t *mp; - int committed, retval, error; + int retval, error; trace_xfs_attr_node_addname(args); @@ -938,27 +897,16 @@ restart: state = NULL; xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the node conversion and start the next * trans in the chain. */ @@ -977,23 +925,13 @@ restart: */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_split(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1086,25 +1024,14 @@ restart: if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_node_removename(args); @@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - /* * Commit the Btree join operation and start a new trans. */ @@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else xfs_trans_brelse(args->trans, bp); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 33df52d97ec7..01a5ecfedfcf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -41,6 +41,7 @@ #include "xfs_buf_item.h" #include "xfs_cksum.h" #include "xfs_dir2.h" +#include "xfs_log.h" /* @@ -266,6 +267,8 @@ xfs_attr3_leaf_verify( return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) + return false; } else { if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; @@ -325,6 +328,7 @@ xfs_attr3_leaf_read_verify( } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index f38f9bd81557..a572532a55cd 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -107,7 +107,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; @@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify( } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, }; @@ -447,8 +448,6 @@ xfs_attr_rmtval_set( * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { - int committed; - /* * Allocate a single extent, up to the size of the value. * @@ -466,24 +465,14 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -614,31 +603,20 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - int committed; - xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->flist, &done); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + args->dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* * Close out trans and start the next one in the chain. */ error = xfs_trans_roll(&args->trans, args->dp); diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59646..0a94cce5ea35 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -32,13 +32,13 @@ int xfs_bitmap_empty(uint *map, uint size) { uint i; - uint ret = 0; for (i = 0; i < size; i++) { - ret |= map[i]; + if (map[i] != 0) + return 0; } - return (ret == 0); + return 1; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8e2010d53b07..ef00156f4f96 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -325,9 +325,11 @@ xfs_check_block( /* * Check that the extents for the inode ip are in the right order in all - * btree leaves. + * btree leaves. THis becomes prohibitively expensive for large extent count + * files, so don't bother with inodes that have more than 10,000 extents in + * them. The btree record ordering checks will still be done, so for such large + * bmapbt constructs that is going to catch most corruptions. */ - STATIC void xfs_bmap_check_leaf_extents( xfs_btree_cur_t *cur, /* btree cursor or null */ @@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents( return; } + /* skip large extent count inodes */ + if (ip->i_d.di_nextents > 10000) + return; + bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -948,14 +954,16 @@ xfs_bmap_local_to_extents( bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); /* - * Initialise the block and copy the data + * Initialize the block, copy the data and log the remote buffer. * - * Note: init_fn must set the buffer log item type correctly! + * The callout is responsible for logging because the remote format + * might differ from the local format and thus we don't know how much to + * log here. Note that init_fn must also set the buffer log item type + * correctly. */ init_fn(tp, bp, ip, ifp); - /* account for the change in fork size and log everything */ - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; @@ -1109,7 +1117,6 @@ xfs_bmap_add_attrfork( xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1212,7 +1219,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1435,7 +1442,7 @@ xfs_bmap_search_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - XFS_STATS_INC(xs_look_exlist); + XFS_STATS_INC(ip->i_mount, xs_look_exlist); ifp = XFS_IFORK_PTR(ip, fork); ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); @@ -1721,10 +1728,11 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; - ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + mp = bma->ip->i_mount; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1732,7 +1740,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] @@ -1783,7 +1791,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2014,10 +2022,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, - &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2098,10 +2106,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, - &tmp_rval, XFS_DATA_FORK); + &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2167,10 +2175,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - 1, &tmp_rval, XFS_DATA_FORK); + 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2213,13 +2221,13 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - da_old > 0, &tmp_logflags, XFS_DATA_FORK); + da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) goto done; @@ -2240,7 +2248,7 @@ xfs_bmap_add_extent_delay_real( if (bma->cur) bma->cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: bma->logflags |= rval; return error; @@ -2286,7 +2294,7 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] @@ -2937,7 +2945,7 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; + mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -2946,7 +2954,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) @@ -3800,8 +3808,13 @@ xfs_bmap_btalloc( args.wasdel = ap->wasdel; args.isfl = 0; args.userdata = ap->userdata; - if ((error = xfs_alloc_vextent(&args))) + if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) + args.ip = ap->ip; + + error = xfs_alloc_vextent(&args); + if (error) return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { /* * Exact allocation failed. Now try with alignment @@ -4036,7 +4049,7 @@ xfs_bmapi_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - XFS_STATS_INC(xs_blk_mapr); + XFS_STATS_INC(mp, xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); @@ -4221,7 +4234,7 @@ xfs_bmapi_delay( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - XFS_STATS_INC(xs_blk_mapw); + XFS_STATS_INC(mp, xs_blk_mapw); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); @@ -4300,11 +4313,14 @@ xfs_bmapi_allocate( /* * Indicate if this is the first user data in the file, or just any - * user data. + * user data. And if it is userdata, indicate whether it needs to + * be initialised to zero during allocation. */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + if (bma->flags & XFS_BMAPI_ZERO) + bma->userdata |= XFS_ALLOC_USERDATA_ZERO; } bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; @@ -4419,6 +4435,17 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + /* + * Before insertion into the bmbt, zero the range being converted + * if required. + */ + if (flags & XFS_BMAPI_ZERO) { + error = xfs_zero_extent(bma->ip, mval->br_startblock, + mval->br_blockcount); + if (error) + return error; + } + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); @@ -4512,6 +4539,18 @@ xfs_bmapi_write( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* zeroing is for currently only for data extents, not metadata */ + ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != + (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)); + /* + * we can allocate unwritten extents or pre-zero allocated blocks, + * but it makes no sense to do both at once. This would result in + * zeroing the unwritten extent twice, but it still being an + * unwritten extent.... + */ + ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), @@ -4525,7 +4564,7 @@ xfs_bmapi_write( ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(xs_blk_mapw); + XFS_STATS_INC(mp, xs_blk_mapw); if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) @@ -4718,12 +4757,12 @@ xfs_bmap_del_extent( xfs_filblks_t temp2; /* for indirect length calculations */ int state = 0; - XFS_STATS_INC(xs_del_exlist); + mp = ip->i_mount; + XFS_STATS_INC(mp, xs_del_exlist); if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; - mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); @@ -5070,7 +5109,7 @@ xfs_bunmapi( *done = 1; return 0; } - XFS_STATS_INC(xs_blk_unmap); + XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; @@ -5917,7 +5956,6 @@ xfs_bmap_split_extent( struct xfs_trans *tp; struct xfs_bmap_free free_list; xfs_fsblock_t firstfsb; - int committed; int error; tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); @@ -5938,7 +5976,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 6aaa0c1c7200..423a34e832bd 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -52,9 +52,9 @@ struct xfs_bmalloca { xfs_extlen_t minleft; /* amount must be left after alloc */ bool eof; /* set if allocating past last extent */ bool wasdel; /* replacing a delayed allocation */ - bool userdata;/* set if is user data */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ + char userdata;/* userdata mask */ int flags; }; @@ -109,6 +109,14 @@ typedef struct xfs_bmap_free */ #define XFS_BMAPI_CONVERT 0x040 +/* + * allocate zeroed extents - this requires all newly allocated user data extents + * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set. + * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found + * during the allocation range to zeroed written extents. + */ +#define XFS_BMAPI_ZERO 0x080 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -116,7 +124,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_ZERO, "ZERO" } static inline int xfs_bmapi_aflag(int w) @@ -186,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); + struct xfs_inode *ip); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6b0cf6546a82..1637c37bfbaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -720,6 +720,7 @@ xfs_bmbt_write_verify( } const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f7d7ee7a2607..a0eb18ce3ad3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -32,6 +32,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_alloc.h" +#include "xfs_log.h" /* * Cursor allocation zone. @@ -222,7 +223,7 @@ xfs_btree_check_ptr( * long-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put - * it into the buffer so recovery knows what the last modifcation was that made + * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void @@ -243,8 +244,14 @@ bool xfs_btree_lblock_verify_crc( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + } return true; } @@ -254,7 +261,7 @@ xfs_btree_lblock_verify_crc( * short-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put - * it into the buffer so recovery knows what the last modifcation was that made + * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void @@ -275,8 +282,14 @@ bool xfs_btree_sblock_verify_crc( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + } return true; } @@ -4067,3 +4080,61 @@ xfs_btree_change_owner( return 0; } + +/** + * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * btree block + * + * @bp: buffer containing the btree block + * @max_recs: pointer to the m_*_mxr max records field in the xfs mount + * @pag_max_level: pointer to the per-ag max level field + */ +bool +xfs_btree_sblock_v5hdr_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + return true; +} + +/** + * xfs_btree_sblock_verify() -- verify a short-format btree block + * + * @bp: buffer containing the btree block + * @max_recs: maximum records allowed in this btree node + */ +bool +xfs_btree_sblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 8f18bab73ea5..2e874be70209 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -84,31 +84,38 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(type, stat) \ - XFS_STATS_INC(xs_ ## type ## _2_ ## stat) -#define XFS_BTREE_STATS_INC(cur, stat) \ +#define __XFS_BTREE_STATS_INC(mp, type, stat) \ + XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ do { \ + struct xfs_mount *__mp = cur->bc_mp; \ switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) -#define __XFS_BTREE_STATS_ADD(type, stat, val) \ - XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ + XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ do { \ + struct xfs_mount *__mp = cur->bc_mp; \ switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + case XFS_BTNUM_BNO: \ + __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ + case XFS_BTNUM_CNT: \ + __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: \ + __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ + case XFS_BTNUM_INO: \ + __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ + case XFS_BTNUM_FINO: \ + __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -465,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) +bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index be43248a5822..097bf7717d80 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -39,6 +39,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_buf_item.h" +#include "xfs_log.h" /* * xfs_da_btree.c @@ -150,6 +151,8 @@ xfs_da3_node_verify( return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) + return false; } else { if (ichdr.magic != XFS_DA_NODE_MAGIC) return false; @@ -242,6 +245,7 @@ xfs_da3_node_read_verify( } const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, }; @@ -322,6 +326,7 @@ xfs_da3_node_create( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(bp->b_bn); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 9de401d297e5..2fb53a5c0a74 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -271,7 +271,7 @@ xfs_dir_createname( rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; - XFS_STATS_INC(xs_dir_create); + XFS_STATS_INC(dp->i_mount, xs_dir_create); } args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); @@ -365,7 +365,7 @@ xfs_dir_lookup( int lock_mode; ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_lookup); + XFS_STATS_INC(dp->i_mount, xs_dir_lookup); /* * We need to use KM_NOFS here so that lockdep will not throw false @@ -444,7 +444,7 @@ xfs_dir_removename( int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_remove); + XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 4778d1dd511a..aa17cb788946 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Local function prototypes. @@ -71,6 +72,8 @@ xfs_dir3_block_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) return false; @@ -120,6 +123,7 @@ xfs_dir3_block_write_verify( } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 824131e71bc5..725fc7841fde 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -31,6 +31,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Check the consistency of the data block. @@ -224,6 +225,8 @@ xfs_dir3_data_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) return false; @@ -302,11 +305,13 @@ xfs_dir3_data_write_verify( } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .name = "xfs_dir3_data_reada", .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index f300240ebb8d..b887fb2a2bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -33,6 +33,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Local function declarations. @@ -164,6 +165,8 @@ xfs_dir3_leaf_verify( return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) + return false; } else { if (leaf->hdr.info.magic != cpu_to_be16(magic)) return false; @@ -242,11 +245,13 @@ xfs_dir3_leafn_write_verify( } const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index cc28e924545b..63ee03db796c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -33,6 +33,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Function declarations. @@ -97,6 +98,8 @@ xfs_dir3_free_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) return false; @@ -147,6 +150,7 @@ xfs_dir3_free_write_verify( } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5331b7f0460c..3cc3cf767474 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -54,7 +54,7 @@ xfs_dqcheck( xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ uint flags, - char *str) + const char *str) { xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; int errs = 0; @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc( STATIC bool xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + int warn) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; xfs_dqid_t id = 0; @@ -240,8 +241,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); if (error) return false; } @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify( if (!xfs_dquot_buf_verify_crc(mp, bp)) xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp)) + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify( } /* + * readahead errors are silent and simply leave the buffer as !done so a real + * read will then be run with the xfs_dquot_buf_ops verifier. See + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than + * reporting the failure. + */ +static void +xfs_dquot_buf_readahead_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || + !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + } +} + +/* * we don't calculate the CRC here as that is done when the dquot is flushed to * the buffer after the update is done. This ensures that the dquot in the * buffer always has an up-to-date CRC value. @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify(mp, bp)) { + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify( } const struct xfs_buf_ops xfs_dquot_buf_ops = { + .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, }; +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { + .name = "xfs_dquot_ra", + .verify_read = xfs_dquot_buf_readahead_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9590a069e556..e2536bb1c760 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -60,6 +60,14 @@ struct xfs_ifork; #define XFS_SB_VERSION_MOREBITSBIT 0x8000 /* + * The size of a single extended attribute on disk is limited by + * the size of index values within the attribute entries themselves. + * These are be16 fields, so we can only support attribute data + * sizes up to 2^16 bytes in length. + */ +#define XFS_XATTR_SIZE_MAX (1 << 16) + +/* * Supported feature bit list is just all bits in the versionnum field because * we've used them all up and understand them all. Except, of course, for the * shared superblock bit, which nobody knows what it does and so is unsupported. @@ -778,7 +786,7 @@ typedef struct xfs_agfl { __be64 agfl_lsn; __be32 agfl_crc; __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; +} __attribute__((packed)) xfs_agfl_t; #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -1483,13 +1491,17 @@ struct xfs_acl { */ #define XFS_ACL_MAX_ENTRIES(mp) \ (xfs_sb_version_hascrc(&mp->m_sb) \ - ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) -#define XFS_ACL_MAX_SIZE(mp) \ +#define XFS_ACL_SIZE(cnt) \ (sizeof(struct xfs_acl) + \ - sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + sizeof(struct xfs_acl_entry) * cnt) + +#define XFS_ACL_MAX_SIZE(mp) \ + XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE "SGI_ACL_FILE" diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 89689c6a43e2..b2b73a998d42 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -490,6 +490,16 @@ typedef struct xfs_swapext #define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ /* + * ioctl limits + */ +#ifdef XATTR_LIST_MAX +# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX +#else +# define XFS_XATTR_LIST_MAX 65536 +#endif + + +/* * ioctl commands that are used by Linux filesystems */ #define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 54deb2d12ac6..66d702e6b9ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -38,6 +38,7 @@ #include "xfs_icreate_item.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_log.h" /* @@ -2500,9 +2501,14 @@ xfs_agi_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (!xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) return false; + } + /* * Validate the magic number of the agi block. */ @@ -2566,6 +2572,7 @@ xfs_agi_write_verify( } const struct xfs_buf_ops xfs_agi_buf_ops = { + .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index f39b285beb19..c679f3c05b63 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -221,7 +221,6 @@ xfs_inobt_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_perag *pag = bp->b_pag; unsigned int level; /* @@ -237,14 +236,7 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): @@ -254,24 +246,12 @@ xfs_inobt_verify( return 0; } - /* numrecs and level verification */ + /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - return true; + return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } static void @@ -304,6 +284,7 @@ xfs_inobt_write_verify( } const struct xfs_buf_ops xfs_inobt_buf_ops = { + .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 268c00f4f83a..1aabfda669b0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -62,11 +62,14 @@ xfs_inobp_check( * has not had the inode cores stamped into it. Hence for readahead, the buffer * may be potentially invalid. * - * If the readahead buffer is invalid, we don't want to mark it with an error, - * but we do want to clear the DONE status of the buffer so that a followup read - * will re-read it from disk. This will ensure that we don't get an unnecessary - * warnings during log recovery and we don't get unnecssary panics on debug - * kernels. + * If the readahead buffer is invalid, we need to mark it with an error and + * clear the DONE status of the buffer so that a followup read will re-read it + * from disk. We don't report the error otherwise to avoid warnings during log + * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * because all we want to do is say readahead failed; there is no-one to report + * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( @@ -93,6 +96,7 @@ xfs_inode_buf_verify( XFS_RANDOM_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; + xfs_buf_ioerror(bp, -EIO); return; } @@ -132,11 +136,13 @@ xfs_inode_buf_write_verify( } const struct xfs_buf_ops xfs_inode_buf_ops = { + .name = "xfs_inode", .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .name = "xxfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 1c55ccbb379d..8e385f91d660 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -60,6 +60,7 @@ typedef struct xlog_recover { */ #define XLOG_BC_TABLE_SIZE 64 +#define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a08379759..f51078f1e92a 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, char *str); + xfs_dqid_t id, uint type, uint flags, const char *str); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 47425140f343..8a53eaa349f4 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -35,6 +35,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_log.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -163,6 +164,15 @@ xfs_mount_validate_sb( "Filesystem can not be safely mounted by this kernel."); return -EINVAL; } + } else if (xfs_sb_version_hascrc(sbp)) { + /* + * We can't read verify the sb LSN because the read verifier is + * called before the log is allocated and processed. We know the + * log is set up before write verifier (!check_version) calls, + * so just check it here. + */ + if (!xfs_log_check_lsn(mp, sbp->sb_lsn)) + return -EFSCORRUPTED; } if (xfs_sb_version_has_pquotino(sbp)) { @@ -669,11 +679,13 @@ xfs_sb_write_verify( } const struct xfs_buf_ops xfs_sb_buf_ops = { + .name = "xfs_sb", .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .name = "xfs_sb_quiet", .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 5be529707903..15c3ceb845b9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 8f8af05b3f13..2e2c6716b623 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -31,6 +31,7 @@ #include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_log.h" /* @@ -60,6 +61,7 @@ xfs_symlink_hdr_set( if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; + memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); @@ -116,6 +118,8 @@ xfs_symlink_verify( return false; if (dsl->sl_owner == 0) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn))) + return false; return true; } @@ -164,6 +168,7 @@ xfs_symlink_write_verify( } const struct xfs_buf_ops xfs_symlink_buf_ops = { + .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, }; @@ -183,6 +188,7 @@ xfs_symlink_local_to_remote( if (!xfs_sb_version_hascrc(&mp->m_sb)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -198,4 +204,6 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + + ifp->if_bytes - 1); } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4b641676f258..2d5df1f23bbc 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -37,16 +37,19 @@ STATIC struct posix_acl * xfs_acl_from_disk( - struct xfs_acl *aclp, - int max_entries) + const struct xfs_acl *aclp, + int len, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; - struct xfs_acl_entry *ace; + const struct xfs_acl_entry *ace; unsigned int count, i; + if (len < sizeof(*aclp)) + return ERR_PTR(-EFSCORRUPTED); count = be32_to_cpu(aclp->acl_cnt); - if (count > max_entries) + if (count > max_entries || XFS_ACL_SIZE(count) != len) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type) */ if (error == -ENOATTR) goto out_update_cache; + acl = ERR_PTR(error); goto out; } - acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -248,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode) return error; } -static int -xfs_acl_exists(struct inode *inode, unsigned char *name) -{ - int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); - - return (xfs_attr_get(XFS_I(inode), name, NULL, &len, - ATTR_ROOT|ATTR_KERNOVAL) == 0); -} - -int -posix_acl_access_exists(struct inode *inode) -{ - return xfs_acl_exists(inode, SGI_ACL_FILE); -} - -int -posix_acl_default_exists(struct inode *inode) -{ - if (!S_ISDIR(inode->i_mode)) - return 0; - return xfs_acl_exists(inode, SGI_ACL_DEFAULT); -} - int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 3841b07f27bf..286fa89217f5 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -20,20 +20,18 @@ struct inode; struct posix_acl; -struct xfs_inode; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int posix_acl_access_exists(struct inode *inode); -extern int posix_acl_default_exists(struct inode *inode); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL -# define posix_acl_access_exists(inode) 0 -# define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ + +extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); + #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 50ab2879b9da..379c089fb051 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -172,6 +172,12 @@ xfs_setfilesize_ioend( current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); + /* we abort the update if there was an IO error */ + if (ioend->io_error) { + xfs_trans_cancel(tp); + return ioend->io_error; + } + return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } @@ -212,14 +218,17 @@ xfs_end_io( ioend->io_error = -EIO; goto done; } - if (ioend->io_error) - goto done; /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. + * Detecting and handling completion IO errors is done individually + * for each case as different cleanup operations need to be performed + * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + if (ioend->io_error) + goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { @@ -1250,13 +1259,28 @@ xfs_vm_releasepage( * the DIO. There is only going to be one reference to the ioend and its life * cycle is constrained by the DIO completion code. hence we don't need * reference counting here. + * + * Note that for DIO, an IO to the highest supported file block offset (i.e. + * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 + * bit variable. Hence if we see this overflow, we have to assume that the IO is + * extending the file size. We won't know for sure until IO completion is run + * and the actual max write offset is communicated to the IO completion + * routine. + * + * For DAX page faults, we are preparing to never see unwritten extents here, + * nor should we ever extend the inode size. Hence we will soon have nothing to + * do here for this case, ensuring we don't have to provide an IO completion + * callback to free an ioend that we don't actually need for a fault into the + * page at offset (2^63 - 1FSB) bytes. */ + static void xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool dax_fault) { struct xfs_ioend *ioend; xfs_off_t size = bh_result->b_size; @@ -1269,6 +1293,13 @@ xfs_map_direct( trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + if (dax_fault) { + ASSERT(type == XFS_IO_OVERWRITE); + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + return; + } + if (bh_result->b_private) { ioend = bh_result->b_private; ASSERT(ioend->io_size > 0); @@ -1283,7 +1314,8 @@ xfs_map_direct( ioend->io_size, ioend->io_type, imap); } else if (type == XFS_IO_UNWRITTEN || - offset + size > i_size_read(inode)) { + offset + size > i_size_read(inode) || + offset + size < 0) { ioend = xfs_alloc_ioend(inode, type); ioend->io_offset = offset; ioend->io_size = size; @@ -1345,7 +1377,8 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - bool direct) + bool direct, + bool dax_fault) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1393,18 +1426,20 @@ __xfs_get_blocks( if (error) goto out_unlock; + /* for DAX, we convert unwritten extents directly */ if (create && (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK))) { + imap.br_startblock == DELAYSTARTBLOCK) || + (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { if (direct || xfs_get_extsz_hint(ip)) { /* - * Drop the ilock in preparation for starting the block - * allocation transaction. It will be retaken - * exclusively inside xfs_iomap_write_direct for the - * actual allocation. + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. */ - xfs_iunlock(ip, lockmode); + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); if (error) @@ -1441,6 +1476,12 @@ __xfs_get_blocks( goto out_unlock; } + if (IS_DAX(inode) && create) { + ASSERT(!ISUNWRITTEN(&imap)); + /* zeroing is not needed at a higher layer */ + new = 0; + } + /* trim mapping down to size requested */ if (direct || size > (1 << inode->i_blkbits)) xfs_map_trim_size(inode, iblock, bh_result, @@ -1458,7 +1499,8 @@ __xfs_get_blocks( set_buffer_unwritten(bh_result); /* direct IO needs special help */ if (create && direct) - xfs_map_direct(inode, bh_result, &imap, offset); + xfs_map_direct(inode, bh_result, &imap, offset, + dax_fault); } /* @@ -1505,7 +1547,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, false); + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); } int @@ -1515,7 +1557,17 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, true); + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); +} + +int +xfs_get_blocks_dax_fault( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } static void @@ -1614,45 +1666,6 @@ xfs_end_io_direct_write( __xfs_end_io_direct_write(inode, ioend, offset, size); } -/* - * For DAX we need a mapping buffer callback for unwritten extent conversion - * when page faults allocate blocks and then zero them. Note that in this - * case the mapping indicated by the ioend may extend beyond EOF. We most - * definitely do not want to extend EOF here, so we trim back the ioend size to - * EOF. - */ -#ifdef CONFIG_FS_DAX -void -xfs_end_io_dax_write( - struct buffer_head *bh, - int uptodate) -{ - struct xfs_ioend *ioend = bh->b_private; - struct inode *inode = ioend->io_inode; - ssize_t size = ioend->io_size; - - ASSERT(IS_DAX(ioend->io_inode)); - - /* if there was an error zeroing, then don't convert it */ - if (!uptodate) - ioend->io_error = -EIO; - - /* - * Trim update to EOF, so we don't extend EOF during unwritten extent - * conversion of partial EOF blocks. - */ - spin_lock(&XFS_I(inode)->i_flags_lock); - if (ioend->io_offset + size > i_size_read(inode)) - size = i_size_read(inode) - ioend->io_offset; - spin_unlock(&XFS_I(inode)->i_flags_lock); - - __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); - -} -#else -void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } -#endif - static inline ssize_t xfs_vm_do_dio( struct inode *inode, @@ -1904,6 +1917,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { + trace_xfs_vm_readpage(page->mapping->host, 1); return mpage_readpage(page, xfs_get_blocks); } @@ -1914,6 +1928,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { + trace_xfs_vm_readpages(mapping->host, nr_pages); return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 86afd1ac7895..f6ffc9ae5ceb 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -58,7 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); int xfs_get_blocks_direct(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); -void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); +int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 65fb37a18e92..0ef7c2ed3f8a 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -511,7 +511,7 @@ xfs_attr_list_int( xfs_inode_t *dp = context->dp; uint lock_mode; - XFS_STATS_INC(xs_attr_list); + XFS_STATS_INC(dp->i_mount, xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3bf4ad0d19e4..45ec9e40150c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -57,37 +57,66 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) } /* + * Routine to zero an extent on disk allocated to the specific inode. + * + * The VFS functions take a linearised filesystem block offset, so we have to + * convert the sparse xfs fsb to the right format first. + * VFS types are real funky, too. + */ +int +xfs_zero_extent( + struct xfs_inode *ip, + xfs_fsblock_t start_fsb, + xfs_off_t count_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); + sector_t block = XFS_BB_TO_FSBT(mp, sector); + ssize_t size = XFS_FSB_TO_B(mp, count_fsb); + + if (IS_DAX(VFS_I(ip))) + return dax_clear_blocks(VFS_I(ip), block, size); + + /* + * let the block layer decide on the fastest method of + * implementing the zeroing. + */ + return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + +} + +/* * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done * last due to locking considerations. We never free any extents in * the first transaction. * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. + * If an inode *ip is provided, rejoin it to the transaction if + * the transaction was committed. */ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ struct xfs_bmap_free *flist, /* i/o: list extents to free */ - int *committed)/* xact committed or not */ + struct xfs_inode *ip) { struct xfs_efd_log_item *efd; /* extent free data */ struct xfs_efi_log_item *efi; /* extent free intention */ int error; /* error return value */ + int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; + if (flist->xbf_count == 0) return 0; - } + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = __xfs_trans_roll(tp, NULL, committed); + error = __xfs_trans_roll(tp, ip, &committed); if (error) { /* * If the transaction was committed, drop the EFD reference @@ -99,16 +128,13 @@ xfs_bmap_finish( * transaction so we should return committed=1 even though we're * returning an error. */ - if (*committed) { + if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, (error == -EFSCORRUPTED) ? SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); - } else { - *committed = 1; } - return error; } @@ -229,6 +255,13 @@ xfs_bmap_rtalloc( xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + + /* Zero the extent if we were asked to do so */ + if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { + error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); + if (error) + return error; + } } else { ap->length = 0; } @@ -933,7 +966,6 @@ xfs_alloc_file_space( xfs_bmbt_irec_t imaps[1], *imapp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; trace_xfs_alloc_file_space(ip); @@ -1027,24 +1059,21 @@ xfs_alloc_file_space( xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, - 0, imapp, &nimaps, &free_list); - if (error) { + resblks, imapp, &nimaps, &free_list); + if (error) goto error0; - } /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { + if (error) break; - } allocated_fsb = imapp->br_blockcount; @@ -1170,7 +1199,6 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int committed; int done; xfs_fileoff_t endoffset_fsb; int error; @@ -1310,17 +1338,15 @@ xfs_free_file_space( error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, &done); - if (error) { + if (error) goto error0; - } /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1398,7 +1424,6 @@ xfs_shift_file_space( int error; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - int committed; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; @@ -1490,7 +1515,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ecffb35935b..daed4bfb85b2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -201,7 +201,7 @@ _xfs_buf_alloc( atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); - XFS_STATS_INC(xb_create); + XFS_STATS_INC(target->bt_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); return bp; @@ -354,15 +354,16 @@ retry: */ if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", + "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + current->comm, current->pid, __func__, gfp_mask); - XFS_STATS_INC(xb_page_retries); + XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - XFS_STATS_INC(xb_page_found); + XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found); nbytes = min_t(size_t, size, PAGE_SIZE - offset); size -= nbytes; @@ -516,7 +517,7 @@ _xfs_buf_find( new_bp->b_pag = pag; spin_unlock(&pag->pag_buf_lock); } else { - XFS_STATS_INC(xb_miss_locked); + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); } @@ -529,11 +530,11 @@ found: if (!xfs_buf_trylock(bp)) { if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); - XFS_STATS_INC(xb_busy_locked); + XFS_STATS_INC(btp->bt_mount, xb_busy_locked); return NULL; } xfs_buf_lock(bp); - XFS_STATS_INC(xb_get_locked_waited); + XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); } /* @@ -549,7 +550,7 @@ found: } trace_xfs_buf_find(bp, flags, _RET_IP_); - XFS_STATS_INC(xb_get_locked); + XFS_STATS_INC(btp->bt_mount, xb_get_locked); return bp; } @@ -603,7 +604,14 @@ found: } } - XFS_STATS_INC(xb_get); + /* + * Clear b_error if this is a lookup from a caller that doesn't expect + * valid data to be found in the buffer. + */ + if (!(flags & XBF_READ)) + xfs_buf_ioerror(bp, 0); + + XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; } @@ -643,7 +651,7 @@ xfs_buf_read_map( trace_xfs_buf_read(bp, flags, _RET_IP_); if (!XFS_BUF_ISDONE(bp)) { - XFS_STATS_INC(xb_get_read); + XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { @@ -1044,7 +1052,7 @@ xfs_buf_ioend_work( xfs_buf_ioend(bp); } -void +static void xfs_buf_ioend_async( struct xfs_buf *bp) { @@ -1631,13 +1639,9 @@ xfs_setsize_buftarg( btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { - char name[BDEVNAME_SIZE]; - - bdevname(btp->bt_bdev, name); - xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %s", - sectorsize, name); + "Cannot set_blocksize to %u on device %pg", + sectorsize, btp->bt_bdev); return -EINVAL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c79b717d9b88..c75721acd867 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -132,6 +132,7 @@ struct xfs_buf_map { struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { + char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); }; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index a989a9c7edb7..642d55d10075 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -666,7 +666,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_getdents); + XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; args.geo = dp->i_mount->m_dir_geo; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 30cb3afb67f0..9c44d38dcd1f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -75,9 +75,9 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_lru)); mutex_destroy(&dqp->q_qlock); - kmem_zone_free(xfs_qm_dqzone, dqp); - XFS_STATS_DEC(xs_qm_dquot); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); + kmem_zone_free(xfs_qm_dqzone, dqp); } /* @@ -306,7 +306,7 @@ xfs_qm_dqalloc( xfs_fsblock_t firstblock; xfs_bmap_free_t flist; xfs_bmbt_irec_t map; - int nmaps, error, committed; + int nmaps, error; xfs_buf_t *bp; xfs_trans_t *tp = *tpp; @@ -379,11 +379,12 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + error = xfs_bmap_finish(tpp, &flist, NULL); + if (error) goto error1; - } - if (committed) { + /* Transaction was committed? */ + if (*tpp != tp) { tp = *tpp; xfs_trans_bjoin(tp, bp); } else { @@ -393,9 +394,9 @@ xfs_qm_dqalloc( *O_bpp = bp; return 0; - error1: +error1: xfs_bmap_cancel(&flist); - error0: +error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; @@ -605,7 +606,7 @@ xfs_qm_dqread( break; } - XFS_STATS_INC(xs_qm_dquot); + XFS_STATS_INC(mp, xs_qm_dquot); trace_xfs_dqread(dqp); @@ -747,12 +748,12 @@ restart: mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); - XFS_STATS_INC(xs_qm_dqcachehits); + XFS_STATS_INC(mp, xs_qm_dqcachehits); *O_dqpp = dqp; return 0; } mutex_unlock(&qi->qi_tree_lock); - XFS_STATS_INC(xs_qm_dqcachemisses); + XFS_STATS_INC(mp, xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -806,7 +807,7 @@ restart: mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(xs_qm_dquot_dups); + XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } @@ -846,7 +847,7 @@ xfs_qm_dqput( trace_xfs_dqput_free(dqp); if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) - XFS_STATS_INC(xs_qm_dquot_unused); + XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 74d0e5966ebc..88693a98fac5 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -164,9 +164,9 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_bn); + __return_address, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e78feb400e22..ebe9b8290a70 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -242,19 +242,30 @@ xfs_file_fsync( } /* - * All metadata updates are logged, which means that we just have - * to flush the log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to + * flush the log up to the latest LSN that touched the inode. If we have + * concurrent fsync/fdatasync() calls, we need them to all block on the + * log force before we clear the ili_fsync_fields field. This ensures + * that we don't get a racing sync operation that does not wait for the + * metadata to hit the journal before returning. If we race with + * clearing the ili_fsync_fields, then all that will happen is the log + * force will do nothing as the lsn will already be on disk. We can't + * race with setting ili_fsync_fields because that is done under + * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared + * until after the ili_fsync_fields is cleared. */ xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || - (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (lsn) + if (lsn) { error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + ip->i_itemp->ili_fsync_fields = 0; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); /* * If we only have a single device, and the log force about was @@ -287,7 +298,7 @@ xfs_file_read_iter( xfs_fsize_t n; loff_t pos = iocb->ki_pos; - XFS_STATS_INC(xs_read_calls); + XFS_STATS_INC(mp, xs_read_calls); if (unlikely(iocb->ki_flags & IOCB_DIRECT)) ioflags |= XFS_IO_ISDIRECT; @@ -365,7 +376,7 @@ xfs_file_read_iter( ret = generic_file_read_iter(iocb, to); if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + XFS_STATS_ADD(mp, xs_read_bytes, ret); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -383,7 +394,7 @@ xfs_file_splice_read( int ioflags = 0; ssize_t ret; - XFS_STATS_INC(xs_read_calls); + XFS_STATS_INC(ip->i_mount, xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; @@ -391,19 +402,26 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - /* for dax, we need to avoid the page cache */ - if (IS_DAX(VFS_I(ip))) - ret = default_file_splice_read(infilp, ppos, pipe, count, flags); - else - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + /* + * DAX inodes cannot ues the page cache for splice, so we have to push + * them through the VFS IO path. This means it goes through + * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we + * cannot lock the splice operation at this level for DAX inodes. + */ + if (IS_DAX(VFS_I(ip))) { + ret = default_file_splice_read(infilp, ppos, pipe, count, + flags); + goto out; + } + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out: + if (ret > 0) + XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); return ret; } @@ -482,6 +500,8 @@ xfs_zero_eof( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); + trace_xfs_zero_eof(ip, isize, offset - isize); + /* * First handle zeroing the block on which isize resides. * @@ -574,6 +594,7 @@ xfs_file_aio_write_checks( struct xfs_inode *ip = XFS_I(inode); ssize_t error = 0; size_t count = iov_iter_count(from); + bool drained_dio = false; restart: error = generic_write_checks(iocb, from); @@ -611,12 +632,13 @@ restart: bool zero = false; spin_unlock(&ip->i_flags_lock); - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - + if (!drained_dio) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } /* * We now have an IO submission barrier in place, but * AIO can do EOF updates during IO completion and hence @@ -626,6 +648,7 @@ restart: * no-op. */ inode_dio_wait(inode); + drained_dio = true; goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); @@ -867,7 +890,7 @@ xfs_file_write_iter( ssize_t ret; size_t ocount = iov_iter_count(from); - XFS_STATS_INC(xs_write_calls); + XFS_STATS_INC(ip->i_mount, xs_write_calls); if (ocount == 0) return 0; @@ -883,7 +906,7 @@ xfs_file_write_iter( if (ret > 0) { ssize_t err; - XFS_STATS_ADD(xs_write_bytes, ret); + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ err = generic_write_sync(file, iocb->ki_pos - ret, ret); @@ -1477,7 +1500,7 @@ xfs_file_llseek( * * mmap_sem (MM) * sb_start_pagefault(vfs, freeze) - * i_mmap_lock (XFS - truncate serialisation) + * i_mmaplock (XFS - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1503,10 +1526,9 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, - xfs_end_io_dax_write); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); } else { - ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); } @@ -1538,7 +1560,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1546,6 +1568,13 @@ xfs_filemap_fault( return ret; } +/* + * Similar to xfs_filemap_fault(), the DAX fault path can call into here on + * both read and write faults. Hence we need to handle both cases. There is no + * ->pmd_mkwrite callout for huge pages, so we have a single function here to + * handle both cases here. @flags carries the information on the type of fault + * occuring. + */ STATIC int xfs_filemap_pmd_fault( struct vm_area_struct *vma, @@ -1562,22 +1591,62 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, - xfs_end_io_dax_write); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, + NULL); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(inode->i_sb); return ret; } +/* + * pfn_mkwrite was originally inteneded to ensure we capture time stamp + * updates on write faults. In reality, it's need to serialise against + * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() + * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault + * barrier in place. + */ +static int +xfs_filemap_pfn_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + + struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); + int ret = VM_FAULT_NOPAGE; + loff_t size; + + trace_xfs_filemap_pfn_mkwrite(ip); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + + /* check if the faulting page hasn't raced with truncate */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + return ret; + +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .pmd_fault = xfs_filemap_pmd_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, + .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; STATIC int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a326bd64d4e..d7a490f24ead 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -63,7 +63,7 @@ xfs_inode_alloc( return NULL; } - XFS_STATS_INC(vn_active); + XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); @@ -129,7 +129,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(vn_active); + XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } @@ -159,7 +159,7 @@ xfs_iget_cache_hit( spin_lock(&ip->i_flags_lock); if (ip->i_ino != ino) { trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); + XFS_STATS_INC(mp, xs_ig_frecycle); error = -EAGAIN; goto out_error; } @@ -177,7 +177,7 @@ xfs_iget_cache_hit( */ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); + XFS_STATS_INC(mp, xs_ig_frecycle); error = -EAGAIN; goto out_error; } @@ -259,7 +259,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); + XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -342,7 +342,7 @@ xfs_iget_cache_miss( error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); + XFS_STATS_INC(mp, xs_ig_dup); error = -EAGAIN; goto out_preload_end; } @@ -412,7 +412,7 @@ xfs_iget( if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; - XFS_STATS_INC(xs_ig_attempts); + XFS_STATS_INC(mp, xs_ig_attempts); /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); @@ -429,7 +429,7 @@ again: goto out_error_or_again; } else { rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); + XFS_STATS_INC(mp, xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, flags, lock_flags); @@ -965,7 +965,7 @@ reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - XFS_STATS_INC(xs_ig_reclaims); + XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dc40a6d5ae0d..ae3758a90ed6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1143,7 +1143,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1226,7 +1225,7 @@ xfs_create( * pointing to itself. */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, &committed); + prid, resblks > 0, &ip, NULL); if (error) goto out_trans_cancel; @@ -1275,7 +1274,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -1427,7 +1426,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; int resblks; trace_xfs_link(tdp, target_name); @@ -1502,11 +1500,10 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } - error = xfs_bmap_finish (&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_bmap_cancel(&free_list); goto error_return; @@ -1555,7 +1552,6 @@ xfs_itruncate_extents( xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; xfs_filblks_t unmap_len; - int committed; int error = 0; int done = 0; @@ -1601,9 +1597,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (committed) - xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto out_bmap_cancel; @@ -1774,7 +1768,6 @@ xfs_inactive_ifree( { xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; @@ -1841,7 +1834,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); @@ -2365,6 +2358,7 @@ retry: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2522,7 +2516,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; uint resblks; trace_xfs_remove(dp, name); @@ -2623,7 +2616,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -2700,7 +2693,6 @@ xfs_finish_rename( struct xfs_trans *tp, struct xfs_bmap_free *free_list) { - int committed = 0; int error; /* @@ -2710,7 +2702,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, &committed); + error = xfs_bmap_finish(&tp, free_list, NULL); if (error) { xfs_bmap_cancel(free_list); xfs_trans_cancel(tp); @@ -3271,8 +3263,8 @@ xfs_iflush_cluster( } if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); } out_free: @@ -3345,7 +3337,7 @@ xfs_iflush( struct xfs_dinode *dip; int error; - XFS_STATS_INC(xs_iflush_count); + XFS_STATS_INC(mp, xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3560,6 +3552,7 @@ xfs_iflush_int( */ iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 62bd80f4edd9..d14b12b8cfef 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -719,6 +719,7 @@ xfs_iflush_abort( * attempted. */ iip->ili_fields = 0; + iip->ili_fsync_fields = 0; } /* * Release the inode's flush lock since we're done with it. diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 488d81254e28..4c7722e325b3 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ + unsigned int ili_fsync_fields; /* logged since last fsync */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ea7d85af5310..d42738deec6d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_pnfs.h" +#include "xfs_acl.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -411,7 +412,7 @@ xfs_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XATTR_LIST_MAX) + al_hreq.buflen > XFS_XATTR_LIST_MAX) return -EINVAL; /* @@ -455,7 +456,7 @@ xfs_attrmulti_attr_get( unsigned char *kbuf; int error = -EFAULT; - if (*len > XATTR_SIZE_MAX) + if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; kbuf = kmem_zalloc_large(*len, KM_SLEEP); if (!kbuf) @@ -482,17 +483,22 @@ xfs_attrmulti_attr_set( __uint32_t flags) { unsigned char *kbuf; + int error; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XATTR_SIZE_MAX) + if (len > XFS_XATTR_SIZE_MAX) return -EINVAL; kbuf = memdup_user(ubuf, len); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + if (!error) + xfs_forget_acl(inode, name, flags); + kfree(kbuf); + return error; } int @@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove( unsigned char *name, __uint32_t flags) { + int error; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - return xfs_attr_remove(XFS_I(inode), name, flags); + error = xfs_attr_remove(XFS_I(inode), name, flags); + if (!error) + xfs_forget_acl(inode, name, flags); + return error; } STATIC int @@ -1028,7 +1039,7 @@ xfs_ioctl_setattr_xflags( xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); return 0; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index b88bdc85dd3d..1a05d8ae327d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle( sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XATTR_LIST_MAX) + al_hreq.buflen > XFS_XATTR_LIST_MAX) return -EINVAL; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..d81bdc080370 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -129,22 +129,31 @@ xfs_iomap_write_direct( xfs_trans_t *tp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; - - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; + int lockmode; + int bmapi_flags = XFS_BMAPI_PREALLOC; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); + lockmode = XFS_ILOCK_SHARED; /* locked by caller */ + + ASSERT(xfs_isilocked(ip, lockmode)); offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); if ((offset + count) > XFS_ISIZE(ip)) { + /* + * Assert that the in-core extent list is present since this can + * call xfs_iread_extents() and we only have the ilock shared. + * This should be safe because the lock was held around a bmapi + * call in the caller and we only need it to access the in-core + * list. + */ + ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & + XFS_IFEXTENTS); error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - return error; + goto out_unlock; } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -174,9 +183,40 @@ xfs_iomap_write_direct( } /* + * Drop the shared lock acquired by the caller, attach the dquot if + * necessary and move on to transaction setup. + */ + xfs_iunlock(ip, lockmode); + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* * Allocate and setup the transaction */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + + /* + * For DAX, we do not allocate unwritten extents, but instead we zero + * the block before we commit the transaction. Ideally we'd like to do + * this outside the transaction context, but if we commit and then crash + * we may not have zeroed the blocks and this will be exposed on + * recovery of the allocation. Hence we must zero before commit. + * + * Further, if we are mapping unwritten extents here, we need to zero + * and convert them to written so that we don't need an unwritten extent + * callback for DAX. This also means that we need to be able to dip into + * the reserve block pool for bmbt block allocation if there is no space + * left but we need to do unwritten extent conversion. + */ + + if (IS_DAX(VFS_I(ip))) { + bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; + if (ISUNWRITTEN(imap)) { + tp->t_flags |= XFS_TRANS_RESERVE; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + } + } error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, resrtextents); /* @@ -187,7 +227,8 @@ xfs_iomap_write_direct( return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) @@ -202,17 +243,18 @@ xfs_iomap_write_direct( xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_PREALLOC, &firstfsb, 0, - imap, &nimaps, &free_list); + bmapi_flags, &firstfsb, resblks, imap, + &nimaps, &free_list); if (error) goto out_bmap_cancel; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -229,7 +271,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; out_bmap_cancel: @@ -655,7 +697,7 @@ xfs_iomap_write_allocate( xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; xfs_trans_t *tp; - int nimaps, committed; + int nimaps; int error = 0; int nres; @@ -670,7 +712,7 @@ xfs_iomap_write_allocate( count_fsb = imap->br_blockcount; map_start_fsb = imap->br_startoff; - XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); while (count_fsb != 0) { /* @@ -750,13 +792,13 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, - &first_block, 1, - imap, &nimaps, &free_list); + count_fsb, 0, &first_block, + nres, imap, &nimaps, + &free_list); if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto trans_cancel; @@ -777,7 +819,7 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - XFS_STATS_INC(xs_xstrat_quick); + XFS_STATS_INC(mp, xs_xstrat_quick); return 0; } @@ -814,7 +856,6 @@ xfs_iomap_write_unwritten( xfs_bmap_free_t free_list; xfs_fsize_t i_size; uint resblks; - int committed; int error; trace_xfs_unwritten_convert(ip, offset, count); @@ -866,8 +907,8 @@ xfs_iomap_write_unwritten( xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_CONVERT, &firstfsb, - 1, &imap, &nimaps, &free_list); + XFS_BMAPI_CONVERT, &firstfsb, resblks, + &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; @@ -886,7 +927,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8294132e6a3c..06eafafe636e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -414,13 +414,17 @@ xfs_vn_rename( * uio is kmalloced for this reason... */ STATIC const char * -xfs_vn_follow_link( +xfs_vn_get_link( struct dentry *dentry, - void **cookie) + struct inode *inode, + struct delayed_call *done) { char *link; int error = -ENOMEM; + if (!dentry) + return ERR_PTR(-ECHILD); + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) goto out_err; @@ -429,7 +433,8 @@ xfs_vn_follow_link( if (unlikely(error)) goto out_kfree; - return *cookie = link; + set_delayed_call(done, kfree_link, link); + return link; out_kfree: kfree(link); @@ -695,7 +700,7 @@ xfs_setattr_nonsize( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); @@ -922,7 +927,7 @@ xfs_setattr_size( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); @@ -1172,8 +1177,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = xfs_vn_follow_link, - .put_link = kfree_put_link, + .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 85f883dd6207..ec0e239a0fa9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -171,6 +171,13 @@ struct xfs_kobj { struct completion complete; }; +struct xstats { + struct xfsstats __percpu *xs_stats; + struct xfs_kobj xs_kobj; +}; + +extern struct xstats xfsstats; + /* Kernel uid/gid conversion. These are used to convert to/from the on disk * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. * The conversion here is type only, the value will remain the same since we diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index aaadee0969c9..9c9a1c9bcc7f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -268,7 +268,7 @@ xlog_grant_head_wait( __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&head->lock); - XFS_STATS_INC(xs_sleep_logspace); + XFS_STATS_INC(log->l_mp, xs_sleep_logspace); trace_xfs_log_grant_sleep(log, tic); schedule(); @@ -379,7 +379,7 @@ xfs_log_regrant( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); /* * This is a new transaction on the ticket, so we need to change the @@ -448,7 +448,7 @@ xfs_log_reserve( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, @@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp) int aborted = 0; /* - * Race to shutdown the filesystem if we see an error. + * Race to shutdown the filesystem if we see an error or the iclog is in + * IOABORT state. The IOABORT state is only set in DEBUG mode to inject + * CRC errors into log recovery. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, - XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, + XFS_RANDOM_IODONE_IOERR) || + iclog->ic_state & XLOG_STATE_IOABORT) { + if (iclog->ic_state & XLOG_STATE_IOABORT) + iclog->ic_state &= ~XLOG_STATE_IOABORT; + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1768,7 +1774,7 @@ xlog_sync( int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); int size; - XFS_STATS_INC(xs_log_writes); + XFS_STATS_INC(log->l_mp, xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ @@ -1805,7 +1811,7 @@ xlog_sync( bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); - XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { @@ -1838,6 +1844,23 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); +#ifdef DEBUG + /* + * Intentionally corrupt the log record CRC based on the error injection + * frequency, if defined. This facilitates testing log recovery in the + * event of torn writes. Hence, set the IOABORT state to abort the log + * write on I/O completion and shutdown the fs. The subsequent mount + * detects the bad CRC and attempts to recover. + */ + if (log->l_badcrc_factor && + (prandom_u32() % log->l_badcrc_factor == 0)) { + iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_state |= XLOG_STATE_IOABORT; + xfs_warn(log->l_mp, + "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", + be64_to_cpu(iclog->ic_header.h_lsn)); + } +#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2045,12 +2068,14 @@ xlog_print_tic_res( "QM_DQCLUSTER", "QM_QINOCREATE", "QM_QUOTAOFF_END", - "SB_UNIT", "FSYNC_TS", "GROWFSRT_ALLOC", "GROWFSRT_ZERO", "GROWFSRT_FREE", - "SWAPEXT" + "SWAPEXT", + "CHECKPOINT", + "ICREATE", + "CREATE_TMPFILE" }; xfs_warn(mp, "xlog_write: reservation summary:"); @@ -2422,11 +2447,20 @@ xlog_write( &partial_copy_len); xlog_verify_dest_ptr(log, ptr); - /* copy region */ + /* + * Copy region. + * + * Unmount records just log an opheader, so can have + * empty payloads with no data region to copy. Hence we + * only copy the payload if the vector says it has data + * to copy. + */ ASSERT(copy_len >= 0); - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); - + if (copy_len > 0) { + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, + copy_len); + } copy_len += start_rec_copy + sizeof(xlog_op_header_t); record_cnt++; data_cnt += contwr ? copy_len : 0; @@ -2782,11 +2816,19 @@ xlog_state_do_callback( } } while (!ioerrors && loopdidcallbacks); +#ifdef DEBUG /* - * make one last gasp attempt to see if iclogs are being left in - * limbo.. + * Make one last gasp attempt to see if iclogs are being left in limbo. + * If the above loop finds an iclog earlier than the current iclog and + * in one of the syncing states, the current iclog is put into + * DO_CALLBACK and the callbacks are deferred to the completion of the + * earlier iclog. Walk the iclogs in order and make sure that no iclog + * is in DO_CALLBACK unless an earlier iclog is in one of the syncing + * states. + * + * Note that SYNCING|IOABORT is a valid state so we cannot just check + * for ic_state == SYNCING. */ -#ifdef DEBUG if (funcdidcallbacks) { first_iclog = iclog = log->l_iclog; do { @@ -2801,7 +2843,7 @@ xlog_state_do_callback( * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state & XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_DONE_SYNC || iclog->ic_state == XLOG_STATE_IOERROR ) break; @@ -2913,7 +2955,7 @@ restart: iclog = log->l_iclog; if (iclog->ic_state != XLOG_STATE_ACTIVE) { - XFS_STATS_INC(xs_log_noiclogs); + XFS_STATS_INC(log->l_mp, xs_log_noiclogs); /* Wait for log writes to have flushed */ xlog_wait(&log->l_flush_wait, &log->l_icloglock); @@ -3165,11 +3207,19 @@ xlog_state_switch_iclogs( } if (log->l_curr_block >= log->l_logBBsize) { + /* + * Rewind the current block before the cycle is bumped to make + * sure that the combined LSN never transiently moves forward + * when the log wraps to the next cycle. This is to support the + * unlocked sample of these fields from xlog_valid_lsn(). Most + * other cases should acquire l_icloglock. + */ + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + smp_wmb(); log->l_curr_cycle++; if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) log->l_curr_cycle++; - log->l_curr_block -= log->l_logBBsize; - ASSERT(log->l_curr_block >= 0); } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; @@ -3212,7 +3262,7 @@ _xfs_log_force( struct xlog_in_core *iclog; xfs_lsn_t lsn; - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); xlog_cil_force(log); @@ -3297,7 +3347,7 @@ maybe_sleep: spin_unlock(&log->l_icloglock); return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're @@ -3362,7 +3412,7 @@ _xfs_log_force_lsn( ASSERT(lsn != 0); - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) @@ -3411,7 +3461,7 @@ try_again: (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); @@ -3441,7 +3491,7 @@ try_again: spin_unlock(&log->l_icloglock); return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're @@ -4023,3 +4073,45 @@ xlog_iclogs_empty( return 1; } +/* + * Verify that an LSN stamped into a piece of metadata is valid. This is + * intended for use in read verifiers on v5 superblocks. + */ +bool +xfs_log_check_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn) +{ + struct xlog *log = mp->m_log; + bool valid; + + /* + * norecovery mode skips mount-time log processing and unconditionally + * resets the in-core LSN. We can't validate in this mode, but + * modifications are not allowed anyways so just return true. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return true; + + /* + * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is + * handled by recovery and thus safe to ignore here. + */ + if (lsn == NULLCOMMITLSN) + return true; + + valid = xlog_valid_lsn(mp->m_log, lsn); + + /* warn the user about what's gone wrong before verifier failure */ + if (!valid) { + spin_lock(&log->l_icloglock); + xfs_warn(mp, +"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). " +"Please unmount and run xfs_repair (>= v4.3) to resolve.", + CYCLE_LSN(lsn), BLOCK_LSN(lsn), + log->l_curr_cycle, log->l_curr_block); + spin_unlock(&log->l_icloglock); + } + + return valid; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 09d91d3166cd..aa533a7d50f2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); +bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 950f3f94720c..ed8896310c00 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ @@ -410,6 +411,8 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; #endif }; @@ -560,4 +563,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) remove_wait_queue(wq, &wait); } +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. + */ + cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = ACCESS_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 512a0945d52a..da37beb76f6e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -61,6 +61,9 @@ xlog_recover_check_summary( #else #define xlog_recover_check_summary(log) #endif +STATIC int +xlog_do_recovery_pass( + struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* * This structure is used during recovery to record the buf log items which @@ -868,6 +871,351 @@ validate_head: } /* + * Seek backwards in the log for log record headers. + * + * Given a starting log block, walk backwards until we find the provided number + * of records or hit the provided tail block. The return value is the number of + * records encountered or a negative error code. The log block and buffer + * pointer of the last record seen are returned in rblk and rhead respectively. + */ +STATIC int +xlog_rseek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk backwards from the head block until we hit the tail or the first + * block in the log. + */ + end_blk = head_blk > tail_blk ? tail_blk : 0; + for (i = (int) head_blk - 1; i >= end_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the tail block or the log record header count, + * start looking again from the end of the physical log. Note that + * callers can pass head == tail if the tail is not yet known. + */ + if (tail_blk >= head_blk && found != count) { + for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Seek forward in the log for log record headers. + * + * Given head and tail blocks, walk forward from the tail block until we find + * the provided number of records or hit the head block. The return value is the + * number of records encountered or a negative error code. The log block and + * buffer pointer of the last record seen are returned in rblk and rhead + * respectively. + */ +STATIC int +xlog_seek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk forward from the tail block until we hit the head or the last + * block in the log. + */ + end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; + for (i = (int) tail_blk; i <= end_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the head block or the log record header count, + * start looking again from the start of the physical log. + */ + if (tail_blk > head_blk && found != count) { + for (i = 0; i < (int) head_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Check the log tail for torn writes. This is required when torn writes are + * detected at the head and the head had to be walked back to a previous record. + * The tail of the previous record must now be verified to ensure the torn + * writes didn't corrupt the previous tail. + * + * Return an error if CRC verification fails as recovery cannot proceed. + */ +STATIC int +xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; + int count; + int error = 0; + bool wrapped; + xfs_daddr_t tmp_head; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* + * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get + * a temporary head block that points after the last possible + * concurrently written record of the tail. + */ + count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, + XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, + &wrapped); + if (count < 0) { + error = count; + goto out; + } + + /* + * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran + * into the actual log head. tmp_head points to the start of the record + * so update it to the actual head block. + */ + if (count < XLOG_MAX_ICLOGS + 1) + tmp_head = head_blk; + + /* + * We now have a tail and temporary head block that covers at least + * XLOG_MAX_ICLOGS records from the tail. We need to verify that these + * records were completely written. Run a CRC verification pass from + * tail to head and return the result. + */ + error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Detect and trim torn writes from the head of the log. + * + * Storage without sector atomicity guarantees can result in torn writes in the + * log in the event of a crash. Our only means to detect this scenario is via + * CRC verification. While we can't always be certain that CRC verification + * failure is due to a torn write vs. an unrelated corruption, we do know that + * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at + * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of + * the log and treat failures in this range as torn writes as a matter of + * policy. In the event of CRC failure, the head is walked back to the last good + * record in the log and the tail is updated from that record and verified. + */ +STATIC int +xlog_verify_head( + struct xlog *log, + xfs_daddr_t *head_blk, /* in/out: unverified head */ + xfs_daddr_t *tail_blk, /* out: tail block */ + struct xfs_buf *bp, + xfs_daddr_t *rhead_blk, /* start blk of last record */ + struct xlog_rec_header **rhead, /* ptr to last record */ + bool *wrapped) /* last rec. wraps phys. log */ +{ + struct xlog_rec_header *tmp_rhead; + struct xfs_buf *tmp_bp; + xfs_daddr_t first_bad; + xfs_daddr_t tmp_rhead_blk; + int found; + int error; + bool tmp_wrapped; + + /* + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. + */ + found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, + rhead, wrapped); + if (found < 0) + return found; + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + + /* + * Now that we have a tail block, check the head of the log for torn + * writes. Search again until we hit the tail or the maximum number of + * log record I/Os that could have been in flight at one time. Use a + * temporary buffer so we don't trash the rhead/bp pointer from the + * call above. + */ + tmp_bp = xlog_get_bp(log, 1); + if (!tmp_bp) + return -ENOMEM; + error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, + XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, + &tmp_rhead, &tmp_wrapped); + xlog_put_bp(tmp_bp); + if (error < 0) + return error; + + /* + * Now run a CRC verification pass over the records starting at the + * block found above to the current head. If a CRC failure occurs, the + * log block of the first bad record is saved in first_bad. + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + if (error == -EFSBADCRC) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. + */ + error = 0; + xfs_warn(log->l_mp, +"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.", + first_bad, *head_blk); + + /* + * Get the header block and buffer pointer for the last good + * record before the bad record. + * + * Note that xlog_find_tail() clears the blocks at the new head + * (i.e., the records with invalid CRC) if the cycle number + * matches the the current cycle. + */ + found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, + rhead_blk, rhead, wrapped); + if (found < 0) + return found; + if (found == 0) /* XXX: right thing to do here? */ + return -EIO; + + /* + * Reset the head block to the starting block of the first bad + * log record and set the tail block based on the last good + * record. + * + * Bail out if the updated head/tail match as this indicates + * possible corruption outside of the acceptable + * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... + */ + *head_blk = first_bad; + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + if (*head_blk == *tail_blk) { + ASSERT(0); + return 0; + } + + /* + * Now verify the tail based on the updated head. This is + * required because the torn writes trimmed from the head could + * have been written over the tail of a previous record. Return + * any errors since recovery cannot proceed if the tail is + * corrupt. + * + * XXX: This leaves a gap in truly robust protection from torn + * writes in the log. If the head is behind the tail, the tail + * pushes forward to create some space and then a crash occurs + * causing the writes into the previous record's tail region to + * tear, log recovery isn't able to recover. + * + * How likely is this to occur? If possible, can we do something + * more intelligent here? Is it safe to push the tail forward if + * we can determine that the tail is within the range of the + * torn write (e.g., the kernel can only overwrite the tail if + * it has actually been pushed forward)? Alternatively, could we + * somehow prevent this condition at runtime? + */ + error = xlog_verify_tail(log, *head_blk, *tail_blk); + } + + return error; +} + +/* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its @@ -893,13 +1241,13 @@ xlog_find_tail( xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; - int error, i, found; + int error; xfs_daddr_t umount_data_blk; xfs_daddr_t after_umount_blk; + xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; int hblks; - - found = 0; + bool wrapped = false; /* * Find previous log record @@ -923,48 +1271,16 @@ xlog_find_tail( } /* - * Search backwards looking for log record header block + * Trim the head block back to skip over torn records. We can have + * multiple log I/Os in flight at any time, so we assume CRC failures + * back through the previous several records are torn writes and skip + * them. */ ASSERT(*head_blk < INT_MAX); - for (i = (int)(*head_blk) - 1; i >= 0; i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 1; - break; - } - } - /* - * If we haven't found the log record header block, start looking - * again from the end of the physical log. XXXmiken: There should be - * a check here to make sure we didn't search more than N blocks in - * the previous code. - */ - if (!found) { - for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == - cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 2; - break; - } - } - } - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - xlog_put_bp(bp); - ASSERT(0); - return -EIO; - } - - /* find blk_no of tail of log */ - rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, + &rhead, &wrapped); + if (error) + goto done; /* * Reset log values according to the state of the log when we @@ -976,10 +1292,10 @@ xlog_find_tail( * written was complete and ended exactly on the end boundary * of the physical log. */ - log->l_prev_block = i; + log->l_prev_block = rhead_blk; log->l_curr_block = (int)*head_blk; log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (found == 2) + if (wrapped) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); @@ -1014,12 +1330,13 @@ xlog_find_tail( } else { hblks = 1; } - after_umount_blk = (i + hblks + (int) - BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = (i + hblks) % log->l_logBBsize; + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) goto done; @@ -3204,6 +3521,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; + int len; if (mp->m_qflags == 0) @@ -3224,8 +3542,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, + &xfs_dquot_buf_ra_ops); } STATIC void @@ -3431,7 +3753,7 @@ xlog_recover_add_to_cont_trans( * previous record. Copy the rest of the header. */ if (list_empty(&trans->r_itemq)) { - ASSERT(len < sizeof(struct xfs_trans_header)); + ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); return -EIO; @@ -4118,26 +4440,69 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } +STATIC int +xlog_unpack_data( + struct xlog_rec_header *rhead, + char *dp, + struct xlog *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + return 0; +} + /* - * Upack the log buffer data and crc check it. If the check fails, issue a - * warning if and only if the CRC in the header is non-zero. This makes the - * check an advisory warning, and the zero CRC check will prevent failure - * warnings from being emitted when upgrading the kernel from one that does not - * add CRCs by default. - * - * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log - * corruption failure + * CRC check, unpack and process a log record. */ STATIC int -xlog_unpack_data_crc( +xlog_recover_process( + struct xlog *log, + struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - struct xlog *log) + int pass) { + int error; __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); - if (crc != rhead->h_crc) { + + /* + * Nothing else to do if this is a CRC verification pass. Just return + * if this a record with a non-zero crc. Unfortunately, mkfs always + * sets h_crc to 0 so we must consider this valid even on v5 supers. + * Otherwise, return EFSBADCRC on failure so the callers up the stack + * know precisely what failed. + */ + if (pass == XLOG_RECOVER_CRCPASS) { + if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc)) + return -EFSBADCRC; + return 0; + } + + /* + * We're in the normal recovery path. Issue a warning if and only if the + * CRC in the header is non-zero. This is an advisory warning and the + * zero CRC check prevents warnings from being emitted when upgrading + * the kernel from one that does not add CRCs by default. + */ + if (crc != le32_to_cpu(rhead->h_crc)) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", @@ -4147,47 +4512,18 @@ xlog_unpack_data_crc( } /* - * If we've detected a log record corruption, then we can't - * recover past this point. Abort recovery if we are enforcing - * CRC protection by punting an error back up the stack. + * If the filesystem is CRC enabled, this mismatch becomes a + * fatal log corruption failure. */ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) return -EFSCORRUPTED; } - return 0; -} - -STATIC int -xlog_unpack_data( - struct xlog_rec_header *rhead, - char *dp, - struct xlog *log) -{ - int i, j, k; - int error; - - error = xlog_unpack_data_crc(rhead, dp, log); + error = xlog_unpack_data(rhead, dp, log); if (error) return error; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } - - return 0; + return xlog_recover_process_data(log, rhash, rhead, dp, pass); } STATIC int @@ -4239,18 +4575,21 @@ xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, - int pass) + int pass, + xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; + xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; - int error = 0, h_size; + int error = 0, h_size, h_len; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); + rhead_blk = 0; /* * Read the header of the tail block and get the iclog buffer size from @@ -4274,7 +4613,31 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; + + /* + * xfsprogs has a bug where record length is based on lsunit but + * h_size (iclog size) is hardcoded to 32k. Now that we + * unconditionally CRC verify the unmount record, this means the + * log buffer can be too small for the record and cause an + * overrun. + * + * Detect this condition here. Use lsunit for the buffer size as + * long as this looks like the mkfs case. Otherwise, return an + * error to avoid a buffer overrun. + */ h_size = be32_to_cpu(rhead->h_size); + h_len = be32_to_cpu(rhead->h_len); + if (h_len > h_size) { + if (h_len <= log->l_mp->m_logbsize && + be32_to_cpu(rhead->h_num_logops) == 1) { + xfs_warn(log->l_mp, + "invalid iclog size (%d bytes), using lsunit (%d bytes)", + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; + } else + return -EFSCORRUPTED; + } + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; @@ -4301,7 +4664,7 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = tail_blk; + blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -4408,19 +4771,18 @@ xlog_do_recovery_pass( goto bread_err2; } - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, + pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks; + rhead_blk = blk_no; } ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + rhead_blk = blk_no; } /* read first part of physical log */ @@ -4441,21 +4803,22 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks + hblks; + rhead_blk = blk_no; } bread_err2: xlog_put_bp(dbp); bread_err1: xlog_put_bp(hbp); + + if (error && first_bad) + *first_bad = rhead_blk; + return error; } @@ -4493,7 +4856,7 @@ xlog_do_log_recovery( INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS1); + XLOG_RECOVER_PASS1, NULL); if (error != 0) { kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; @@ -4504,7 +4867,7 @@ xlog_do_log_recovery( * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS2); + XLOG_RECOVER_PASS2, NULL); #ifdef DEBUG if (!error) { int i; @@ -4609,9 +4972,19 @@ xlog_recover( int error; /* find the tail of the log */ - if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) + error = xlog_find_tail(log, &head_blk, &tail_blk); + if (error) return error; + /* + * The superblock was read before the log was available and thus the LSN + * could not be verified. Check the superblock LSN against the current + * LSN now that it's known. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb) && + !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) + return -EINVAL; + if (tail_blk != head_blk) { /* There used to be a comment here: * diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index d8b67547ab34..11792d888e4e 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,6 +17,7 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_error.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ + int level; \ \ va_start(args, fmt); \ \ @@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \ \ __xfs_printk(kern_level, mp, &vaf); \ va_end(args); \ + \ + if (!kstrtoint(kern_level, 0, &level) && \ + level <= LOGLEVEL_ERR && \ + xfs_error_level >= XFS_ERRLEVEL_HIGH) \ + xfs_stack_trace(); \ } \ define_xfs_printk_level(xfs_emerg, KERN_EMERG); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bf92e0c037c7..bb753b359bee 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; +void +xfs_uuid_table_free(void) +{ + if (xfs_uuid_table_size == 0) + return; + kmem_free(xfs_uuid_table); + xfs_uuid_table = NULL; + xfs_uuid_table_size = 0; +} + /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. @@ -693,10 +703,15 @@ xfs_mountfs( if (error) goto out; - error = xfs_uuid_mount(mp); + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); if (error) goto out_remove_sysfs; + error = xfs_uuid_mount(mp); + if (error) + goto out_del_stats; + /* * Set the minimum read and write sizes */ @@ -971,6 +986,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_del_stats: + xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); out: @@ -1047,6 +1064,7 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); + xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1056,6 +1074,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7999e91cd49a..b57098481c10 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -127,6 +127,7 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_data_workqueue; @@ -312,6 +313,7 @@ typedef struct xfs_perag { int pagb_count; /* pagb slots in use */ } xfs_perag_t; +extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); @@ -336,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); +int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, + xfs_off_t count_fsb); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ab4a6066f7ca..dc6221942b85 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -181,6 +181,11 @@ xfs_fs_map_blocks( ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + /* + * xfs_iomap_write_direct() expects to take ownership of + * the shared ilock. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_iomap_write_direct(ip, offset, length, &imap, nimaps); if (error) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index eac9549efd52..532ab79d38fe 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -184,7 +184,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(mp, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; @@ -448,11 +448,11 @@ xfs_qm_dquot_isolate( */ if (dqp->q_nrefs) { xfs_dqunlock(dqp); - XFS_STATS_INC(xs_qm_dqwants); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); list_lru_isolate(lru, &dqp->q_lru); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); return LRU_REMOVED; } @@ -496,19 +496,19 @@ xfs_qm_dquot_isolate( ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; out_miss_busy: trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); return LRU_SKIP; out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); spin_lock(lru_lock); return LRU_RETRY; @@ -525,7 +525,7 @@ xfs_qm_shrink_scan( unsigned long freed; int error; - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ab1bac6a3a1c..be02a68b2fe2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -766,7 +766,6 @@ xfs_growfs_rt_alloc( { xfs_fileoff_t bno; /* block number in file */ struct xfs_buf *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ @@ -811,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f2240383d4bb..8686df6c7609 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -18,20 +18,21 @@ #include "xfs.h" #include <linux/proc_fs.h> -DEFINE_PER_CPU(struct xfsstats, xfsstats); +struct xstats xfsstats; -static int counter_val(int idx) +static int counter_val(struct xfsstats __percpu *stats, int idx) { int val = 0, cpu; for_each_possible_cpu(cpu) - val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx)); return val; } -static int xfs_stat_proc_show(struct seq_file *m, void *v) +int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { int i, j; + int len = 0; __uint64_t xs_xstrat_bytes = 0; __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; @@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) }; /* Loop over all stats groups */ + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - seq_printf(m, "%s", xstats[i].desc); + len += snprintf(buf + len, PATH_MAX - len, "%s", + xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - seq_printf(m, " %u", counter_val(j)); - seq_putc(m, '\n'); + len += snprintf(buf + len, PATH_MAX - len, " %u", + counter_val(stats, j)); + len += snprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; - xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; - xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; } - seq_printf(m, "xpc %Lu %Lu %Lu\n", + len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - seq_printf(m, "debug %u\n", + len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else 0); #endif - return 0; + + return len; } -static int xfs_stat_proc_open(struct inode *inode, struct file *file) +void xfs_stats_clearall(struct xfsstats __percpu *stats) { - return single_open(file, xfs_stat_proc_show, NULL); + int c; + __uint32_t vn_active; + + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu_ptr(stats, c)->vn_active; + memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); + per_cpu_ptr(stats, c)->vn_active = vn_active; + preempt_enable(); + } } -static const struct file_operations xfs_stat_proc_fops = { - .owner = THIS_MODULE, - .open = xfs_stat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ seq_printf(m, "%d\t%d\t%d\t%u\n", - 0, - counter_val(XFSSTAT_END_XQMSTAT), - 0, - counter_val(XFSSTAT_END_XQMSTAT + 1)); + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT), + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1)); return 0; } @@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) seq_printf(m, "qm"); for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) - seq_printf(m, " %u", counter_val(j)); + seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; } @@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = { }; #endif /* CONFIG_XFS_QUOTA */ +#ifdef CONFIG_PROC_FS int xfs_init_procfs(void) { if (!proc_mkdir("fs/xfs", NULL)) + return -ENOMEM; + + if (!proc_symlink("fs/xfs/stat", NULL, + "/sys/fs/xfs/stats/stats")) goto out; - if (!proc_create("fs/xfs/stat", 0, NULL, - &xfs_stat_proc_fops)) - goto out_remove_xfs_dir; #ifdef CONFIG_XFS_QUOTA if (!proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops)) - goto out_remove_stat_file; + goto out; if (!proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops)) - goto out_remove_xqmstat_file; + goto out; #endif return 0; -#ifdef CONFIG_XFS_QUOTA - out_remove_xqmstat_file: - remove_proc_entry("fs/xfs/xqmstat", NULL); - out_remove_stat_file: - remove_proc_entry("fs/xfs/stat", NULL); -#endif - out_remove_xfs_dir: - remove_proc_entry("fs/xfs", NULL); - out: +out: + remove_proc_subtree("fs/xfs", NULL); return -ENOMEM; } void xfs_cleanup_procfs(void) { -#ifdef CONFIG_XFS_QUOTA - remove_proc_entry("fs/xfs/xqm", NULL); - remove_proc_entry("fs/xfs/xqmstat", NULL); -#endif - remove_proc_entry("fs/xfs/stat", NULL); - remove_proc_entry("fs/xfs", NULL); + remove_proc_subtree("fs/xfs", NULL); } +#endif /* CONFIG_PROC_FS */ diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c8f238b8299a..483b0eff1988 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -19,8 +19,6 @@ #define __XFS_STATS_H__ -#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) - #include <linux/percpu.h> /* @@ -215,15 +213,29 @@ struct xfsstats { __uint64_t xs_read_bytes; }; -DECLARE_PER_CPU(struct xfsstats, xfsstats); +int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); +void xfs_stats_clearall(struct xfsstats __percpu *stats); +extern struct xstats xfsstats; -/* - * We don't disable preempt, not too worried about poking the - * wrong CPU's stat for now (also aggregated before reporting). - */ -#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) -#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) -#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) +#define XFS_STATS_INC(mp, v) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ +} while (0) + +#define XFS_STATS_DEC(mp, v) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ +} while (0) + +#define XFS_STATS_ADD(mp, v, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ +} while (0) + +#if defined(CONFIG_PROC_FS) extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); @@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void); #else /* !CONFIG_PROC_FS */ -# define XFS_STATS_INC(count) -# define XFS_STATS_DEC(count) -# define XFS_STATS_ADD(count, inc) - static inline int xfs_init_procfs(void) { return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 904f637cfa5f..59c9b7bd958d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -137,7 +137,7 @@ static const match_table_t tokens = { }; -STATIC unsigned long +STATIC int suffix_kstrtoint(char *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; @@ -838,17 +838,18 @@ xfs_init_mount_workqueues( goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, + mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; @@ -922,7 +923,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(vn_reclaim); + XFS_STATS_INC(ip->i_mount, vn_reclaim); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -983,8 +984,8 @@ xfs_fs_evict_inode( truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - XFS_STATS_INC(vn_rele); - XFS_STATS_INC(vn_remove); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); xfs_inactive(ip); } @@ -1474,9 +1475,16 @@ xfs_fs_fill_super( if (error) goto out_destroy_workqueues; + /* Allocate stats memory before we do operations that might use it */ + mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); + if (!mp->m_stats.xs_stats) { + error = -ENOMEM; + goto out_destroy_counters; + } + error = xfs_readsb(mp, flags); if (error) - goto out_destroy_counters; + goto out_free_stats; error = xfs_finish_flags(mp); if (error) @@ -1545,9 +1553,11 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); + out_free_stats: + free_percpu(mp->m_stats.xs_stats); out_destroy_counters: xfs_destroy_percpu_counters(mp); -out_destroy_workqueues: + out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); @@ -1574,6 +1584,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + free_percpu(mp->m_stats.xs_stats); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); @@ -1703,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; @@ -1838,19 +1849,32 @@ init_xfs_fs(void) xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; - goto out_sysctl_unregister;; + goto out_sysctl_unregister; + } + + xfsstats.xs_kobj.kobject.kset = xfs_kset; + + xfsstats.xs_stats = alloc_percpu(struct xfsstats); + if (!xfsstats.xs_stats) { + error = -ENOMEM; + goto out_kset_unregister; } + error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL, + "stats"); + if (error) + goto out_free_stats; + #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) - goto out_kset_unregister; + goto out_remove_stats_kobj; #endif error = xfs_qm_init(); if (error) - goto out_remove_kobj; + goto out_remove_dbg_kobj; error = register_filesystem(&xfs_fs_type); if (error) @@ -1859,11 +1883,15 @@ init_xfs_fs(void) out_qm_exit: xfs_qm_exit(); - out_remove_kobj: + out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); - out_kset_unregister: + out_remove_stats_kobj: #endif + xfs_sysfs_del(&xfsstats.xs_kobj); + out_free_stats: + free_percpu(xfsstats.xs_stats); + out_kset_unregister: kset_unregister(xfs_kset); out_sysctl_unregister: xfs_sysctl_unregister(); @@ -1889,6 +1917,8 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif + xfs_sysfs_del(&xfsstats.xs_kobj); + free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); @@ -1896,6 +1926,7 @@ exit_xfs_fs(void) xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_zones(); + xfs_uuid_table_free(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 996481eeb491..b44284c1adda 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; int nmaps; @@ -387,7 +386,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt( struct xfs_inode *ip) { xfs_buf_t *bp; - int committed; int done; int error; xfs_fsblock_t first_block; @@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto error_bmap_cancel; /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* * The first xact was committed, so add the inode to the new one. * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index a0c8067cea6f..aed74d3f8da9 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -19,6 +19,7 @@ #include <linux/sysctl.h> #include <linux/proc_fs.h> #include "xfs_error.h" +#include "xfs_stats.h" static struct ctl_table_header *xfs_table_header; @@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler( size_t *lenp, loff_t *ppos) { - int c, ret, *valp = ctl->data; - __uint32_t vn_active; + int ret, *valp = ctl->data; ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { - xfs_notice(NULL, "Clearing xfsstats"); - for_each_possible_cpu(c) { - preempt_disable(); - /* save vn_active, it's a universal truth! */ - vn_active = per_cpu(xfsstats, c).vn_active; - memset(&per_cpu(xfsstats, c), 0, - sizeof(struct xfsstats)); - per_cpu(xfsstats, c).vn_active = vn_active; - preempt_enable(); - } + xfs_stats_clearall(xfsstats.xs_stats); xfs_stats_clear = 0; } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index aa03670851d8..641d625eb334 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -21,11 +21,13 @@ #include "xfs_log_format.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_stats.h" struct xfs_sysfs_attr { struct attribute attr; - ssize_t (*show)(char *buf, void *data); - ssize_t (*store)(const char *buf, size_t count, void *data); + ssize_t (*show)(struct kobject *kobject, char *buf); + ssize_t (*store)(struct kobject *kobject, const char *buf, + size_t count); }; static inline struct xfs_sysfs_attr * @@ -38,6 +40,8 @@ to_attr(struct attribute *attr) static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) #define XFS_SYSFS_ATTR_RO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) +#define XFS_SYSFS_ATTR_WO(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr @@ -51,14 +55,42 @@ struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, }; +STATIC ssize_t +xfs_sysfs_object_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0; +} + +STATIC ssize_t +xfs_sysfs_object_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0; +} + +static const struct sysfs_ops xfs_sysfs_ops = { + .show = xfs_sysfs_object_show, + .store = xfs_sysfs_object_store, +}; + #ifdef DEBUG /* debug */ STATIC ssize_t log_recovery_delay_store( + struct kobject *kobject, const char *buf, - size_t count, - void *data) + size_t count) { int ret; int val; @@ -77,8 +109,8 @@ log_recovery_delay_store( STATIC ssize_t log_recovery_delay_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); } @@ -89,52 +121,87 @@ static struct attribute *xfs_dbg_attrs[] = { NULL, }; +struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_dbg_attrs, +}; + +#endif /* DEBUG */ + +/* stats */ + +static inline struct xstats * +to_xstats(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xstats, xs_kobj); +} + STATIC ssize_t -xfs_dbg_show( - struct kobject *kobject, - struct attribute *attr, - char *buf) +stats_show( + struct kobject *kobject, + char *buf) { - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + struct xstats *stats = to_xstats(kobject); - return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; + return xfs_stats_format(stats->xs_stats, buf); } +XFS_SYSFS_ATTR_RO(stats); STATIC ssize_t -xfs_dbg_store( - struct kobject *kobject, - struct attribute *attr, - const char *buf, - size_t count) +stats_clear_store( + struct kobject *kobject, + const char *buf, + size_t count) { - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + int ret; + int val; + struct xstats *stats = to_xstats(kobject); + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; - return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; + if (val != 1) + return -EINVAL; + + xfs_stats_clearall(stats->xs_stats); + return count; } +XFS_SYSFS_ATTR_WO(stats_clear); -static struct sysfs_ops xfs_dbg_ops = { - .show = xfs_dbg_show, - .store = xfs_dbg_store, +static struct attribute *xfs_stats_attrs[] = { + ATTR_LIST(stats), + ATTR_LIST(stats_clear), + NULL, }; -struct kobj_type xfs_dbg_ktype = { +struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, - .sysfs_ops = &xfs_dbg_ops, - .default_attrs = xfs_dbg_attrs, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_stats_attrs, }; -#endif /* DEBUG */ - /* xlog */ +static inline struct xlog * +to_xlog(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xlog, l_kobj); +} + STATIC ssize_t log_head_lsn_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int block; + struct xlog *log = to_xlog(kobject); spin_lock(&log->l_icloglock); cycle = log->l_curr_cycle; @@ -147,12 +214,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn); STATIC ssize_t log_tail_lsn_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int block; + struct xlog *log = to_xlog(kobject); xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); @@ -161,12 +228,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn); STATIC ssize_t reserve_grant_head_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) + { - struct xlog *log = data; int cycle; int bytes; + struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); @@ -175,65 +243,64 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head); STATIC ssize_t write_grant_head_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int bytes; + struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); } XFS_SYSFS_ATTR_RO(write_grant_head); -static struct attribute *xfs_log_attrs[] = { - ATTR_LIST(log_head_lsn), - ATTR_LIST(log_tail_lsn), - ATTR_LIST(reserve_grant_head), - ATTR_LIST(write_grant_head), - NULL, -}; - -static inline struct xlog * -to_xlog(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - return container_of(kobj, struct xlog, l_kobj); -} - +#ifdef DEBUG STATIC ssize_t -xfs_log_show( - struct kobject *kobject, - struct attribute *attr, - char *buf) +log_badcrc_factor_store( + struct kobject *kobject, + const char *buf, + size_t count) { - struct xlog *log = to_xlog(kobject); - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + struct xlog *log = to_xlog(kobject); + int ret; + uint32_t val; - return xfs_attr->show ? xfs_attr->show(buf, log) : 0; + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + log->l_badcrc_factor = val; + + return count; } STATIC ssize_t -xfs_log_store( - struct kobject *kobject, - struct attribute *attr, - const char *buf, - size_t count) +log_badcrc_factor_show( + struct kobject *kobject, + char *buf) { - struct xlog *log = to_xlog(kobject); - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + struct xlog *log = to_xlog(kobject); - return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0; + return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); } -static struct sysfs_ops xfs_log_ops = { - .show = xfs_log_show, - .store = xfs_log_store, +XFS_SYSFS_ATTR_RW(log_badcrc_factor); +#endif /* DEBUG */ + +static struct attribute *xfs_log_attrs[] = { + ATTR_LIST(log_head_lsn), + ATTR_LIST(log_tail_lsn), + ATTR_LIST(reserve_grant_head), + ATTR_LIST(write_grant_head), +#ifdef DEBUG + ATTR_LIST(log_badcrc_factor), +#endif + NULL, }; struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, - .sysfs_ops = &xfs_log_ops, + .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 240eee35f342..be692e59938d 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -22,6 +22,7 @@ extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern struct kobj_type xfs_dbg_ktype; /* debug */ extern struct kobj_type xfs_log_ktype; /* xlog */ +extern struct kobj_type xfs_stats_ktype; /* stats */ static inline struct xfs_kobj * to_kobj(struct kobject *kobject) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5ed36b1e04c1..391d797cb53f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); +DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), @@ -1221,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); +DECLARE_EVENT_CLASS(xfs_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(xfs_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(xfs_vm_readpage); +DEFINE_READPAGE_EVENT(xfs_vm_readpages); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int type, struct xfs_bmbt_irec *irec), @@ -1312,6 +1339,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); +DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a0ab1dae9c31..748b16aff45a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -930,9 +930,9 @@ __xfs_trans_commit( */ if (sync) { error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); - XFS_STATS_INC(xs_trans_sync); + XFS_STATS_INC(mp, xs_trans_sync); } else { - XFS_STATS_INC(xs_trans_async); + XFS_STATS_INC(mp, xs_trans_async); } return error; @@ -955,7 +955,7 @@ out_unreserve: xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); + XFS_STATS_INC(mp, xs_trans_empty); return error; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1098cf490189..aa67339b9537 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -349,7 +349,7 @@ xfsaild_push( xfs_ail_min_lsn(ailp))) { ailp->xa_log_flush = 0; - XFS_STATS_INC(xs_push_ail_flush); + XFS_STATS_INC(mp, xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); } @@ -371,7 +371,7 @@ xfsaild_push( goto out_done; } - XFS_STATS_INC(xs_push_ail); + XFS_STATS_INC(mp, xs_push_ail); lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { @@ -385,7 +385,7 @@ xfsaild_push( lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: - XFS_STATS_INC(xs_push_ail_success); + XFS_STATS_INC(mp, xs_push_ail_success); trace_xfs_ail_push(lip); ailp->xa_last_pushed_lsn = lsn; @@ -403,7 +403,7 @@ xfsaild_push( * re-try the flushing relatively soon if most of the * AIL is beeing flushed. */ - XFS_STATS_INC(xs_push_ail_flushing); + XFS_STATS_INC(mp, xs_push_ail_flushing); trace_xfs_ail_flushing(lip); flushing++; @@ -411,14 +411,14 @@ xfsaild_push( break; case XFS_ITEM_PINNED: - XFS_STATS_INC(xs_push_ail_pinned); + XFS_STATS_INC(mp, xs_push_ail_pinned); trace_xfs_ail_pinned(lip); stuck++; ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: - XFS_STATS_INC(xs_push_ail_locked); + XFS_STATS_INC(mp, xs_push_ail_locked); trace_xfs_ail_locked(lip); stuck++; @@ -497,6 +497,7 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; + set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ce78534a047e..995170194df0 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -572,12 +572,16 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - /* no warnings for project quotas - we just return ENOSPC later */ + enum quota_type qtype; + if (dqp->dq_flags & XFS_DQ_PROJ) - return; - quota_send_warning(make_kqid(&init_user_ns, - (dqp->dq_flags & XFS_DQ_USER) ? - USRQUOTA : GRPQUOTA, + qtype = PRJQUOTA; + else if (dqp->dq_flags & XFS_DQ_USER) + qtype = USRQUOTA; + else + qtype = GRPQUOTA; + + quota_send_warning(make_kqid(&init_user_ns, qtype, be32_to_cpu(dqp->q_core.d_id)), mp->m_super->s_dev, type); } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 17280cd71934..b97f1df910ab 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -108,6 +108,15 @@ xfs_trans_log_inode( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* + * Record the specific change for fdatasync optimisation. This + * allows fdatasync to skip log forces for inodes that are only + * timestamp dirty. We do this before the change count so that + * the core being logged in this case does not impact on fdatasync + * behaviour. + */ + ip->i_itemp->ili_fsync_fields |= flags; + + /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. We don't use * inode_inc_version() because there is no need for extra locking around diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c036815183cb..110f1d7d86b0 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -32,15 +32,13 @@ static int -xfs_xattr_get(struct dentry *dentry, const char *name, - void *value, size_t size, int xflags) +xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, void *value, size_t size) { + int xflags = handler->flags; struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; - if (strcmp(name, "") == 0) - return -EINVAL; - /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { xflags |= ATTR_KERNOVAL; @@ -53,14 +51,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name, return asize; } -static int -xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int xflags) +void +xfs_forget_acl( + struct inode *inode, + const char *name, + int xflags) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + /* + * Invalidate any cached ACLs if the user has bypassed the ACL + * interface. We don't validate the content whatsoever so it is caller + * responsibility to provide data in valid format and ensure i_mode is + * consistent. + */ + if (xflags & ATTR_ROOT) { +#ifdef CONFIG_XFS_POSIX_ACL + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +#endif + } +} - if (strcmp(name, "") == 0) - return -EINVAL; +static int +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, + const char *name, const void *value, size_t size, int flags) +{ + int xflags = handler->flags; + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) @@ -70,8 +89,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, if (!value) return xfs_attr_remove(ip, (unsigned char *)name, xflags); - return xfs_attr_set(ip, (unsigned char *)name, + error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); + if (!error) + xfs_forget_acl(d_inode(dentry), name, xflags); + + return error; } static const struct xattr_handler xfs_xattr_user_handler = { @@ -106,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static unsigned int xfs_xattr_prefix_len(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return sizeof("security"); - else if (flags & XFS_ATTR_ROOT) - return sizeof("trusted"); - else - return sizeof("user"); -} - -static const char *xfs_xattr_prefix(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return xfs_xattr_security_handler.prefix; - else if (flags & XFS_ATTR_ROOT) - return xfs_xattr_trusted_handler.prefix; - else - return xfs_xattr_user_handler.prefix; -} - static int -xfs_xattr_put_listent( +__xfs_xattr_put_listent( struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen, - unsigned char *value) + char *prefix, + int prefix_len, + unsigned char *name, + int namelen) { - unsigned int prefix_len = xfs_xattr_prefix_len(flags); char *offset; int arraytop; - ASSERT(context->count >= 0); - - /* - * Only show root namespace entries if we are actually allowed to - * see them. - */ - if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) - return 0; + if (!context->alist) + goto compute_size; arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { @@ -154,17 +149,19 @@ xfs_xattr_put_listent( return 1; } offset = (char *)context->alist + context->count; - strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; + +compute_size: context->count += prefix_len + namelen + 1; return 0; } static int -xfs_xattr_put_listent_sizes( +xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, unsigned char *name, @@ -172,24 +169,55 @@ xfs_xattr_put_listent_sizes( int valuelen, unsigned char *value) { - context->count += xfs_xattr_prefix_len(flags) + namelen + 1; - return 0; -} + char *prefix; + int prefix_len; -static int -list_one_attr(const char *name, const size_t len, void *data, - size_t size, ssize_t *result) -{ - char *p = data + *result; + ASSERT(context->count >= 0); - *result += len; - if (!size) - return 0; - if (*result > size) - return -ERANGE; + if (flags & XFS_ATTR_ROOT) { +#ifdef CONFIG_XFS_POSIX_ACL + if (namelen == SGI_ACL_FILE_SIZE && + strncmp(name, SGI_ACL_FILE, + SGI_ACL_FILE_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_ACCESS, + strlen(XATTR_POSIX_ACL_ACCESS)); + if (ret) + return ret; + } else if (namelen == SGI_ACL_DEFAULT_SIZE && + strncmp(name, SGI_ACL_DEFAULT, + SGI_ACL_DEFAULT_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_DEFAULT, + strlen(XATTR_POSIX_ACL_DEFAULT)); + if (ret) + return ret; + } +#endif - strcpy(p, name); - return 0; + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + prefix = XATTR_TRUSTED_PREFIX; + prefix_len = XATTR_TRUSTED_PREFIX_LEN; + } else if (flags & XFS_ATTR_SECURE) { + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + } else { + prefix = XATTR_USER_PREFIX; + prefix_len = XATTR_USER_PREFIX_LEN; + } + + return __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); } ssize_t @@ -198,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); - int error; /* * First read the regular on-disk attributes. @@ -207,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.dp = XFS_I(inode); context.cursor = &cursor; context.resynch = 1; - context.alist = data; + context.alist = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; - - if (size) - context.put_listent = xfs_xattr_put_listent; - else - context.put_listent = xfs_xattr_put_listent_sizes; + context.put_listent = xfs_xattr_put_listent; xfs_attr_list_int(&context); if (context.count < 0) return -ERANGE; - /* - * Then add the two synthetic ACL attributes. - */ - if (posix_acl_access_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS) + 1, - data, size, &context.count); - if (error) - return error; - } - - if (posix_acl_default_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT) + 1, - data, size, &context.count); - if (error) - return error; - } - return context.count; } |