diff options
Diffstat (limited to 'fs')
669 files changed, 21761 insertions, 16275 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index ed4f8519b627..a9ef46f02354 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -100,7 +100,7 @@ static int build_path_from_dentry(struct v9fs_session_info *v9ses, for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) n++; - wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + wnames = kmalloc_array(n, sizeof(char *), GFP_KERNEL); if (!wnames) goto err_out; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e622f0f10502..0429c8ee58f1 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -210,12 +210,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; - } - v9ses->debug = option; + } else { + v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG - p9_debug_level = option; + p9_debug_level = option; #endif + } break; case Opt_dfltuid: @@ -231,7 +231,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; - continue; } break; case Opt_dfltgid: @@ -247,7 +246,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; - continue; } break; case Opt_afid: @@ -256,9 +254,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; + } else { + v9ses->afid = option; } - v9ses->afid = option; break; case Opt_uname: kfree(v9ses->uname); @@ -306,13 +304,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } - ret = get_cache_mode(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_cache_mode(s); + if (r < 0) + ret = r; + else + v9ses->cache = r; - v9ses->cache = ret; kfree(s); break; @@ -341,14 +338,12 @@ static int v9fs_parse_options(struct 
v9fs_session_info *v9ses, char *opts) pr_info("Unknown access argument %s\n", s); kfree(s); - goto free_and_return; + continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Uknown uid %s\n", s); - kfree(s); - goto free_and_return; } } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 9ee534159cc6..42e102e2e74a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -823,28 +823,21 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(dfid)) return ERR_CAST(dfid); - name = dentry->d_name.name; - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - if (fid == ERR_PTR(-ENOENT)) { - d_add(dentry, NULL); - return NULL; - } - return ERR_CAST(fid); - } /* * Make sure we don't use a wrong inode due to parallel * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + name = dentry->d_name.name; + fid = p9_client_walk(dfid, 1, &name, 1); + if (fid == ERR_PTR(-ENOENT)) + inode = NULL; + else if (IS_ERR(fid)) + inode = ERR_CAST(fid); + else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - p9_client_clunk(fid); - return ERR_CAST(inode); - } /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with @@ -853,12 +846,14 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * k/b. 
*/ res = d_splice_alias(inode, dentry); - if (!res) - v9fs_fid_add(dentry, fid); - else if (!IS_ERR(res)) - v9fs_fid_add(res, fid); - else - p9_client_clunk(fid); + if (!IS_ERR(fid)) { + if (!res) + v9fs_fid_add(dentry, fid); + else if (!IS_ERR(res)) + v9fs_fid_add(res, fid); + else + p9_client_clunk(fid); + } return res; } diff --git a/fs/Kconfig b/fs/Kconfig index bc821a86d965..ac474a61be37 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -38,6 +38,7 @@ config FS_DAX bool "Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help @@ -108,7 +109,7 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" -source "fs/autofs4/Kconfig" +source "fs/autofs/Kconfig" source "fs/fuse/Kconfig" source "fs/overlayfs/Kconfig" @@ -196,13 +197,16 @@ config HUGETLBFS help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read - <file:Documentation/vm/hugetlbpage.txt> for details. + <file:Documentation/admin-guide/mm/hugetlbpage.rst> for details. If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 57a27c42b5ac..56df483de619 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -168,7 +168,7 @@ config BINFMT_MISC will automatically feed it to the correct interpreter. You can do other nice things, too. Read the file - <file:Documentation/binfmt_misc.txt> to learn how to use this + <file:Documentation/admin-guide/binfmt-misc.rst> to learn how to use this feature, <file:Documentation/admin-guide/java.rst> for information about how to include Java support. and <file:Documentation/admin-guide/mono.rst> for information about how to include Mono-based .NET support. 
diff --git a/fs/Makefile b/fs/Makefile index c9375fd2c8c4..293733f61594 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,7 +102,7 @@ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ obj-$(CONFIG_QNX6FS_FS) += qnx6/ -obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ obj-$(CONFIG_OVERLAY_FS) += overlayfs/ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 29444c83da48..e18eff854e1a 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -146,20 +146,6 @@ adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct obje obj->parent_id = inode->i_ino; - /* - * '.' is handled by reserved_lookup() in fs/namei.c - */ - if (name->len == 2 && name->name[0] == '.' && name->name[1] == '.') { - /* - * Currently unable to fill in the rest of 'obj', - * but this is better than nothing. We need to - * ascend one level to find it's parent. - */ - obj->name_len = 0; - obj->file_id = obj->parent_id; - goto free_out; - } - read_lock(&adfs_dir_lock); ret = ops->setpos(&dir, 0); @@ -266,17 +252,17 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); if (error == 0) { - error = -EACCES; /* * This only returns NULL if get_empty_inode * fails. 
*/ inode = adfs_iget(dir->i_sb, &obj); - if (inode) - error = 0; + if (!inode) + inode = ERR_PTR(-EACCES); + } else if (error != -ENOENT) { + inode = ERR_PTR(error); } - d_add(dentry, inode); - return ERR_PTR(error); + return d_splice_alias(inode, dentry); } /* diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 8dbd36f5e581..c836c425ca94 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -199,7 +199,7 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) return; cur_time: - *tv = current_time(inode); + *tv = timespec64_to_timespec(current_time(inode)); return; too_early: @@ -242,6 +242,7 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs) struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { + struct timespec ts; struct inode *inode; inode = new_inode(sb); @@ -270,7 +271,9 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); inode->i_mode = adfs_atts2mode(sb, inode); - adfs_adfs2unix_time(&inode->i_mtime, inode); + ts = timespec64_to_timespec(inode->i_mtime); + adfs_adfs2unix_time(&ts, inode); + inode->i_mtime = timespec_to_timespec64(ts); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index cfda2c7caedc..71fa525d63a0 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -313,7 +313,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); - dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); + dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); if (dm == NULL) { adfs_error(sb, "not enough memory"); return ERR_PTR(-ENOMEM); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d8aa0ae3d037..41c5749f4db7 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -201,14 +201,16 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct super_block *sb = dir->i_sb; struct 
buffer_head *bh; struct inode *inode = NULL; + struct dentry *res; pr_debug("%s(\"%pd\")\n", __func__, dentry); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); - affs_unlock_dir(dir); - if (IS_ERR(bh)) + if (IS_ERR(bh)) { + affs_unlock_dir(dir); return ERR_CAST(bh); + } if (bh) { u32 ino = bh->b_blocknr; @@ -222,11 +224,12 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } affs_brelse(bh); inode = affs_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); } - d_add(dentry, inode); - return NULL; + res = d_splice_alias(inode, dentry); + if (!IS_ERR_OR_NULL(res)) + res->d_fsdata = dentry->d_fsdata; + affs_unlock_dir(dir); + return res; } int diff --git a/fs/affs/super.c b/fs/affs/super.c index e602619aed9d..d1ad11a8a4a5 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -241,6 +241,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, affs_set_opt(*mount_opts, SF_NO_TRUNCATE); break; case Opt_prefix: + kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 532acae25453..546874057bd3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -5,7 +5,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o -kafs-objs := \ +kafs-y := \ $(afs-cache-y) \ addr_list.o \ callback.o \ @@ -21,7 +21,6 @@ kafs-objs := \ main.o \ misc.o \ mntpt.o \ - proc.o \ rotate.o \ rxrpc.o \ security.o \ @@ -34,4 +33,5 @@ kafs-objs := \ write.o \ xattr.o +kafs-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 7587fb665ff1..025a9a5e1c32 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -43,8 +43,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, _enter("%u,%u,%u", nr, service, port); - alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr, - GFP_KERNEL); + alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL); if (!alist) return NULL; @@ -216,7 +215,7 @@ 
struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) _enter("%s", cell->name); ret = dns_query("afsdb", cell->name, cell->name_len, - "ipv4", &vllist, _expiry); + "", &vllist, _expiry); if (ret < 0) return ERR_PTR(ret); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 571437dcb252..5f261fbf2182 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -21,6 +21,66 @@ #include "internal.h" /* + * Create volume and callback interests on a server. + */ +static struct afs_cb_interest *afs_create_interest(struct afs_server *server, + struct afs_vnode *vnode) +{ + struct afs_vol_interest *new_vi, *vi; + struct afs_cb_interest *new; + struct hlist_node **pp; + + new_vi = kzalloc(sizeof(struct afs_vol_interest), GFP_KERNEL); + if (!new_vi) + return NULL; + + new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL); + if (!new) { + kfree(new_vi); + return NULL; + } + + new_vi->usage = 1; + new_vi->vid = vnode->volume->vid; + INIT_HLIST_NODE(&new_vi->srv_link); + INIT_HLIST_HEAD(&new_vi->cb_interests); + + refcount_set(&new->usage, 1); + new->sb = vnode->vfs_inode.i_sb; + new->vid = vnode->volume->vid; + new->server = afs_get_server(server); + INIT_HLIST_NODE(&new->cb_vlink); + + write_lock(&server->cb_break_lock); + + for (pp = &server->cb_volumes.first; *pp; pp = &(*pp)->next) { + vi = hlist_entry(*pp, struct afs_vol_interest, srv_link); + if (vi->vid < new_vi->vid) + continue; + if (vi->vid > new_vi->vid) + break; + vi->usage++; + goto found_vi; + } + + new_vi->srv_link.pprev = pp; + new_vi->srv_link.next = *pp; + if (*pp) + (*pp)->pprev = &new_vi->srv_link.next; + *pp = &new_vi->srv_link; + vi = new_vi; + new_vi = NULL; +found_vi: + + new->vol_interest = vi; + hlist_add_head(&new->cb_vlink, &vi->cb_interests); + + write_unlock(&server->cb_break_lock); + kfree(new_vi); + return new; +} + +/* * Set up an interest-in-callbacks record for a volume on a server and * register it with the server. * - Called with vnode->io_lock held. 
@@ -77,20 +137,10 @@ again: } if (!cbi) { - new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL); + new = afs_create_interest(server, vnode); if (!new) return -ENOMEM; - refcount_set(&new->usage, 1); - new->sb = vnode->vfs_inode.i_sb; - new->vid = vnode->volume->vid; - new->server = afs_get_server(server); - INIT_LIST_HEAD(&new->cb_link); - - write_lock(&server->cb_break_lock); - list_add_tail(&new->cb_link, &server->cb_interests); - write_unlock(&server->cb_break_lock); - write_lock(&slist->lock); if (!entry->cb_interest) { entry->cb_interest = afs_get_cb_interest(new); @@ -126,11 +176,22 @@ again: */ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) { + struct afs_vol_interest *vi; + if (cbi && refcount_dec_and_test(&cbi->usage)) { - if (!list_empty(&cbi->cb_link)) { + if (!hlist_unhashed(&cbi->cb_vlink)) { write_lock(&cbi->server->cb_break_lock); - list_del_init(&cbi->cb_link); + + hlist_del_init(&cbi->cb_vlink); + vi = cbi->vol_interest; + cbi->vol_interest = NULL; + if (--vi->usage == 0) + hlist_del(&vi->srv_link); + else + vi = NULL; + write_unlock(&cbi->server->cb_break_lock); + kfree(vi); afs_put_server(net, cbi->server); } kfree(cbi); @@ -182,20 +243,34 @@ void afs_break_callback(struct afs_vnode *vnode) static void afs_break_one_callback(struct afs_server *server, struct afs_fid *fid) { + struct afs_vol_interest *vi; struct afs_cb_interest *cbi; struct afs_iget_data data; struct afs_vnode *vnode; struct inode *inode; read_lock(&server->cb_break_lock); + hlist_for_each_entry(vi, &server->cb_volumes, srv_link) { + if (vi->vid < fid->vid) + continue; + if (vi->vid > fid->vid) { + vi = NULL; + break; + } + //atomic_inc(&vi->usage); + break; + } + + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ + if (!vi) + goto out; /* Step through all interested superblocks. There may be more than one * because of cell aliasing. 
*/ - list_for_each_entry(cbi, &server->cb_interests, cb_link) { - if (cbi->vid != fid->vid) - continue; - + hlist_for_each_entry(cbi, &vi->cb_interests, cb_vlink) { if (fid->vnode == 0 && fid->unique == 0) { /* The callback break applies to an entire volume. */ struct afs_super_info *as = AFS_FS_S(cbi->sb); @@ -217,6 +292,7 @@ static void afs_break_one_callback(struct afs_server *server, } } +out: read_unlock(&server->cb_break_lock); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index fdf4c36cff79..f3d0bef16d78 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -15,6 +15,7 @@ #include <linux/dns_resolver.h> #include <linux/sched.h> #include <linux/inet.h> +#include <linux/namei.h> #include <keys/rxrpc-type.h> #include "internal.h" @@ -341,8 +342,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) /* install the new cell */ write_seqlock(&net->cells_lock); - old_root = net->ws_cell; - net->ws_cell = new_root; + old_root = rcu_access_pointer(net->ws_cell); + rcu_assign_pointer(net->ws_cell, new_root); write_sequnlock(&net->cells_lock); afs_put_cell(net, old_root); @@ -528,12 +529,14 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) NULL, 0, cell, 0, true); #endif - ret = afs_proc_cell_setup(net, cell); + ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; - spin_lock(&net->proc_cells_lock); + + mutex_lock(&net->proc_cells_lock); list_add_tail(&cell->proc_link, &net->proc_cells); - spin_unlock(&net->proc_cells_lock); + afs_dynroot_mkdir(net, cell); + mutex_unlock(&net->proc_cells_lock); return 0; } @@ -544,11 +547,12 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) { _enter("%s", cell->name); - afs_proc_cell_remove(net, cell); + afs_proc_cell_remove(cell); - spin_lock(&net->proc_cells_lock); + mutex_lock(&net->proc_cells_lock); list_del_init(&cell->proc_link); - spin_unlock(&net->proc_cells_lock); + afs_dynroot_rmdir(net, cell); + mutex_unlock(&net->proc_cells_lock); #ifdef CONFIG_AFS_FSCACHE 
fscache_relinquish_cookie(cell->cache, NULL, false); @@ -755,8 +759,8 @@ void afs_cell_purge(struct afs_net *net) _enter(""); write_seqlock(&net->cells_lock); - ws = net->ws_cell; - net->ws_cell = NULL; + ws = rcu_access_pointer(net->ws_cell); + RCU_INIT_POINTER(net->ws_cell, NULL); write_sequnlock(&net->cells_lock); afs_put_cell(net, ws); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index c332c95a6940..9e51d6fe7e8f 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -191,7 +191,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count > AFSCBMAX) return afs_protocol_error(call, -EBADMSG); - call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); + call->buffer = kmalloc(array3_size(call->count, 3, 4), + GFP_KERNEL); if (!call->buffer) return -ENOMEM; call->offset = 0; @@ -330,7 +331,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) switch (call->unmarshall) { case 0: call->offset = 0; - call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; call->unmarshall++; @@ -453,7 +454,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) switch (call->unmarshall) { case 0: call->offset = 0; - call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; call->unmarshall++; @@ -525,7 +526,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) nifs = 0; ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL); if (ifs) { - nifs = afs_get_ipv4_interfaces(ifs, 32, false); + nifs = afs_get_ipv4_interfaces(call->net, ifs, 32, false); if (nifs < 0) { kfree(ifs); ifs = NULL; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 983f3946ab57..174e843f0633 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -1,4 +1,4 @@ -/* dir.c: AFS dynamic root handling +/* AFS dynamic root handling * * Copyright (C) 2018 
Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry) return 0; } - ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); + ret = dns_query("afsdb", name, len, "", NULL, NULL); if (ret == -ENODATA) ret = -EDESTADDRREQ; return ret; @@ -207,3 +207,125 @@ const struct dentry_operations afs_dynroot_dentry_operations = { .d_release = afs_d_release, .d_automount = afs_d_automount, }; + +/* + * Create a manually added cell mount directory. + * - The caller must hold net->proc_cells_lock + */ +int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) +{ + struct super_block *sb = net->dynroot_sb; + struct dentry *root, *subdir; + int ret; + + if (!sb || atomic_read(&sb->s_active) == 0) + return 0; + + /* Let the ->lookup op do the creation */ + root = sb->s_root; + inode_lock(root->d_inode); + subdir = lookup_one_len(cell->name, root, cell->name_len); + if (IS_ERR(subdir)) { + ret = PTR_ERR(subdir); + goto unlock; + } + + /* Note that we're retaining an extra ref on the dentry */ + subdir->d_fsdata = (void *)1UL; + ret = 0; +unlock: + inode_unlock(root->d_inode); + return ret; +} + +/* + * Remove a manually added cell mount directory. 
+ * - The caller must hold net->proc_cells_lock + */ +void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +{ + struct super_block *sb = net->dynroot_sb; + struct dentry *root, *subdir; + + if (!sb || atomic_read(&sb->s_active) == 0) + return; + + root = sb->s_root; + inode_lock(root->d_inode); + + /* Don't want to trigger a lookup call, which will re-add the cell */ + subdir = try_lookup_one_len(cell->name, root, cell->name_len); + if (IS_ERR_OR_NULL(subdir)) { + _debug("lookup %ld", PTR_ERR(subdir)); + goto no_dentry; + } + + _debug("rmdir %pd %u", subdir, d_count(subdir)); + + if (subdir->d_fsdata) { + _debug("unpin %u", d_count(subdir)); + subdir->d_fsdata = NULL; + dput(subdir); + } + dput(subdir); +no_dentry: + inode_unlock(root->d_inode); + _leave(""); +} + +/* + * Populate a newly created dynamic root with cell names. + */ +int afs_dynroot_populate(struct super_block *sb) +{ + struct afs_cell *cell; + struct afs_net *net = afs_sb2net(sb); + int ret; + + if (mutex_lock_interruptible(&net->proc_cells_lock) < 0) + return -ERESTARTSYS; + + net->dynroot_sb = sb; + list_for_each_entry(cell, &net->proc_cells, proc_link) { + ret = afs_dynroot_mkdir(net, cell); + if (ret < 0) + goto error; + } + + ret = 0; +out: + mutex_unlock(&net->proc_cells_lock); + return ret; + +error: + net->dynroot_sb = NULL; + goto out; +} + +/* + * When a dynamic root that's in the process of being destroyed, depopulate it + * of pinned directories. 
+ */ +void afs_dynroot_depopulate(struct super_block *sb) +{ + struct afs_net *net = afs_sb2net(sb); + struct dentry *root = sb->s_root, *subdir, *tmp; + + /* Prevent more subdirs from being created */ + mutex_lock(&net->proc_cells_lock); + if (net->dynroot_sb == sb) + net->dynroot_sb = NULL; + mutex_unlock(&net->proc_cells_lock); + + inode_lock(root->d_inode); + + /* Remove all the pins for dirs created for manually added cells */ + list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + if (subdir->d_fsdata) { + subdir->d_fsdata = NULL; + dput(subdir); + } + } + + inode_unlock(root->d_inode); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index b273e1d60478..50929cb91732 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -72,7 +72,7 @@ void afs_update_inode_from_status(struct afs_vnode *vnode, const afs_dataversion_t *expected_version, u8 flags) { - struct timespec t; + struct timespec64 t; umode_t mode; t.tv_sec = status->mtime_client; @@ -138,10 +138,6 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, u64 data_version, size; u32 type, abort_code; u8 flags = 0; - int ret; - - if (vnode) - write_seqlock(&vnode->cb_lock); abort_code = ntohl(xdr->abort_code); @@ -154,8 +150,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, * case. 
*/ status->abort_code = abort_code; - ret = 0; - goto out; + return 0; } pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); @@ -164,8 +159,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, if (abort_code != 0 && inline_error) { status->abort_code = abort_code; - ret = 0; - goto out; + return 0; } type = ntohl(xdr->type); @@ -235,17 +229,35 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, flags); } - ret = 0; - -out: - if (vnode) - write_sequnlock(&vnode->cb_lock); - return ret; + return 0; bad: xdr_dump_bad(*_bp); - ret = afs_protocol_error(call, -EBADMSG); - goto out; + return afs_protocol_error(call, -EBADMSG); +} + +/* + * Decode the file status. We need to lock the target vnode if we're going to + * update its status so that stat() sees the attributes update atomically. + */ +static int afs_decode_status(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + int ret; + + if (!vnode) + return xdr_decode_AFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + + write_seqlock(&vnode->cb_lock); + ret = xdr_decode_AFSFetchStatus(call, _bp, status, vnode, + expected_version, read_req); + write_sequnlock(&vnode->cb_lock); + return ret; } /* @@ -387,8 +399,8 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) @@ -568,8 +580,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, 
- &vnode->status.data_version, req) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req) < 0) return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) @@ -721,9 +733,9 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || - xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || + afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -827,8 +839,8 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -917,9 +929,9 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || - xdr_decode_AFSFetchStatus(call, &bp, &dvnode->status, dvnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || + afs_decode_status(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -1004,9 
+1016,9 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) || - xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL) || + afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -1110,12 +1122,12 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &orig_dvnode->status, orig_dvnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); if (new_dvnode != orig_dvnode && - xdr_decode_AFSFetchStatus(call, &bp, &new_dvnode->status, new_dvnode, - &call->expected_version_2, NULL) < 0) + afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -1219,8 +1231,8 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -1395,8 +1407,8 @@ static int afs_deliver_fs_store_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - 
if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, - &call->expected_version, NULL) < 0) + if (afs_decode_status(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -2097,8 +2109,8 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(call, &bp, status, vnode, - &call->expected_version, NULL); + afs_decode_status(call, &bp, status, vnode, + &call->expected_version, NULL); callback[call->count].version = ntohl(bp[0]); callback[call->count].expiry = ntohl(bp[1]); callback[call->count].type = ntohl(bp[2]); @@ -2209,9 +2221,9 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) bp = call->buffer; statuses = call->reply[1]; - if (xdr_decode_AFSFetchStatus(call, &bp, &statuses[call->count], - call->count == 0 ? vnode : NULL, - NULL, NULL) < 0) + if (afs_decode_status(call, &bp, &statuses[call->count], + call->count == 0 ? 
vnode : NULL, + NULL, NULL) < 0) return afs_protocol_error(call, -EBADMSG); call->count++; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e3f8a46663db..9778df135717 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -22,6 +22,8 @@ #include <linux/backing-dev.h> #include <linux/uuid.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/sock.h> #include <net/af_rxrpc.h> #include "afs.h" @@ -40,7 +42,8 @@ struct afs_mount_params { afs_voltype_t type; /* type of volume requested */ int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ - struct afs_net *net; /* Network namespace in effect */ + struct net *net_ns; /* Network namespace in effect */ + struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ @@ -189,7 +192,7 @@ struct afs_read { * - there's one superblock per volume */ struct afs_super_info { - struct afs_net *net; /* Network namespace */ + struct net *net_ns; /* Network namespace */ struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ bool dyn_root; /* True if dynamic root */ @@ -210,7 +213,6 @@ struct afs_sysnames { char *subs[AFS_NR_SYSNAME]; refcount_t usage; unsigned short nr; - short error; char blank[1]; }; @@ -218,6 +220,7 @@ struct afs_sysnames { * AFS network namespace record. 
*/ struct afs_net { + struct net *net; /* Backpointer to the owning net namespace */ struct afs_uuid uuid; bool live; /* F if this namespace is being removed */ @@ -231,13 +234,13 @@ struct afs_net { /* Cell database */ struct rb_root cells; - struct afs_cell *ws_cell; + struct afs_cell __rcu *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; seqlock_t cells_lock; - spinlock_t proc_cells_lock; + struct mutex proc_cells_lock; struct list_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one @@ -261,6 +264,7 @@ struct afs_net { struct mutex lock_manager_mutex; /* Misc */ + struct super_block *dynroot_sb; /* Dynamic root mount superblock */ struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; @@ -280,7 +284,6 @@ struct afs_net { }; extern const char afs_init_sysname[]; -extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns enum afs_cell_state { AFS_CELL_UNSET, @@ -404,16 +407,27 @@ struct afs_server { rwlock_t fs_lock; /* access lock */ /* callback promise management */ - struct list_head cb_interests; /* List of superblocks using this server */ + struct hlist_head cb_volumes; /* List of volume interests on this server */ unsigned cb_s_break; /* Break-everything counter. */ rwlock_t cb_break_lock; /* Volume finding lock */ }; /* + * Volume collation in the server's callback interest list. + */ +struct afs_vol_interest { + struct hlist_node srv_link; /* Link in server->cb_volumes */ + struct hlist_head cb_interests; /* List of callback interests on the server */ + afs_volid_t vid; /* Volume ID to match */ + unsigned int usage; +}; + +/* * Interest by a superblock on a server. 
*/ struct afs_cb_interest { - struct list_head cb_link; /* Link in server->cb_interests */ + struct hlist_node cb_vlink; /* Link in vol_interest->cb_interests */ + struct afs_vol_interest *vol_interest; struct afs_server *server; /* Server on which this interest resides */ struct super_block *sb; /* Superblock on which inodes reside */ afs_volid_t vid; /* Volume ID to match */ @@ -720,6 +734,10 @@ extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); +extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); +extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); +extern int afs_dynroot_populate(struct super_block *); +extern void afs_dynroot_depopulate(struct super_block *); /* * file.c @@ -806,34 +824,36 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; +extern int afs_net_id; -static inline struct afs_net *afs_d2net(struct dentry *dentry) +static inline struct afs_net *afs_net(struct net *net) { - return &__afs_net; + return net_generic(net, afs_net_id); } -static inline struct afs_net *afs_i2net(struct inode *inode) +static inline struct afs_net *afs_sb2net(struct super_block *sb) { - return &__afs_net; + return afs_net(AFS_FS_S(sb)->net_ns); } -static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) +static inline struct afs_net *afs_d2net(struct dentry *dentry) { - return &__afs_net; + return afs_sb2net(dentry->d_sb); } -static inline struct afs_net *afs_sock2net(struct sock *sk) +static inline struct afs_net *afs_i2net(struct inode *inode) { - return &__afs_net; + return afs_sb2net(inode->i_sb); } -static inline struct afs_net *afs_get_net(struct afs_net *net) +static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { - return net; + return afs_i2net(&vnode->vfs_inode); } -static inline void afs_put_net(struct afs_net 
*net) +static inline struct afs_net *afs_sock2net(struct sock *sk) { + return net_generic(sock_net(sk), afs_net_id); } static inline void __afs_stat(atomic_t *s) @@ -861,16 +881,25 @@ extern void afs_mntpt_kill_timer(void); /* * netdevices.c */ -extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); +extern int afs_get_ipv4_interfaces(struct afs_net *, struct afs_interface *, + size_t, bool); /* * proc.c */ +#ifdef CONFIG_PROC_FS extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); -extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); -extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); +extern int afs_proc_cell_setup(struct afs_cell *); +extern void afs_proc_cell_remove(struct afs_cell *); extern void afs_put_sysnames(struct afs_sysnames *); +#else +static inline int afs_proc_init(struct afs_net *net) { return 0; } +static inline void afs_proc_cleanup(struct afs_net *net) {} +static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; } +static inline void afs_proc_cell_remove(struct afs_cell *cell) {} +static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} +#endif /* * rotate.c @@ -1002,7 +1031,7 @@ extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server * super.c */ extern int __init afs_fs_init(void); -extern void __exit afs_fs_exit(void); +extern void afs_fs_exit(void); /* * vlclient.c diff --git a/fs/afs/main.c b/fs/afs/main.c index d7560168b3bf..e84fe822a960 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -15,6 +15,7 @@ #include <linux/completion.h> #include <linux/sched.h> #include <linux/random.h> +#include <linux/proc_fs.h> #define CREATE_TRACE_POINTS #include "internal.h" @@ -32,7 +33,7 @@ module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; -struct afs_net __afs_net; +static struct proc_dir_entry 
*afs_proc_symlink; #if defined(CONFIG_ALPHA) const char afs_init_sysname[] = "alpha_linux26"; @@ -67,11 +68,13 @@ const char afs_init_sysname[] = "unknown_linux26"; /* * Initialise an AFS network namespace record. */ -static int __net_init afs_net_init(struct afs_net *net) +static int __net_init afs_net_init(struct net *net_ns) { struct afs_sysnames *sysnames; + struct afs_net *net = afs_net(net_ns); int ret; + net->net = net_ns; net->live = true; generate_random_uuid((unsigned char *)&net->uuid); @@ -83,7 +86,7 @@ static int __net_init afs_net_init(struct afs_net *net) INIT_WORK(&net->cells_manager, afs_manage_cells); timer_setup(&net->cells_timer, afs_cells_timer, 0); - spin_lock_init(&net->proc_cells_lock); + mutex_init(&net->proc_cells_lock); INIT_LIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); @@ -142,8 +145,10 @@ error_sysnames: /* * Clean up and destroy an AFS network namespace record. */ -static void __net_exit afs_net_exit(struct afs_net *net) +static void __net_exit afs_net_exit(struct net *net_ns) { + struct afs_net *net = afs_net(net_ns); + net->live = false; afs_cell_purge(net); afs_purge_servers(net); @@ -152,6 +157,13 @@ static void __net_exit afs_net_exit(struct afs_net *net) afs_put_sysnames(net->sysnames); } +static struct pernet_operations afs_net_ops = { + .init = afs_net_init, + .exit = afs_net_exit, + .id = &afs_net_id, + .size = sizeof(struct afs_net), +}; + /* * initialise the AFS client FS module */ @@ -178,7 +190,7 @@ static int __init afs_init(void) goto error_cache; #endif - ret = afs_net_init(&__afs_net); + ret = register_pernet_subsys(&afs_net_ops); if (ret < 0) goto error_net; @@ -187,10 +199,18 @@ static int __init afs_init(void) if (ret < 0) goto error_fs; + afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); + if (IS_ERR(afs_proc_symlink)) { + ret = PTR_ERR(afs_proc_symlink); + goto error_proc; + } + return ret; +error_proc: + afs_fs_exit(); error_fs: - afs_net_exit(&__afs_net); + 
unregister_pernet_subsys(&afs_net_ops); error_net: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); @@ -219,8 +239,9 @@ static void __exit afs_exit(void) { printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); + proc_remove(afs_proc_symlink); afs_fs_exit(); - afs_net_exit(&__afs_net); + unregister_pernet_subsys(&afs_net_ops); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); #endif diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 50bd5bb1c4fb..2a009d1939d7 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -17,8 +17,8 @@ * - maxbufs must be at least 1 * - returns the number of interface records in the buffer */ -int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, - bool wantloopback) +int afs_get_ipv4_interfaces(struct afs_net *net, struct afs_interface *bufs, + size_t maxbufs, bool wantloopback) { struct net_device *dev; struct in_device *idev; @@ -27,7 +27,7 @@ int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, ASSERT(maxbufs > 0); rtnl_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(net->net, dev) { if (dev->type == ARPHRD_LOOPBACK && !wantloopback) continue; idev = __in_dev_get_rtnl(dev); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 839a22280606..0c3285c8db95 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -17,274 +17,78 @@ #include <linux/uaccess.h> #include "internal.h" -static inline struct afs_net *afs_proc2net(struct file *f) -{ - return &__afs_net; -} - static inline struct afs_net *afs_seq2net(struct seq_file *m) { - return &__afs_net; // TODO: use seq_file_net(m) + return afs_net(seq_file_net(m)); } -static int afs_proc_cells_open(struct inode *inode, struct file *file); -static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos); -static void afs_proc_cells_stop(struct seq_file *p, void *v); -static int afs_proc_cells_show(struct 
seq_file *m, void *v); -static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, - size_t size, loff_t *_pos); - -static const struct seq_operations afs_proc_cells_ops = { - .start = afs_proc_cells_start, - .next = afs_proc_cells_next, - .stop = afs_proc_cells_stop, - .show = afs_proc_cells_show, -}; - -static const struct file_operations afs_proc_cells_fops = { - .open = afs_proc_cells_open, - .read = seq_read, - .write = afs_proc_cells_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, - size_t size, loff_t *_pos); -static ssize_t afs_proc_rootcell_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos); - -static const struct file_operations afs_proc_rootcell_fops = { - .read = afs_proc_rootcell_read, - .write = afs_proc_rootcell_write, - .llseek = no_llseek, -}; - -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); -static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_volumes_ops = { - .start = afs_proc_cell_volumes_start, - .next = afs_proc_cell_volumes_next, - .stop = afs_proc_cell_volumes_stop, - .show = afs_proc_cell_volumes_show, -}; - -static const struct file_operations afs_proc_cell_volumes_fops = { - .open = afs_proc_cell_volumes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_cell_vlservers_open(struct inode *inode, - struct file *file); -static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, - loff_t *pos); -static void 
afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); -static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_vlservers_ops = { - .start = afs_proc_cell_vlservers_start, - .next = afs_proc_cell_vlservers_next, - .stop = afs_proc_cell_vlservers_stop, - .show = afs_proc_cell_vlservers_show, -}; - -static const struct file_operations afs_proc_cell_vlservers_fops = { - .open = afs_proc_cell_vlservers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_servers_open(struct inode *inode, struct file *file); -static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_servers_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_servers_stop(struct seq_file *p, void *v); -static int afs_proc_servers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_servers_ops = { - .start = afs_proc_servers_start, - .next = afs_proc_servers_next, - .stop = afs_proc_servers_stop, - .show = afs_proc_servers_show, -}; - -static const struct file_operations afs_proc_servers_fops = { - .open = afs_proc_servers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_sysname_open(struct inode *inode, struct file *file); -static int afs_proc_sysname_release(struct inode *inode, struct file *file); -static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_sysname_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_sysname_stop(struct seq_file *p, void *v); -static int afs_proc_sysname_show(struct seq_file *m, void *v); -static ssize_t afs_proc_sysname_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos); - -static const struct seq_operations afs_proc_sysname_ops = { - .start = afs_proc_sysname_start, - .next = afs_proc_sysname_next, - .stop = afs_proc_sysname_stop, - 
.show = afs_proc_sysname_show, -}; - -static const struct file_operations afs_proc_sysname_fops = { - .open = afs_proc_sysname_open, - .read = seq_read, - .llseek = seq_lseek, - .release = afs_proc_sysname_release, - .write = afs_proc_sysname_write, -}; - -static const struct file_operations afs_proc_stats_fops; - -/* - * initialise the /proc/fs/afs/ directory - */ -int afs_proc_init(struct afs_net *net) +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) { - _enter(""); - - net->proc_afs = proc_mkdir("fs/afs", NULL); - if (!net->proc_afs) - goto error_dir; - - if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || - !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || - !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || - !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) - goto error_tree; - - _leave(" = 0"); - return 0; - -error_tree: - proc_remove(net->proc_afs); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * clean up the /proc/fs/afs/ directory - */ -void afs_proc_cleanup(struct afs_net *net) -{ - proc_remove(net->proc_afs); - net->proc_afs = NULL; + return afs_net(seq_file_single_net(m)); } /* - * open "/proc/fs/afs/cells" which provides a summary of extant cells + * Display the list of cells known to the namespace. 
*/ -static int afs_proc_cells_open(struct inode *inode, struct file *file) +static int afs_proc_cells_show(struct seq_file *m, void *v) { - struct seq_file *m; - int ret; + struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); + struct afs_net *net = afs_seq2net(m); - ret = seq_open(file, &afs_proc_cells_ops); - if (ret < 0) - return ret; + if (v == &net->proc_cells) { + /* display header on line 1 */ + seq_puts(m, "USE NAME\n"); + return 0; + } - m = file->private_data; - m->private = PDE_DATA(inode); + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); return 0; } -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { - struct afs_net *net = afs_seq2net(m); - rcu_read_lock(); - return seq_list_start_head(&net->proc_cells, *_pos); + return seq_list_start_head(&afs_seq2net(m)->proc_cells, *_pos); } -/* - * move to next cell in cells list - */ static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - struct afs_net *net = afs_seq2net(m); - - return seq_list_next(v, &net->proc_cells, pos); + return seq_list_next(v, &afs_seq2net(m)->proc_cells, pos); } -/* - * clean up after reading from the cells list - */ static void afs_proc_cells_stop(struct seq_file *m, void *v) __releases(rcu) { rcu_read_unlock(); } -/* - * display a header line followed by a load of cell lines - */ -static int afs_proc_cells_show(struct seq_file *m, void *v) -{ - struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); - struct afs_net *net = afs_seq2net(m); - - if (v == &net->proc_cells) { - /* display header on line 1 */ - seq_puts(m, "USE NAME\n"); - return 0; - } - - /* display one cell per line on subsequent lines */ - seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name); - return 0; -} +static const struct seq_operations 
afs_proc_cells_ops = { + .start = afs_proc_cells_start, + .next = afs_proc_cells_next, + .stop = afs_proc_cells_stop, + .show = afs_proc_cells_show, +}; /* * handle writes to /proc/fs/afs/cells * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]" */ -static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, - size_t size, loff_t *_pos) +static int afs_proc_cells_write(struct file *file, char *buf, size_t size) { - struct afs_net *net = afs_proc2net(file); - char *kbuf, *name, *args; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + char *name, *args; int ret; - /* start by dragging the command into memory */ - if (size <= 1 || size >= PAGE_SIZE) - return -EINVAL; - - kbuf = memdup_user_nul(buf, size); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - /* trim to first NL */ - name = memchr(kbuf, '\n', size); + name = memchr(buf, '\n', size); if (name) *name = 0; /* split into command, name and argslist */ - name = strchr(kbuf, ' '); + name = strchr(buf, ' '); if (!name) goto inval; do { @@ -303,9 +107,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto inval; /* determine command to perform */ - _debug("cmd=%s name=%s args=%s", kbuf, name, args); + _debug("cmd=%s name=%s args=%s", buf, name, args); - if (strcmp(kbuf, "add") == 0) { + if (strcmp(buf, "add") == 0) { struct afs_cell *cell; cell = afs_lookup_cell(net, name, strlen(name), args, true); @@ -321,10 +125,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto inval; } - ret = size; + ret = 0; done: - kfree(kbuf); _leave(" = %d", ret); return ret; @@ -334,251 +137,141 @@ inval: goto done; } -static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, - size_t size, loff_t *_pos) +/* + * Display the name of the current workstation cell. 
+ */ +static int afs_proc_rootcell_show(struct seq_file *m, void *v) { struct afs_cell *cell; - struct afs_net *net = afs_proc2net(file); - unsigned int seq = 0; - char name[AFS_MAXCELLNAME + 1]; - int len; - - if (*_pos > 0) - return 0; - if (!net->ws_cell) - return 0; - - rcu_read_lock(); - do { - read_seqbegin_or_lock(&net->cells_lock, &seq); - len = 0; - cell = rcu_dereference_raw(net->ws_cell); - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len); - } - } while (need_seqretry(&net->cells_lock, seq)); - done_seqretry(&net->cells_lock, seq); - rcu_read_unlock(); - - if (!len) - return 0; - - name[len++] = '\n'; - if (len > size) - len = size; - if (copy_to_user(buf, name, len) != 0) - return -EFAULT; - *_pos = 1; - return len; + struct afs_net *net; + + net = afs_seq2net_single(m); + if (rcu_access_pointer(net->ws_cell)) { + rcu_read_lock(); + cell = rcu_dereference(net->ws_cell); + if (cell) + seq_printf(m, "%s\n", cell->name); + rcu_read_unlock(); + } + return 0; } /* - * handle writes to /proc/fs/afs/rootcell - * - to initialize rootcell: echo "cell.name:192.168.231.14" + * Set the current workstation cell and optionally supply its list of volume + * location servers. 
+ * + * echo "cell.name:192.168.231.14" >/proc/fs/afs/rootcell */ -static ssize_t afs_proc_rootcell_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos) +static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) { - struct afs_net *net = afs_proc2net(file); - char *kbuf, *s; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + char *s; int ret; - /* start by dragging the command into memory */ - if (size <= 1 || size >= PAGE_SIZE) - return -EINVAL; - - kbuf = memdup_user_nul(buf, size); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ret = -EINVAL; - if (kbuf[0] == '.') + if (buf[0] == '.') goto out; - if (memchr(kbuf, '/', size)) + if (memchr(buf, '/', size)) goto out; /* trim to first NL */ - s = memchr(kbuf, '\n', size); + s = memchr(buf, '\n', size); if (s) *s = 0; /* determine command to perform */ - _debug("rootcell=%s", kbuf); + _debug("rootcell=%s", buf); - ret = afs_cell_init(net, kbuf); - if (ret >= 0) - ret = size; /* consume everything, always */ + ret = afs_cell_init(net, buf); out: - kfree(kbuf); _leave(" = %d", ret); return ret; } -/* - * initialise /proc/fs/afs/<cell>/ - */ -int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell) -{ - struct proc_dir_entry *dir; - - _enter("%p{%s},%p", cell, cell->name, net->proc_afs); - - dir = proc_mkdir(cell->name, net->proc_afs); - if (!dir) - goto error_dir; - - if (!proc_create_data("vlservers", 0, dir, - &afs_proc_cell_vlservers_fops, cell) || - !proc_create_data("volumes", 0, dir, - &afs_proc_cell_volumes_fops, cell)) - goto error_tree; - - _leave(" = 0"); - return 0; - -error_tree: - remove_proc_subtree(cell->name, net->proc_afs); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * remove /proc/fs/afs/<cell>/ - */ -void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell) -{ - _enter(""); - - remove_proc_subtree(cell->name, net->proc_afs); - - _leave(""); -} +static const char 
afs_vol_types[3][3] = { + [AFSVL_RWVOL] = "RW", + [AFSVL_ROVOL] = "RO", + [AFSVL_BACKVOL] = "BK", +}; /* - * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells + * Display the list of volumes known to a cell. */ -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); - ret = seq_open(file, &afs_proc_cell_volumes_ops); - if (ret < 0) - return ret; + /* Display header on line 1 */ + if (v == &cell->proc_volumes) { + seq_puts(m, "USE VID TY\n"); + return 0; + } - m = file->private_data; - m->private = cell; + seq_printf(m, "%3d %08x %s\n", + atomic_read(&vol->usage), vol->vid, + afs_vol_types[vol->type]); return 0; } -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = m->private; - - _enter("cell=%p pos=%Ld", cell, *_pos); + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); read_lock(&cell->proc_lock); return seq_list_start_head(&cell->proc_volumes, *_pos); } -/* - * move to next cell in cells list - */ -static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, +static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); - _enter("cell=%p pos=%Ld", cell, *_pos); return seq_list_next(v, &cell->proc_volumes, _pos); } -/* - * clean up after reading from the cells list - */ -static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) +static void afs_proc_cell_volumes_stop(struct seq_file *m, void 
*v) __releases(cell->proc_lock) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); read_unlock(&cell->proc_lock); } -static const char afs_vol_types[3][3] = { - [AFSVL_RWVOL] = "RW", - [AFSVL_ROVOL] = "RO", - [AFSVL_BACKVOL] = "BK", +static const struct seq_operations afs_proc_cell_volumes_ops = { + .start = afs_proc_cell_volumes_start, + .next = afs_proc_cell_volumes_next, + .stop = afs_proc_cell_volumes_stop, + .show = afs_proc_cell_volumes_show, }; /* - * display a header line followed by a load of volume lines + * Display the list of Volume Location servers we're using for a cell. */ -static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; - struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); + struct sockaddr_rxrpc *addr = v; - /* Display header on line 1 */ - if (v == &cell->proc_volumes) { - seq_puts(m, "USE VID TY\n"); + /* display header on line 1 */ + if (v == (void *)1) { + seq_puts(m, "ADDRESS\n"); return 0; } - seq_printf(m, "%3d %08x %s\n", - atomic_read(&vol->usage), vol->vid, - afs_vol_types[vol->type]); - - return 0; -} - -/* - * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume - * location server - */ -static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) -{ - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_vlservers_ops); - if (ret<0) - return ret; - - m = file->private_data; - m->private = cell; - + /* display one cell per line on subsequent lines */ + seq_printf(m, "%pISp\n", &addr->transport); return 0; } -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { struct 
afs_addr_list *alist; - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); @@ -596,14 +289,11 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) return alist->addrs + pos; } -/* - * move to next cell in cells list - */ -static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, +static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v, loff_t *_pos) { struct afs_addr_list *alist; - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos; alist = rcu_dereference(cell->vl_addrs); @@ -616,170 +306,145 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, return alist->addrs + pos; } -/* - * clean up after reading from the cells list - */ -static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) +static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v) __releases(rcu) { rcu_read_unlock(); } +static const struct seq_operations afs_proc_cell_vlservers_ops = { + .start = afs_proc_cell_vlservers_start, + .next = afs_proc_cell_vlservers_next, + .stop = afs_proc_cell_vlservers_stop, + .show = afs_proc_cell_vlservers_show, +}; + /* - * display a header line followed by a load of volume lines + * Display the list of fileservers we're using within a namespace. 
*/ -static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) +static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct sockaddr_rxrpc *addr = v; + struct afs_server *server; + struct afs_addr_list *alist; + int i; - /* display header on line 1 */ - if (v == (void *)1) { - seq_puts(m, "ADDRESS\n"); + if (v == SEQ_START_TOKEN) { + seq_puts(m, "UUID USE ADDR\n"); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%pISp\n", &addr->transport); + server = list_entry(v, struct afs_server, proc_link); + alist = rcu_dereference(server->addresses); + seq_printf(m, "%pU %3d %pISpc%s\n", + &server->uuid, + atomic_read(&server->usage), + &alist->addrs[0].transport, + alist->index == 0 ? "*" : ""); + for (i = 1; i < alist->nr_addrs; i++) + seq_printf(m, " %pISpc%s\n", + &alist->addrs[i].transport, + alist->index == i ? "*" : ""); return 0; } -/* - * open "/proc/fs/afs/servers" which provides a summary of active - * servers - */ -static int afs_proc_servers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &afs_proc_servers_ops); -} - -/* - * Set up the iterator to start reading from the server list and return the - * first item. 
- */ static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { - struct afs_net *net = afs_seq2net(m); - rcu_read_lock(); - return seq_hlist_start_head_rcu(&net->fs_proc, *_pos); + return seq_hlist_start_head_rcu(&afs_seq2net(m)->fs_proc, *_pos); } -/* - * move to next cell in cells list - */ static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_net *net = afs_seq2net(m); - - return seq_hlist_next_rcu(v, &net->fs_proc, _pos); + return seq_hlist_next_rcu(v, &afs_seq2net(m)->fs_proc, _pos); } -/* - * clean up after reading from the cells list - */ -static void afs_proc_servers_stop(struct seq_file *p, void *v) +static void afs_proc_servers_stop(struct seq_file *m, void *v) __releases(rcu) { rcu_read_unlock(); } +static const struct seq_operations afs_proc_servers_ops = { + .start = afs_proc_servers_start, + .next = afs_proc_servers_next, + .stop = afs_proc_servers_stop, + .show = afs_proc_servers_show, +}; + /* - * display a header line followed by a load of volume lines + * Display the list of strings that may be substituted for the @sys pathname + * macro. 
*/ -static int afs_proc_servers_show(struct seq_file *m, void *v) +static int afs_proc_sysname_show(struct seq_file *m, void *v) { - struct afs_server *server; - struct afs_addr_list *alist; - - if (v == SEQ_START_TOKEN) { - seq_puts(m, "UUID USE ADDR\n"); - return 0; - } + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *sysnames = net->sysnames; + unsigned int i = (unsigned long)v - 1; - server = list_entry(v, struct afs_server, proc_link); - alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %pISp\n", - &server->uuid, - atomic_read(&server->usage), - &alist->addrs[alist->index].transport); + if (i < sysnames->nr) + seq_printf(m, "%s\n", sysnames->subs[i]); return 0; } -void afs_put_sysnames(struct afs_sysnames *sysnames) +static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) + __acquires(&net->sysnames_lock) { - int i; + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names; - if (sysnames && refcount_dec_and_test(&sysnames->usage)) { - for (i = 0; i < sysnames->nr; i++) - if (sysnames->subs[i] != afs_init_sysname && - sysnames->subs[i] != sysnames->blank) - kfree(sysnames->subs[i]); - } + read_lock(&net->sysnames_lock); + + names = net->sysnames; + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); } -/* - * Handle opening of /proc/fs/afs/sysname. If it is opened for writing, we - * assume the caller wants to change the substitution list and we allocate a - * buffer to hold the list. 
- */ -static int afs_proc_sysname_open(struct inode *inode, struct file *file) +static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) { - struct afs_sysnames *sysnames; - struct seq_file *m; - int ret; - - ret = seq_open(file, &afs_proc_sysname_ops); - if (ret < 0) - return ret; + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; - if (file->f_mode & FMODE_WRITE) { - sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); - if (!sysnames) { - seq_release(inode, file); - return -ENOMEM; - } + *pos += 1; + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} - refcount_set(&sysnames->usage, 1); - m = file->private_data; - m->private = sysnames; - } +static void afs_proc_sysname_stop(struct seq_file *m, void *v) + __releases(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); - return 0; + read_unlock(&net->sysnames_lock); } +static const struct seq_operations afs_proc_sysname_ops = { + .start = afs_proc_sysname_start, + .next = afs_proc_sysname_next, + .stop = afs_proc_sysname_stop, + .show = afs_proc_sysname_show, +}; + /* - * Handle writes to /proc/fs/afs/sysname to set the @sys substitution. + * Allow the @sys substitution to be configured. 
*/ -static ssize_t afs_proc_sysname_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos) +static int afs_proc_sysname_write(struct file *file, char *buf, size_t size) { - struct afs_sysnames *sysnames; + struct afs_sysnames *sysnames, *kill; struct seq_file *m = file->private_data; - char *kbuf = NULL, *s, *p, *sub; + struct afs_net *net = afs_seq2net(m); + char *s, *p, *sub; int ret, len; - sysnames = m->private; + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); if (!sysnames) - return -EINVAL; - if (sysnames->error) - return sysnames->error; - - if (size >= PAGE_SIZE - 1) { - sysnames->error = -EINVAL; - return -EINVAL; - } - if (size == 0) - return 0; - - kbuf = memdup_user_nul(buf, size); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - inode_lock(file_inode(file)); + return -ENOMEM; + refcount_set(&sysnames->usage, 1); + kill = sysnames; - p = kbuf; + p = buf; while ((s = strsep(&p, " \t\n"))) { len = strlen(s); if (len == 0) @@ -820,85 +485,36 @@ static ssize_t afs_proc_sysname_write(struct file *file, sysnames->nr++; } - ret = size; /* consume everything, always */ + if (sysnames->nr == 0) { + sysnames->subs[0] = sysnames->blank; + sysnames->nr++; + } + + write_lock(&net->sysnames_lock); + kill = net->sysnames; + net->sysnames = sysnames; + write_unlock(&net->sysnames_lock); + ret = 0; out: - inode_unlock(file_inode(file)); - kfree(kbuf); + afs_put_sysnames(kill); return ret; invalid: ret = -EINVAL; error: - sysnames->error = ret; goto out; } -static int afs_proc_sysname_release(struct inode *inode, struct file *file) +void afs_put_sysnames(struct afs_sysnames *sysnames) { - struct afs_sysnames *sysnames, *kill = NULL; - struct seq_file *m = file->private_data; - struct afs_net *net = afs_seq2net(m); + int i; - sysnames = m->private; - if (sysnames) { - if (!sysnames->error) { - kill = sysnames; - if (sysnames->nr == 0) { - sysnames->subs[0] = sysnames->blank; - sysnames->nr++; - } - write_lock(&net->sysnames_lock); - kill = 
net->sysnames; - net->sysnames = sysnames; - write_unlock(&net->sysnames_lock); - } - afs_put_sysnames(kill); + if (sysnames && refcount_dec_and_test(&sysnames->usage)) { + for (i = 0; i < sysnames->nr; i++) + if (sysnames->subs[i] != afs_init_sysname && + sysnames->subs[i] != sysnames->blank) + kfree(sysnames->subs[i]); } - - return seq_release(inode, file); -} - -static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) - __acquires(&net->sysnames_lock) -{ - struct afs_net *net = afs_seq2net(m); - struct afs_sysnames *names = net->sysnames; - - read_lock(&net->sysnames_lock); - - if (*pos >= names->nr) - return NULL; - return (void *)(unsigned long)(*pos + 1); -} - -static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct afs_net *net = afs_seq2net(m); - struct afs_sysnames *names = net->sysnames; - - *pos += 1; - if (*pos >= names->nr) - return NULL; - return (void *)(unsigned long)(*pos + 1); -} - -static void afs_proc_sysname_stop(struct seq_file *m, void *v) - __releases(&net->sysnames_lock) -{ - struct afs_net *net = afs_seq2net(m); - - read_unlock(&net->sysnames_lock); -} - -static int afs_proc_sysname_show(struct seq_file *m, void *v) -{ - struct afs_net *net = afs_seq2net(m); - struct afs_sysnames *sysnames = net->sysnames; - unsigned int i = (unsigned long)v - 1; - - if (i < sysnames->nr) - seq_printf(m, "%s\n", sysnames->subs[i]); - return 0; } /* @@ -906,7 +522,7 @@ static int afs_proc_sysname_show(struct seq_file *m, void *v) */ static int afs_proc_stats_show(struct seq_file *m, void *v) { - struct afs_net *net = afs_seq2net(m); + struct afs_net *net = afs_seq2net_single(m); seq_puts(m, "kAFS statistics\n"); @@ -933,16 +549,99 @@ static int afs_proc_stats_show(struct seq_file *m, void *v) } /* - * Open "/proc/fs/afs/stats" to allow reading of the stat counters. 
+ * initialise /proc/fs/afs/<cell>/ */ -static int afs_proc_stats_open(struct inode *inode, struct file *file) +int afs_proc_cell_setup(struct afs_cell *cell) { - return single_open(file, afs_proc_stats_show, NULL); + struct proc_dir_entry *dir; + struct afs_net *net = cell->net; + + _enter("%p{%s},%p", cell, cell->name, net->proc_afs); + + dir = proc_net_mkdir(net->net, cell->name, net->proc_afs); + if (!dir) + goto error_dir; + + if (!proc_create_net_data("vlservers", 0444, dir, + &afs_proc_cell_vlservers_ops, + sizeof(struct seq_net_private), + cell) || + !proc_create_net_data("volumes", 0444, dir, + &afs_proc_cell_volumes_ops, + sizeof(struct seq_net_private), + cell)) + goto error_tree; + + _leave(" = 0"); + return 0; + +error_tree: + remove_proc_subtree(cell->name, net->proc_afs); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; } -static const struct file_operations afs_proc_stats_fops = { - .open = afs_proc_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +/* + * remove /proc/fs/afs/<cell>/ + */ +void afs_proc_cell_remove(struct afs_cell *cell) +{ + struct afs_net *net = cell->net; + + _enter(""); + remove_proc_subtree(cell->name, net->proc_afs); + _leave(""); +} + +/* + * initialise the /proc/fs/afs/ directory + */ +int afs_proc_init(struct afs_net *net) +{ + struct proc_dir_entry *p; + + _enter(""); + + p = proc_net_mkdir(net->net, "afs", net->net->proc_net); + if (!p) + goto error_dir; + + if (!proc_create_net_data_write("cells", 0644, p, + &afs_proc_cells_ops, + afs_proc_cells_write, + sizeof(struct seq_net_private), + NULL) || + !proc_create_net_single_write("rootcell", 0644, p, + afs_proc_rootcell_show, + afs_proc_rootcell_write, + NULL) || + !proc_create_net("servers", 0444, p, &afs_proc_servers_ops, + sizeof(struct seq_net_private)) || + !proc_create_net_single("stats", 0444, p, afs_proc_stats_show, NULL) || + !proc_create_net_data_write("sysname", 0644, p, + &afs_proc_sysname_ops, + 
afs_proc_sysname_write, + sizeof(struct seq_net_private), + NULL)) + goto error_tree; + + net->proc_afs = p; + _leave(" = 0"); + return 0; + +error_tree: + proc_remove(p); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/afs/ directory + */ +void afs_proc_cleanup(struct afs_net *net) +{ + proc_remove(net->proc_afs); + net->proc_afs = NULL; +} diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index e065bc0768e6..1faef56b12bd 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -310,6 +310,10 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case -ETIME: _debug("no conn"); goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + goto failed; } restart_from_beginning: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 08735948f15d..a1b18082991b 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -46,7 +46,7 @@ int afs_open_socket(struct afs_net *net) _enter(""); - ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); + ret = sock_create_kern(net->net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); if (ret < 0) goto error_1; diff --git a/fs/afs/security.c b/fs/afs/security.c index 1992b0ffa543..81dfedb7879f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -372,18 +372,14 @@ int afs_permission(struct inode *inode, int mask) mask, access, S_ISDIR(inode->i_mode) ? 
"dir" : "file"); if (S_ISDIR(inode->i_mode)) { - if (mask & MAY_EXEC) { + if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; - } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_LOOKUP)) - goto permission_denied; - } else if (mask & MAY_WRITE) { + } + if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; - } else { - BUG(); } } else { if (!(access & AFS_ACE_LOOKUP)) diff --git a/fs/afs/server.c b/fs/afs/server.c index 3af4625e2f8c..1d329e6981d5 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -228,7 +228,7 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, server->flags = (1UL << AFS_SERVER_FL_NEW); server->update_at = ktime_get_real_seconds() + afs_server_update_delay; rwlock_init(&server->fs_lock); - INIT_LIST_HEAD(&server->cb_interests); + INIT_HLIST_HEAD(&server->cb_volumes); rwlock_init(&server->cb_break_lock); afs_inc_servers_outstanding(net); diff --git a/fs/afs/super.c b/fs/afs/super.c index 9e5d7966621c..4d3e274207fb 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -48,6 +48,8 @@ struct file_system_type afs_fs_type = { }; MODULE_ALIAS_FS("afs"); +int afs_net_id; + static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, @@ -117,7 +119,7 @@ int __init afs_fs_init(void) /* * clean up the filesystem */ -void __exit afs_fs_exit(void) +void afs_fs_exit(void) { _enter(""); @@ -351,14 +353,19 @@ static int afs_test_super(struct super_block *sb, void *data) struct afs_super_info *as1 = data; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net == as1->net && + return (as->net_ns == as1->net_ns && as->volume && - as->volume->vid == as1->volume->vid); + as->volume->vid == as1->volume->vid && + !as->dyn_root); } static int afs_dynroot_test_super(struct super_block *sb, void *data) { - return false; + struct 
afs_super_info *as1 = data; + struct afs_super_info *as = AFS_FS_S(sb); + + return (as->net_ns == as1->net_ns && + as->dyn_root); } static int afs_set_super(struct super_block *sb, void *data) @@ -418,10 +425,14 @@ static int afs_fill_super(struct super_block *sb, if (!sb->s_root) goto error; - if (params->dyn_root) + if (as->dyn_root) { sb->s_d_op = &afs_dynroot_dentry_operations; - else + ret = afs_dynroot_populate(sb); + if (ret < 0) + goto error; + } else { sb->s_d_op = &afs_fs_dentry_operations; + } _leave(" = 0"); return 0; @@ -437,7 +448,7 @@ static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); if (as) { - as->net = afs_get_net(params->net); + as->net_ns = get_net(params->net_ns); if (params->dyn_root) as->dyn_root = true; else @@ -450,12 +461,31 @@ static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { afs_put_volume(as->cell, as->volume); - afs_put_cell(as->net, as->cell); - afs_put_net(as->net); + afs_put_cell(afs_net(as->net_ns), as->cell); + put_net(as->net_ns); kfree(as); } } +static void afs_kill_super(struct super_block *sb) +{ + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_net *net = afs_net(as->net_ns); + + if (as->dyn_root) + afs_dynroot_depopulate(sb); + + /* Clear the callback interests (which will do ilookup5) before + * deactivating the superblock. 
+ */ + if (as->volume) + afs_clear_callback_interests(net, as->volume->servers); + kill_anon_super(sb); + if (as->volume) + afs_deactivate_volume(as->volume); + afs_destroy_sbi(as); +} + /* * get an AFS superblock */ @@ -472,12 +502,13 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, _enter(",,%s,%p", dev_name, options); memset(¶ms, 0, sizeof(params)); - params.net = &__afs_net; ret = -EINVAL; if (current->nsproxy->net_ns != &init_net) goto error; - + params.net_ns = current->nsproxy->net_ns; + params.net = afs_net(params.net_ns); + /* parse the options and device name */ if (options) { ret = afs_parse_options(¶ms, options, &dev_name); @@ -563,21 +594,6 @@ error: return ERR_PTR(ret); } -static void afs_kill_super(struct super_block *sb) -{ - struct afs_super_info *as = AFS_FS_S(sb); - - /* Clear the callback interests (which will do ilookup5) before - * deactivating the superblock. - */ - if (as->volume) - afs_clear_callback_interests(as->net, as->volume->servers); - kill_anon_super(sb); - if (as->volume) - afs_deactivate_volume(as->volume); - afs_destroy_sbi(as); -} - /* * Initialise an inode cache slab element prior to any use. 
Note that * afs_alloc_inode() *must* reset anything that could incorrectly leak from one diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 1ed7e2fd2f35..c3b740813fc7 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -23,7 +23,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; bool new_only = false; - u32 tmp, nr_servers; + u32 tmp, nr_servers, vlflags; int i, ret; _enter(""); @@ -55,6 +55,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) new_only = true; } + vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; @@ -64,12 +65,13 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) if (tmp & AFS_VLSF_DONTUSE || (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) continue; - if (tmp & AFS_VLSF_RWVOL) + if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[i] |= AFS_VOL_VTM_RW; + if (vlflags & AFS_VLF_BACKEXISTS) + entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + } if (tmp & AFS_VLSF_ROVOL) entry->fs_mask[i] |= AFS_VOL_VTM_RO; - if (tmp & AFS_VLSF_BACKVOL) - entry->fs_mask[i] |= AFS_VOL_VTM_BAK; if (!entry->fs_mask[i]) continue; @@ -89,15 +91,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (i = 0; i < AFS_MAXTYPES; i++) entry->vid[i] = ntohl(uvldb->volumeId[i]); - tmp = ntohl(uvldb->flags); - if (tmp & AFS_VLF_RWEXISTS) + if (vlflags & AFS_VLF_RWEXISTS) __set_bit(AFS_VLDB_HAS_RW, &entry->flags); - if (tmp & AFS_VLF_ROEXISTS) + if (vlflags & AFS_VLF_ROEXISTS) __set_bit(AFS_VLDB_HAS_RO, &entry->flags); - if (tmp & AFS_VLF_BACKEXISTS) + if (vlflags & AFS_VLF_BACKEXISTS) __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); - if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + if (!(vlflags & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { entry->error = -ENOMEDIUM; __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); } @@ -46,6 +46,8 @@ 
#include "internal.h" +#define KIOCB_KEY 0 + #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 @@ -156,21 +158,17 @@ struct kioctx { unsigned id; }; -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). - */ -#define KIOCB_CANCELLED ((void *) (~0ULL)) +struct fsync_iocb { + struct work_struct work; + struct file *file; + bool datasync; +}; struct aio_kiocb { - struct kiocb common; + union { + struct kiocb rw; + struct fsync_iocb fsync; + }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; @@ -264,9 +262,6 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - - pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); - return 0; } __initcall(aio_setup); @@ -552,42 +547,20 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { - struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw); struct kioctx *ctx = req->ki_ctx; unsigned long flags; - spin_lock_irqsave(&ctx->ctx_lock, flags); - - if (!req->ki_list.next) - list_add(&req->ki_list, &ctx->active_reqs); + if (WARN_ON_ONCE(!list_empty(&req->ki_list))) + return; + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = 
cancel; - spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct aio_kiocb *kiocb) -{ - kiocb_cancel_fn *old, *cancel; - - /* - * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it - * actually has a cancel function, hence the cmpxchg() - */ - - cancel = READ_ONCE(kiocb->ki_cancel); - do { - if (!cancel || cancel == KIOCB_CANCELLED) - return -EINVAL; - - old = cancel; - cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); - } while (cancel != old); - - return cancel(&kiocb->common); -} - /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call @@ -634,9 +607,8 @@ static void free_ioctx_users(struct percpu_ref *ref) while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); - + req->ki_cancel(&req->rw); list_del_init(&req->ki_list); - kiocb_cancel(req); } spin_unlock_irq(&ctx->ctx_lock); @@ -1042,7 +1014,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) goto out_put; percpu_ref_get(&ctx->reqs); - + INIT_LIST_HEAD(&req->ki_list); req->ki_ctx = ctx; return req; out_put: @@ -1050,15 +1022,6 @@ out_put: return NULL; } -static void kiocb_free(struct aio_kiocb *req) -{ - if (req->common.ki_filp) - fput(req->common.ki_filp); - if (req->ki_eventfd != NULL) - eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); -} - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; @@ -1078,8 +1041,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; + if (percpu_ref_tryget_live(&ctx->users)) + ret = ctx; } out: rcu_read_unlock(); @@ -1089,44 +1052,14 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. 
*/ -static void aio_complete(struct kiocb *kiocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb, long res, long res2) { - struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head; unsigned long flags; - if (kiocb->ki_flags & IOCB_WRITE) { - struct file *file = kiocb->ki_filp; - - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(file_inode(file)->i_mode)) - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); - file_end_write(file); - } - - /* - * Special case handling for sync iocbs: - * - events go directly into the iocb for fast handling - * - the sync task with the iocb in its stack holds the single iocb - * ref, no other paths have a way to get another ref - * - the sync task helpfully left a reference to itself in the iocb - */ - BUG_ON(is_sync_kiocb(kiocb)); - - if (iocb->ki_list.next) { - unsigned long flags; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - } - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -1180,11 +1113,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd != NULL) + if (iocb->ki_eventfd) { eventfd_signal(iocb->ki_eventfd, 1); + eventfd_ctx_put(iocb->ki_eventfd); + } - /* everything turned out well, dispose of the aiocb. 
*/ - kiocb_free(iocb); + kmem_cache_free(kiocb_cachep, iocb); /* * We have to order our ring_info tail store above and test @@ -1250,14 +1184,13 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) break; - avail = min(avail, nr - ret); - avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - - ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); - pos = head + AIO_EVENTS_OFFSET; page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); + ev = kmap(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); @@ -1328,10 +1261,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, wait_event_interruptible_hrtimeout(ctx->wait, aio_read_events(ctx, min_nr, nr, event, &ret), until); - - if (!ret && signal_pending(current)) - ret = -EINTR; - return ret; } @@ -1447,6 +1376,74 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } +static void aio_remove_iocb(struct aio_kiocb *iocb) +{ + struct kioctx *ctx = iocb->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} + +static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); + + if (!list_empty_careful(&iocb->ki_list)) + aio_remove_iocb(iocb); + + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. 
+ */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } + + fput(kiocb->ki_filp); + aio_complete(iocb, res, res2); +} + +static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +{ + int ret; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) + return -EBADF; + req->ki_complete = aio_complete_rw; + req->ki_pos = iocb->aio_offset; + req->ki_flags = iocb_flags(req->ki_filp); + if (iocb->aio_flags & IOCB_FLAG_RESFD) + req->ki_flags |= IOCB_EVENTFD; + req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); + if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { + /* + * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then + * aio_reqprio is interpreted as an I/O scheduling + * class and priority. + */ + ret = ioprio_check_cap(iocb->aio_reqprio); + if (ret) { + pr_debug("aio ioprio check cap error: %d\n", ret); + return ret; + } + + req->ki_ioprio = iocb->aio_reqprio; + } else + req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + if (unlikely(ret)) + fput(req->ki_filp); + return ret; +} + static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { @@ -1466,11 +1463,11 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter); } -static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) +static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: - return ret; + break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: @@ -1482,85 +1479,140 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete(req, ret, 0); - return 0; + aio_complete_rw(req, ret, 0); } } static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file 
*file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) - ret = aio_ret(req, call_read_iter(file, req, &iter)); + aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, bool compat) { - struct file *file = req->ki_filp; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; + struct file *file; ssize_t ret; + ret = aio_prep_rw(req, iocb); + if (ret) + return ret; + file = req->ki_filp; + + ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; + goto out_fput; + ret = -EINVAL; if (unlikely(!file->f_op->write_iter)) - return -EINVAL; + goto out_fput; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - return ret; + goto out_fput; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - req->ki_flags |= IOCB_WRITE; - file_start_write(file); - ret = aio_ret(req, call_write_iter(file, req, &iter)); /* - * We release freeze protection in aio_complete(). Fool lockdep - * by telling it the lock got released so that it doesn't - * complain about held lock when we return to userspace. + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * aio_complete_rw(). 
Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + } + req->ki_flags |= IOCB_WRITE; + aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); return ret; } +static void aio_fsync_work(struct work_struct *work) +{ + struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); + int ret; + + ret = vfs_fsync(req->file, req->datasync); + fput(req->file); + aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); +} + +static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +{ + if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || + iocb->aio_rw_flags)) + return -EINVAL; + + req->file = fget(iocb->aio_fildes); + if (unlikely(!req->file)) + return -EBADF; + if (unlikely(!req->file->f_op->fsync)) { + fput(req->file); + return -EINVAL; + } + + req->datasync = datasync; + INIT_WORK(&req->work, aio_fsync_work); + schedule_work(&req->work); + return 0; +} + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, bool compat) + bool compat) { struct aio_kiocb *req; - struct file *file; + struct iocb iocb; ssize_t ret; + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { + if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != 
(size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; @@ -1570,37 +1622,19 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->common.ki_filp = file = fget(iocb->aio_fildes); - if (unlikely(!req->common.ki_filp)) { - ret = -EBADF; - goto out_put_req; - } - req->common.ki_pos = iocb->aio_offset; - req->common.ki_complete = aio_complete; - req->common.ki_flags = iocb_flags(req->common.ki_filp); - req->common.ki_hint = file_write_hint(file); - - if (iocb->aio_flags & IOCB_FLAG_RESFD) { + if (iocb.aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; goto out_put_req; } - - req->common.ki_flags |= IOCB_EVENTFD; - } - - ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); - if (unlikely(ret)) { - pr_debug("EINVAL: aio_rw_flags\n"); - goto out_put_req; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1610,41 +1644,64 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_user_data = iocb.aio_data; - get_file(file); - switch (iocb->aio_lio_opcode) { + switch (iocb.aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->common, iocb, false, compat); + ret = aio_read(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->common, iocb, false, compat); + ret = aio_write(&req->rw, &iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->common, iocb, true, compat); + ret 
= aio_read(&req->rw, &iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->common, iocb, true, compat); + ret = aio_write(&req->rw, &iocb, true, compat); + break; + case IOCB_CMD_FSYNC: + ret = aio_fsync(&req->fsync, &iocb, false); + break; + case IOCB_CMD_FDSYNC: + ret = aio_fsync(&req->fsync, &iocb, true); break; default: - pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; break; } - fput(file); - if (ret && ret != -EIOCBQUEUED) + /* + * If ret is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (ret) goto out_put_req; return 0; out_put_req: put_reqs_available(ctx, 1); percpu_ref_put(&ctx->reqs); - kiocb_free(req); + if (req->ki_eventfd) + eventfd_ctx_put(req->ki_eventfd); + kmem_cache_free(kiocb_cachep, req); return ret; } -static long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. 
+ */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; @@ -1654,39 +1711,25 @@ static long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; - if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) - nr = LONG_MAX/sizeof(*iocbpp); - - if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) - return -EFAULT; - ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - blk_start_plug(&plug); + if (nr > ctx->nr_events) + nr = ctx->nr_events; - /* - * AKPM: should this return a partial result if some of the IOs were - * successfully submitted? - */ - for (i=0; i<nr; i++) { + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; - struct iocb tmp; - - if (unlikely(__get_user(user_iocb, iocbpp + i))) { - ret = -EFAULT; - break; - } - if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { + if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } - ret = io_submit_one(ctx, user_iocb, &tmp, compat); + ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } @@ -1696,59 +1739,44 @@ static long do_io_submit(aio_context_t ctx_id, long nr, return i ? i : ret; } -/* sys_io_submit: - * Queue the nr iocbs pointed to by iocbpp for processing. Returns - * the number of iocbs queued. May return -EINVAL if the aio_context - * specified by ctx_id is invalid, if nr is < 0, if the iocb at - * *iocbpp[0] is not properly initialized, if the operation specified - * is invalid for the file descriptor in the iocb. May fail with - * -EFAULT if any of the data structures point to invalid data. May - * fail with -EBADF if the file descriptor specified in the first - * iocb is invalid. May fail with -EAGAIN if insufficient resources - * are available to queue any iocbs. Will return 0 if nr is 0. Will - * fail with -ENOSYS if not implemented. 
- */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, - struct iocb __user * __user *, iocbpp) -{ - return do_io_submit(ctx_id, nr, iocbpp, 0); -} - #ifdef CONFIG_COMPAT -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, compat_uptr_t __user *, iocbpp) { - compat_uptr_t uptr; - int i; + struct kioctx *ctx; + long ret = 0; + int i = 0; + struct blk_plug plug; - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; + if (unlikely(nr < 0)) + return -EINVAL; + + ctx = lookup_ioctx(ctx_id); + if (unlikely(!ctx)) { + pr_debug("EINVAL: invalid context id\n"); + return -EINVAL; } - return 0; -} -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + if (nr > ctx->nr_events) + nr = ctx->nr_events; -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; + blk_start_plug(&plug); + for (i = 0; i < nr; i++) { + compat_uptr_t user_iocb; - if (unlikely(nr < 0)) - return -EINVAL; + if (unlikely(get_user(user_iocb, iocbpp + i))) { + ret = -EFAULT; + break; + } - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; + ret = io_submit_one(ctx, compat_ptr(user_iocb), true); + if (ret) + break; + } + blk_finish_plug(&plug); - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; + percpu_ref_put(&ctx->users); + return i ? i : ret; } #endif @@ -1756,15 +1784,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, * Finds a given iocb for cancellation. 
*/ static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) { struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); - if (key != KIOCB_KEY) - return NULL; - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_user_iocb == iocb) @@ -1788,25 +1813,24 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, { struct kioctx *ctx; struct aio_kiocb *kiocb; + int ret = -EINVAL; u32 key; - int ret; - ret = get_user(key, &iocb->aio_key); - if (unlikely(ret)) + if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; + if (unlikely(key != KIOCB_KEY)) + return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - - kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb) - ret = kiocb_cancel(kiocb); - else - ret = -EINVAL; - + kiocb = lookup_kiocb(ctx, iocb); + if (kiocb) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { @@ -1861,13 +1885,65 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct timespec __user *, timeout) { struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&ts : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} + +struct __aio_sigset { + const sigset_t __user *sigmask; + size_t sigsetsize; +}; + +SYSCALL_DEFINE6(io_pgetevents, + aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct timespec __user *, timeout, + const struct __aio_sigset __user *, usig) +{ + struct __aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; - if (timeout) { - if (unlikely(get_timespec64(&ts, timeout))) + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(sigset_t)) + return -EINVAL; + if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + return ret; } #ifdef CONFIG_COMPAT @@ -1878,13 +1954,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct compat_timespec __user *, timeout) { struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); + if (!ret && signal_pending(current)) + ret = -EINTR; + return ret; +} - if (timeout) { - if (compat_get_timespec64(&t, timeout)) + +struct __compat_aio_sigset { + compat_sigset_t __user *sigmask; + compat_size_t sigsetsize; +}; + +COMPAT_SYSCALL_DEFINE6(io_pgetevents, + compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout, + const struct __compat_aio_sigset __user *, usig) +{ + struct __compat_aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 t; + int ret; + + if (timeout && compat_get_timespec64(&t, timeout)) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; + + if (ksig.sigmask) { + if (ksig.sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (get_compat_sigset(&ksigmask, ksig.sigmask)) return -EFAULT; + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); + if (signal_pending(current)) { + if (ksig.sigmask) { + current->saved_sigmask = sigsaved; + set_restore_sigmask(); + } + if (!ret) + ret = -ERESTARTNOHAND; + } else { + if (ksig.sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); } - return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); + return ret; } #endif diff --git a/fs/attr.c b/fs/attr.c index 12ffdb6fb63c..e3d53bf12240 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,32 @@ #include <linux/evm.h> #include <linux/ima.h> +static bool chown_ok(const struct inode *inode, kuid_t uid) +{ + if (uid_eq(current_fsuid(), inode->i_uid) && + uid_eq(uid, inode->i_uid)) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return true; + if (uid_eq(inode->i_uid, INVALID_UID) && + ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) + return true; + return false; +} + +static bool chgrp_ok(const struct inode *inode, kgid_t gid) +{ + if (uid_eq(current_fsuid(), inode->i_uid) && + (in_group_p(gid) || gid_eq(gid, inode->i_gid))) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + return true; + if (gid_eq(inode->i_gid, INVALID_GID) && + ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) + return true; + return false; +} + /** * setattr_prepare - check if attribute changes to a dentry are allowed * @dentry: dentry to check @@ -52,17 +78,11 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && - (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && - !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && - (!uid_eq(current_fsuid(), inode->i_uid) || - (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid)) return -EPERM; /* Make sure a caller can chmod. 
*/ @@ -163,14 +183,14 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -207,7 +227,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; - struct timespec now; + struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig new file mode 100644 index 000000000000..eaebcd430cc3 --- /dev/null +++ b/fs/autofs/Kconfig @@ -0,0 +1,31 @@ +config AUTOFS4_FS + tristate "Old Kconfig name for Kernel automounter support" + select AUTOFS_FS + help + This name exists for people to just automatically pick up the + new name of the autofs Kconfig option. All it does is select + the new option name. + + It will go away in a release or two as people have + transitioned to just plain AUTOFS_FS. + +config AUTOFS_FS + tristate "Kernel automounter support (supports v3, v4 and v5)" + default n + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. 
+ + To use the automounter you need the user-space tools from + <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want + to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs4/Makefile b/fs/autofs/Makefile index a811c1f7d9ab..1f85d35ec8b7 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs/Makefile @@ -2,6 +2,6 @@ # Makefile for the linux autofs-filesystem routines. # -obj-$(CONFIG_AUTOFS4_FS) += autofs4.o +obj-$(CONFIG_AUTOFS_FS) += autofs4.o autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs/autofs_i.h index 4737615f0eaa..9400a9f6318a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -9,7 +9,7 @@ /* Internal header file for autofs */ -#include <linux/auto_fs4.h> +#include <linux/auto_fs.h> #include <linux/auto_dev-ioctl.h> #include <linux/kernel.h> @@ -25,7 +25,7 @@ #include <linux/spinlock.h> #include <linux/list.h> #include <linux/completion.h> -#include <asm/current.h> +#include <linux/file.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -122,44 +122,44 @@ struct autofs_sb_info { struct rcu_head rcu; }; -static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { return (struct autofs_sb_info *)(sb->s_fs_info); } -static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) { return (struct autofs_info *)(dentry->d_fsdata); } -/* autofs4_oz_mode(): do we see the man behind the curtain? 
(The +/* autofs_oz_mode(): do we see the man behind the curtain? (The * processes which do manipulations for us in user space sees the raw * filesystem without "magic".) */ -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -struct inode *autofs4_get_inode(struct super_block *, umode_t); -void autofs4_free_ino(struct autofs_info *); +struct inode *autofs_get_inode(struct super_block *, umode_t); +void autofs_free_ino(struct autofs_info *); /* Expiration */ -int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(const struct path *path, int rcu_walk); -int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); -int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); +int is_autofs_dentry(struct dentry *); +int autofs_expire_wait(const struct path *path, int rcu_walk); +int autofs_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct 
autofs_sb_info *sbi, int how); /* Device node initialization */ @@ -168,11 +168,11 @@ void autofs_dev_ioctl_exit(void); /* Operations structures */ -extern const struct inode_operations autofs4_symlink_inode_operations; -extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct file_operations autofs4_dir_operations; -extern const struct file_operations autofs4_root_operations; -extern const struct dentry_operations autofs4_dentry_operations; +extern const struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_dir_inode_operations; +extern const struct file_operations autofs_dir_operations; +extern const struct file_operations autofs_root_operations; +extern const struct dentry_operations autofs_dentry_operations; /* VFS automount flags management functions */ static inline void __managed_dentry_set_managed(struct dentry *dentry) @@ -201,9 +201,9 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); -void autofs4_clean_ino(struct autofs_info *); +int autofs_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs_new_ino(struct autofs_sb_info *); +void autofs_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { @@ -218,25 +218,25 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, +int autofs_wait(struct autofs_sb_info *, const struct path *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); -void autofs4_catatonic_mode(struct autofs_sb_info *); +int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); +void autofs_catatonic_mode(struct autofs_sb_info *); -static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +static 
inline u32 autofs_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); } -static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) { return d_inode(sbi->sb->s_root)->i_ino; } -static inline void __autofs4_add_expiring(struct dentry *dentry) +static inline void __autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { if (list_empty(&ino->expiring)) @@ -244,10 +244,10 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_add_expiring(struct dentry *dentry) +static inline void autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -257,10 +257,10 @@ static inline void autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_del_expiring(struct dentry *dentry) +static inline void autofs_del_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -270,4 +270,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -void autofs4_kill_sb(struct super_block *); +void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 26f6b4f41ce6..86eafda4a652 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -7,23 +7,10 @@ * 
option, any later version, incorporated herein by reference. */ -#include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/miscdevice.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/namei.h> -#include <linux/fcntl.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/sched.h> -#include <linux/cred.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> -#include <linux/dcache.h> -#include <linux/uaccess.h> -#include <linux/slab.h> #include "autofs_i.h" @@ -148,6 +135,15 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) cmd); goto out; } + } else { + unsigned int inr = _IOC_NR(cmd); + + if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || + inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || + inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { + err = -EINVAL; + goto out; + } } err = 0; @@ -166,7 +162,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) if (f) { inode = file_inode(f); - sbi = autofs4_sbi(inode->i_sb); + sbi = autofs_sbi(inode->i_sb); } return sbi; } @@ -236,7 +232,7 @@ static int test_by_dev(const struct path *path, void *p) static int test_by_type(const struct path *path, void *p) { - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } @@ -284,7 +280,8 @@ static int autofs_dev_ioctl_openmount(struct file *fp, dev_t devid; int err, fd; - /* param->path has already been checked */ + /* param->path has been checked in validate_dev_ioctl() */ + if (!param->openmount.devid) return -EINVAL; @@ -324,7 +321,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; - return autofs4_wait_release(sbi, token, 0); + return autofs_wait_release(sbi, token, 0); } /* @@ -340,7 +337,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, token = (autofs_wqt_t) param->fail.token; status = 
param->fail.status < 0 ? param->fail.status : -ENOENT; - return autofs4_wait_release(sbi, token, status); + return autofs_wait_release(sbi, token, status); } /* @@ -412,7 +409,7 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; } @@ -446,10 +443,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, dev_t devid; int err = -ENOENT; - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } + /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; @@ -459,10 +453,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(&path, 0); + autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -489,7 +483,7 @@ static int autofs_dev_ioctl_expire(struct file *fp, how = param->expire.how; mnt = fp->f_path.mnt; - return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); + return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ @@ -534,10 +528,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, unsigned int devid, magic; int err = -ENOENT; - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } + /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; @@ -686,7 +677,7 @@ static int _autofs_dev_ioctl(unsigned int command, * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. 
*/ - if (!autofs4_oz_mode(sbi) && + if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); diff --git a/fs/autofs4/expire.c b/fs/autofs/expire.c index 57725d4a8c59..b332d3f6e730 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs/expire.c @@ -13,10 +13,10 @@ static unsigned long now; /* Check if a dentry can be expired */ -static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) +static inline int autofs_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); /* dentry in the process of being deleted */ if (ino == NULL) @@ -31,7 +31,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -44,8 +44,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!follow_down_one(&path)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); ino->last_used = jiffies; goto done; } @@ -74,7 +74,7 @@ done: static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct 
list_head *next; struct dentry *q; @@ -121,7 +121,7 @@ cont: static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; @@ -184,10 +184,10 @@ again: * The tree is not busy iff no mountpoints are busy and there are no * autofs submounts. */ -static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { pr_debug("top %p %pd\n", top, top); @@ -195,14 +195,14 @@ static int autofs4_direct_busy(struct vfsmount *mnt, if (!may_umount_tree(mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; @@ -212,12 +212,12 @@ static int autofs4_direct_busy(struct vfsmount *mnt, * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ -static int autofs4_tree_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { - struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; pr_debug("top %p %pd\n", top, top); @@ -237,13 +237,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. 
*/ if (d_mountpoint(p)) { - if (autofs4_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p)) { top_ino->last_used = jiffies; dput(p); return 1; } } else { - struct autofs_info *ino = autofs4_dentry_ino(p); + struct autofs_info *ino = autofs_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); /* allow for dget above and top is already dgot */ @@ -261,16 +261,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt, } /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; } -static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, - struct dentry *parent, - unsigned long timeout, - int do_now) +static struct dentry *autofs_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) { struct dentry *p; @@ -282,11 +282,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p)) continue; /* Can we expire this guy */ - if (autofs4_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, do_now)) return p; } } @@ -294,10 +294,10 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -310,9 +310,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); - ino = 
autofs4_dentry_ino(root); + ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); @@ -321,7 +321,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -350,7 +350,7 @@ static struct dentry *should_expire(struct dentry *dentry, { int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; /* No point expiring a pending mount */ @@ -367,11 +367,11 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry)) return NULL; /* Can we expire this guy */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -382,7 +382,7 @@ static struct dentry *should_expire(struct dentry *dentry, * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. 
*/ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -397,7 +397,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves @@ -411,7 +411,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, do_now); if (expired) { if (expired == dentry) dput(dentry); @@ -427,10 +427,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -450,7 +450,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, int flags = how; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; @@ -462,7 +462,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, continue; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(expired); + ino = autofs_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); @@ -498,11 +498,11 @@ found: return expired; } -int autofs4_expire_wait(const struct path *path, int rcu_walk) +int autofs_expire_wait(const struct path *path, int rcu_walk) { struct dentry *dentry = path->dentry; - 
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; int state; @@ -529,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, path, NFY_NONE); + status = autofs_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -545,10 +545,10 @@ retry: } /* Perform an expiry operation */ -int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) +int autofs_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; @@ -560,7 +560,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + dentry = autofs_expire_indirect(sb, mnt, sbi, 0); if (!dentry) return -EAGAIN; @@ -573,7 +573,7 @@ int autofs4_expire_run(struct super_block *sb, ret = -EFAULT; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); @@ -583,25 +583,25 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, when); + dentry = 
autofs_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, when); if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, &path, NFY_EXPIRE); + ret = autofs_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ @@ -619,7 +619,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more to be done. */ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, +int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { int do_now = 0; @@ -627,6 +627,5 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - return autofs4_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, do_now); } - diff --git a/fs/autofs4/init.c b/fs/autofs/init.c index 8cf0e63389ae..79ae07d9592f 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs/init.c @@ -13,18 +13,19 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags, data, autofs4_fill_super); + return mount_nodev(fs_type, flags, data, autofs_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", .mount = autofs_mount, - .kill_sb = autofs4_kill_sb, + .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); +MODULE_ALIAS("autofs"); -static int __init init_autofs4_fs(void) +static int __init init_autofs_fs(void) { int err; @@ -37,12 +38,12 @@ static int __init 
init_autofs4_fs(void) return err; } -static void __exit exit_autofs4_fs(void) +static void __exit exit_autofs_fs(void) { autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } -module_init(init_autofs4_fs) -module_exit(exit_autofs4_fs) +module_init(init_autofs_fs) +module_exit(exit_autofs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs/inode.c index 09e7d68dff02..b51980fc274e 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs/inode.c @@ -7,18 +7,14 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/file.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/parser.h> -#include <linux/bitops.h> #include <linux/magic.h> + #include "autofs_i.h" -#include <linux/module.h> -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; @@ -32,21 +28,21 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) return ino; } -void autofs4_clean_ino(struct autofs_info *ino) +void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } -void autofs4_free_ino(struct autofs_info *ino) +void autofs_free_ino(struct autofs_info *ino) { kfree(ino); } -void autofs4_kill_sb(struct super_block *sb) +void autofs_kill_sb(struct super_block *sb) { - struct autofs_sb_info *sbi = autofs4_sbi(sb); + struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock @@ -56,7 +52,7 @@ void autofs4_kill_sb(struct super_block *sb) */ if (sbi) { /* Free wait queues, close pipe */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } @@ -66,9 +62,9 @@ void autofs4_kill_sb(struct super_block *sb) kfree_rcu(sbi, rcu); } -static int autofs4_show_options(struct seq_file *m, struct dentry *root) +static int 
autofs_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) @@ -101,16 +97,16 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void autofs4_evict_inode(struct inode *inode) +static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } -static const struct super_operations autofs4_sops = { +static const struct super_operations autofs_sops = { .statfs = simple_statfs, - .show_options = autofs4_show_options, - .evict_inode = autofs4_evict_inode, + .show_options = autofs_show_options, + .evict_inode = autofs_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -206,7 +202,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, return (*pipefd < 0); } -int autofs4_fill_super(struct super_block *s, void *data, int silent) +int autofs_fill_super(struct super_block *s, void *data, int silent) { struct inode *root_inode; struct dentry *root; @@ -246,19 +242,19 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs4_sops; - s->s_d_op = &autofs4_dentry_operations; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. 
*/ - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) { ret = -ENOMEM; goto fail_free; } - root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) goto fail_ino; @@ -305,8 +301,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); - root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = &autofs4_dir_inode_operations; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); @@ -340,14 +336,14 @@ fail_dput: dput(root); goto fail_free; fail_ino: - autofs4_free_ino(ino); + autofs_free_ino(ino); fail_free: kfree(sbi); s->s_fs_info = NULL; return ret; } -struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) +struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -364,10 +360,10 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) if (S_ISDIR(mode)) { set_nlink(inode, 2); - inode->i_op = &autofs4_dir_inode_operations; - inode->i_fop = &autofs4_dir_operations; + inode->i_op = &autofs_dir_inode_operations; + inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { - inode->i_op = &autofs4_symlink_inode_operations; + inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); diff --git a/fs/autofs4/root.c b/fs/autofs/root.c index b12e37f27530..a3d414150578 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs/root.c @@ -9,72 +9,66 @@ */ #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/stat.h> -#include <linux/slab.h> -#include <linux/param.h> -#include <linux/time.h> #include <linux/compat.h> -#include <linux/mutex.h> #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *, struct dentry *, const 
char *); -static int autofs4_dir_unlink(struct inode *, struct dentry *); -static int autofs4_dir_rmdir(struct inode *, struct dentry *); -static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); -static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); +static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_unlink(struct inode *, struct dentry *); +static int autofs_dir_rmdir(struct inode *, struct dentry *); +static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *, - unsigned int, unsigned long); +static long autofs_root_compat_ioctl(struct file *, + unsigned int, unsigned long); #endif -static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *, - struct dentry *, unsigned int); -static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(const struct path *, bool); -static void autofs4_dentry_release(struct dentry *); - -const struct file_operations autofs4_root_operations = { +static int autofs_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs_lookup(struct inode *, + struct dentry *, unsigned int); +static struct vfsmount *autofs_d_automount(struct path *); +static int autofs_d_manage(const struct path *, bool); +static void autofs_dentry_release(struct dentry *); + +const struct file_operations autofs_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, - .unlocked_ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs_root_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = autofs4_root_compat_ioctl, + .compat_ioctl = autofs_root_compat_ioctl, #endif }; -const struct file_operations 
autofs4_dir_operations = { - .open = autofs4_dir_open, +const struct file_operations autofs_dir_operations = { + .open = autofs_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct inode_operations autofs_dir_inode_operations = { + .lookup = autofs_lookup, + .unlink = autofs_dir_unlink, + .symlink = autofs_dir_symlink, + .mkdir = autofs_dir_mkdir, + .rmdir = autofs_dir_rmdir, }; -const struct dentry_operations autofs4_dentry_operations = { - .d_automount = autofs4_d_automount, - .d_manage = autofs4_d_manage, - .d_release = autofs4_dentry_release, +const struct dentry_operations autofs_dentry_operations = { + .d_automount = autofs_d_automount, + .d_manage = autofs_d_manage, + .d_release = autofs_dentry_release, }; -static void autofs4_add_active(struct dentry *dentry) +static void autofs_add_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -86,12 +80,12 @@ static void autofs4_add_active(struct dentry *dentry) } } -static void autofs4_del_active(struct dentry *dentry) +static void autofs_del_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -103,14 +97,14 @@ static void autofs4_del_active(struct dentry *dentry) } } -static int autofs4_dir_open(struct 
inode *inode, struct file *file) +static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) goto out; /* @@ -133,10 +127,10 @@ out: return dcache_dir_open(inode, file); } -static void autofs4_dentry_release(struct dentry *de) +static void autofs_dentry_release(struct dentry *de) { - struct autofs_info *ino = autofs4_dentry_ino(de); - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + struct autofs_info *ino = autofs_dentry_ino(de); + struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); pr_debug("releasing %p\n", de); @@ -152,12 +146,12 @@ static void autofs4_dentry_release(struct dentry *de) spin_unlock(&sbi->lookup_lock); } - autofs4_free_ino(ino); + autofs_free_ino(ino); } -static struct dentry *autofs4_lookup_active(struct dentry *dentry) +static struct dentry *autofs_lookup_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -209,10 +203,10 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, - bool rcu_walk) +static struct dentry *autofs_lookup_expiring(struct dentry *dentry, + bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -269,17 +263,17 @@ next: return NULL; } -static int autofs4_mount_wait(const struct path *path, bool rcu_walk) +static int autofs_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi 
= autofs4_sbi(path->dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; pr_debug("waiting for mount name=%pd\n", path->dentry); - status = autofs4_wait(sbi, path, NFY_MOUNT); + status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; @@ -291,11 +285,11 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) struct dentry *dentry = path->dentry; struct dentry *expiring; - expiring = autofs4_lookup_expiring(dentry, rcu_walk); + expiring = autofs_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(path, rcu_walk); + return autofs_expire_wait(path, rcu_walk); else { const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* @@ -303,17 +297,17 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. 
*/ - autofs4_expire_wait(&this, 0); - autofs4_del_expiring(expiring); + autofs_expire_wait(&this, 0); + autofs_del_expiring(expiring); dput(expiring); } return 0; } -static struct dentry *autofs4_mountpoint_changed(struct path *path) +static struct dentry *autofs_mountpoint_changed(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); /* * If this is an indirect mount the dentry could have gone away @@ -327,7 +321,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - ino = autofs4_dentry_ino(new); + ino = autofs_dentry_ino(new); ino->last_used = jiffies; dput(path->dentry); path->dentry = new; @@ -335,17 +329,17 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) return path->dentry; } -static struct vfsmount *autofs4_d_automount(struct path *path) +static struct vfsmount *autofs_d_automount(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. 
*/ - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) return NULL; /* @@ -364,7 +358,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -405,7 +399,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -416,24 +410,24 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_unlock(&sbi->fs_lock); done: /* Mount succeeded, check if we ended up with a new dentry */ - dentry = autofs4_mountpoint_changed(path); + dentry = autofs_mountpoint_changed(path); if (!dentry) return ERR_PTR(-ENOENT); return NULL; } -static int autofs4_d_manage(const struct path *path, bool rcu_walk) +static int autofs_d_manage(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ - if (autofs4_oz_mode(sbi)) { + if (autofs_oz_mode(sbi)) { if (!path_is_mountpoint(path)) return -EISDIR; return 0; @@ -447,7 +441,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. 
*/ - status = autofs4_mount_wait(path, rcu_walk); + status = autofs_mount_wait(path, rcu_walk); if (status) return status; @@ -500,8 +494,8 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) +static struct dentry *autofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -513,13 +507,13 @@ static struct dentry *autofs4_lookup(struct inode *dir, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sbi = autofs4_sbi(dir->i_sb); + sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + autofs_oz_mode(sbi)); - active = autofs4_lookup_active(dentry); + active = autofs_lookup_active(dentry); if (active) return active; else { @@ -529,7 +523,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, * can return fail immediately. The daemon however does need * to create directories within the file system. 
*/ - if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ @@ -537,24 +531,24 @@ static struct dentry *autofs4_lookup(struct inode *dir, autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); dentry->d_fsdata = ino; ino->dentry = dentry; - autofs4_add_active(dentry); + autofs_add_active(dentry); } return NULL; } -static int autofs4_dir_symlink(struct inode *dir, +static int autofs_dir_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; size_t size = strlen(symname); @@ -562,14 +556,14 @@ static int autofs4_dir_symlink(struct inode *dir, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); cp = kmalloc(size + 1, GFP_KERNEL); if (!cp) @@ -577,7 +571,7 @@ static int autofs4_dir_symlink(struct inode *dir, strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); return -ENOMEM; @@ -588,7 +582,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); @@ -610,20 +604,20 @@ static int autofs4_dir_symlink(struct inode *dir, * If a process is blocked on the dentry waiting for the 
expire to finish, * it will invalidate the dentry and try to mount with a new one. * - * Also see autofs4_dir_rmdir().. + * Also see autofs_dir_rmdir().. */ -static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } @@ -635,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = current_time(dir); spin_lock(&sbi->lookup_lock); - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -692,15 +686,15 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) managed_dentry_set_managed(parent); } -static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; spin_lock(&sbi->lookup_lock); @@ -708,7 +702,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&sbi->lookup_lock); return 
-ENOTEMPTY; } - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -716,7 +710,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) autofs_clear_leaf_automount_flags(dentry); if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_dec(&p_ino->count); } @@ -730,26 +724,26 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); + inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); @@ -759,7 +753,7 @@ static int autofs4_dir_mkdir(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); @@ -770,7 +764,7 @@ static int autofs4_dir_mkdir(struct inode *dir, /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT -static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, 
compat_ulong_t __user *p) { unsigned long ntimeout; @@ -795,7 +789,7 @@ error: } #endif -static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { unsigned long ntimeout; @@ -820,14 +814,14 @@ error: } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, +static inline int autofs_get_protover(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, +static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->sub_version, p); @@ -836,7 +830,7 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, /* * Tells the daemon whether it can umount the autofs mount. */ -static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; @@ -850,14 +844,14 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) return status; } -/* Identify autofs4_dentries - this is so we can tell if there's +/* Identify autofs_dentries - this is so we can tell if there's * an extra dentry refcount or not. 
We only hold a refcount on the * dentry if its non-negative (ie, d_inode != NULL) */ -int is_autofs4_dentry(struct dentry *dentry) +int is_autofs_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && - dentry->d_op == &autofs4_dentry_operations && + dentry->d_op == &autofs_dentry_operations && dentry->d_fsdata != NULL; } @@ -865,10 +859,10 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, +static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *p = (void __user *)arg; pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", @@ -878,64 +872,63 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs4_get_protover(sbi, p); + return autofs_get_protover(sbi, p); case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ - return autofs4_get_protosubver(sbi, p); + return 
autofs_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: - return autofs4_get_set_timeout(sbi, p); + return autofs_get_set_timeout(sbi, p); #ifdef CONFIG_COMPAT case AUTOFS_IOC_SETTIMEOUT32: - return autofs4_compat_get_set_timeout(sbi, p); + return autofs_compat_get_set_timeout(sbi, p); #endif case AUTOFS_IOC_ASKUMOUNT: - return autofs4_ask_umount(filp->f_path.mnt, p); + return autofs_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: return -EINVAL; } } -static long autofs4_root_ioctl(struct file *filp, +static long autofs_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *filp, +static long autofs_root_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); else - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, (unsigned long) compat_ptr(arg)); return ret; diff --git a/fs/autofs4/symlink.c b/fs/autofs/symlink.c index ab0b4285a202..aad3902c0cc1 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs/symlink.c @@ -8,22 +8,22 @@ #include "autofs_i.h" -static const char *autofs4_get_link(struct dentry *dentry, - 
struct inode *inode, - struct delayed_call *done) +static const char *autofs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct autofs_sb_info *sbi; struct autofs_info *ino; if (!dentry) return ERR_PTR(-ECHILD); - sbi = autofs4_sbi(dentry->d_sb); - ino = autofs4_dentry_ino(dentry); - if (ino && !autofs4_oz_mode(sbi)) + sbi = autofs_sbi(dentry->d_sb); + ino = autofs_dentry_ino(dentry); + if (ino && !autofs_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; } -const struct inode_operations autofs4_symlink_inode_operations = { - .get_link = autofs4_get_link +const struct inode_operations autofs_symlink_inode_operations = { + .get_link = autofs_get_link }; diff --git a/fs/autofs4/waitq.c b/fs/autofs/waitq.c index be9c3dc048ab..f6385c6ef0a5 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs/waitq.c @@ -7,19 +7,15 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/signal.h> #include <linux/sched/signal.h> -#include <linux/file.h> #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ -static autofs_wqt_t autofs4_next_wait_queue = 1; +static autofs_wqt_t autofs_next_wait_queue = 1; -void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -49,8 +45,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct autofs_sb_info *sbi, - struct file *file, const void *addr, int bytes) +static int autofs_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; @@ -82,7 +78,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return bytes == 0 ? 0 : wr < 0 ? 
wr : -EIO; } -static void autofs4_notify_daemon(struct autofs_sb_info *sbi, +static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { @@ -167,23 +163,23 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ - autofs4_wait_release(sbi, wq->wait_queue_token, ret); + autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); break; } fput(pipe); } -static int autofs4_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) +static int autofs_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char *name) { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; @@ -193,7 +189,7 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, unsigned seq; rename_retry: - buf = *name; + buf = name; len = 0; seq = read_seqbegin(&rename_lock); @@ -228,7 +224,7 @@ rename_retry: } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) +autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; @@ -263,7 +259,7 @@ static int validate_request(struct autofs_wait_queue **wait, return -ENOENT; /* Wait in progress, continue; */ - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -272,7 +268,7 @@ static int validate_request(struct autofs_wait_queue **wait, *wait = NULL; /* If we don't yet have any info this is a new request */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) return 1; @@ -297,7 +293,7 @@ static int validate_request(struct autofs_wait_queue **wait, if (sbi->catatonic) return -ENOENT; - wq = autofs4_find_wait(sbi, qstr); + wq = 
autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -351,7 +347,7 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, +int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; @@ -399,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { - qstr.len = autofs4_getpath(sbi, dentry, &name); + qstr.len = autofs_getpath(sbi, dentry, name); if (!qstr.len) { kfree(name); return -ENOENT; @@ -430,15 +426,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; - if (++autofs4_next_wait_queue == 0) - autofs4_next_wait_queue = 1; + wq->wait_queue_token = autofs_next_wait_queue; + if (++autofs_next_wait_queue == 0) + autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); - wq->dev = autofs4_get_dev(sbi); - wq->ino = autofs4_get_ino(sbi); + wq->dev = autofs_get_dev(sbi); + wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; @@ -467,9 +463,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, wq->name.name, notify); /* - * autofs4_notify_daemon() may block; it will unlock ->wq_mutex + * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ - autofs4_notify_daemon(sbi, wq, type); + autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", @@ -500,12 +496,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *de = NULL; /* direct mount or browsable map */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) { /* If not lookup actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) - ino = 
autofs4_dentry_ino(de); + ino = autofs_dentry_ino(de); } /* Set mount requester */ @@ -530,7 +526,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs_wait_release(struct autofs_sb_info *sbi, + autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig deleted file mode 100644 index 44727bf18297..000000000000 --- a/fs/autofs4/Kconfig +++ /dev/null @@ -1,20 +0,0 @@ -config AUTOFS4_FS - tristate "Kernel automounter version 4 support (also supports v3)" - help - The automounter is a tool to automatically mount remote file systems - on demand. This implementation is partially kernel-based to reduce - overhead in the already-mounted case; this is unlike the BSD - automounter (amd), which is a pure user space daemon. - - To use the automounter you need the user-space tools from - <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also - want to answer Y to "NFS file system support", below. - - To compile this support as a module, choose M here: the module will be - called autofs4. You will need to add "alias autofs autofs4" to your - modules configuration file. - - If you are not a part of a fairly large, distributed network or - don't have a laptop which needs to dynamically reconfigure to the - local network, you probably do not need an automounter, and can say - N here. 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 213b51dbbb60..125e8bbd22a2 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -126,7 +126,7 @@ static int bad_inode_fiemap(struct inode *inode, return -EIO; } -static int bad_inode_update_time(struct inode *inode, struct timespec *time, +static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { return -EIO; diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog index 16f2dfe8c2f7..aff7eec8f327 100644 --- a/fs/befs/ChangeLog +++ b/fs/befs/ChangeLog @@ -389,7 +389,7 @@ Version 0.4 (2001-10-28) (fs/nls/Config.in) * Added Configure.help entries for CONFIG_BEFS_FS and CONFIG_DEBUG_BEFS - (Documentation/Configure.help) + (currently at fs/befs/Kconfig) 2001-08-?? ========== diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index af2832aaeec5..4700b4534439 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -198,23 +198,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); - d_add(dentry, NULL); - return ERR_PTR(-ENOENT); - + inode = NULL; } else if (ret != BEFS_OK || offset == 0) { befs_error(sb, "<--- %s Error", __func__); - return ERR_PTR(-ENODATA); + inode = ERR_PTR(-ENODATA); + } else { + inode = befs_iget(dir->i_sb, (ino_t) offset); } - - inode = befs_iget(dir->i_sb, (ino_t) offset); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - d_add(dentry, inode); - befs_debug(sb, "<--- %s", __func__); - return NULL; + return d_splice_alias(inode, dentry); } static int diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index ee832ca5f734..f32f21c3bbc7 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -21,10 +21,9 @@ #define dprintf(x...) 
#endif -static int bfs_add_entry(struct inode *dir, const unsigned char *name, - int namelen, int ino); +static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino); static struct buffer_head *bfs_find_entry(struct inode *dir, - const unsigned char *name, int namelen, + const struct qstr *child, struct bfs_dirent **res_dir); static int bfs_readdir(struct file *f, struct dir_context *ctx) @@ -111,8 +110,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, mark_inode_dirty(inode); bfs_dump_imap("create", s); - err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, - inode->i_ino); + err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino); if (err) { inode_dec_link_count(inode); mutex_unlock(&info->bfs_lock); @@ -136,19 +134,14 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); mutex_lock(&info->bfs_lock); - bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + bh = bfs_find_entry(dir, &dentry->d_name, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - mutex_unlock(&info->bfs_lock); - return ERR_CAST(inode); - } } mutex_unlock(&info->bfs_lock); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int bfs_link(struct dentry *old, struct inode *dir, @@ -159,8 +152,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, int err; mutex_lock(&info->bfs_lock); - err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, - inode->i_ino); + err = bfs_add_entry(dir, &new->d_name, inode->i_ino); if (err) { mutex_unlock(&info->bfs_lock); return err; @@ -183,7 +175,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) struct bfs_sb_info *info = BFS_SB(inode->i_sb); mutex_lock(&info->bfs_lock); - bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); + bh = bfs_find_entry(dir, 
&dentry->d_name, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; @@ -228,27 +220,21 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, info = BFS_SB(old_inode->i_sb); mutex_lock(&info->bfs_lock); - old_bh = bfs_find_entry(old_dir, - old_dentry->d_name.name, - old_dentry->d_name.len, &old_de); + old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de); if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) goto end_rename; error = -EPERM; new_inode = d_inode(new_dentry); - new_bh = bfs_find_entry(new_dir, - new_dentry->d_name.name, - new_dentry->d_name.len, &new_de); + new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh && !new_inode) { brelse(new_bh); new_bh = NULL; } if (!new_bh) { - error = bfs_add_entry(new_dir, - new_dentry->d_name.name, - new_dentry->d_name.len, + error = bfs_add_entry(new_dir, &new_dentry->d_name, old_inode->i_ino); if (error) goto end_rename; @@ -278,9 +264,10 @@ const struct inode_operations bfs_dir_inops = { .rename = bfs_rename, }; -static int bfs_add_entry(struct inode *dir, const unsigned char *name, - int namelen, int ino) +static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) { + const unsigned char *name = child->name; + int namelen = child->len; struct buffer_head *bh; struct bfs_dirent *de; int block, sblock, eblock, off, pos; @@ -332,12 +319,14 @@ static inline int bfs_namecmp(int len, const unsigned char *name, } static struct buffer_head *bfs_find_entry(struct inode *dir, - const unsigned char *name, int namelen, + const struct qstr *child, struct bfs_dirent **res_dir) { unsigned long block = 0, offset = 0; struct buffer_head *bh = NULL; struct bfs_dirent *de; + const unsigned char *name = child->name; + int namelen = child->len; *res_dir = NULL; if (namelen > BFS_NAMELEN) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4ad6f669fe34..816cc921cf36 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1259,9 
+1259,8 @@ static int load_elf_library(struct file *file) goto out_free_ph; } - len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + - ELF_MIN_ALIGN - 1); - bss = eppnt->p_memsz + eppnt->p_vaddr; + len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr); + bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr); if (bss > len) { error = vm_brk(len, bss - len); if (error) @@ -1621,8 +1620,8 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); - data = vmalloc(size); - if (!data) + data = kvmalloc(size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; start_end_ofs = data + 2; @@ -1639,7 +1638,7 @@ static int fill_files_note(struct memelfnote *note) filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { - vfree(data); + kvfree(data); size = size * 5 / 4; goto alloc; } @@ -1932,7 +1931,7 @@ static void free_note_info(struct elf_note_info *info) kfree(t); } kfree(info->psinfo.data); - vfree(info->files.data); + kvfree(info->files.data); } #else @@ -2010,7 +2009,7 @@ static int elf_note_info_init(struct elf_note_info *info) INIT_LIST_HEAD(&info->thread_list); /* Allocate space for ELF notes */ - info->notes = kmalloc(8 * sizeof(struct memelfnote), GFP_KERNEL); + info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL); if (!info->notes) return 0; info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); @@ -2148,7 +2147,7 @@ static void free_note_info(struct elf_note_info *info) /* Free data possibly allocated by fill_files_note(): */ if (info->notes_files) - vfree(info->notes_files->data); + kvfree(info->notes_files->data); kfree(info->prstatus); kfree(info->psinfo); @@ -2294,8 +2293,9 @@ static int elf_core_dump(struct coredump_params *cprm) if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) goto end_coredump; - vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); - if (!vma_filesz) 
+ vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), + GFP_KERNEL); + if (ZERO_OR_NULL_PTR(vma_filesz)) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2402,7 +2402,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - vfree(vma_filesz); + kvfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d90993adeffa..b53bb3729ac1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1600,7 +1600,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) goto cleanup; - notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL); + notes = kmalloc_array(NUM_NOTES, sizeof(struct memelfnote), + GFP_KERNEL); if (!notes) goto cleanup; fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a41b48f82a70..4b5fff31ef27 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -4,7 +4,7 @@ * Copyright (C) 1997 Richard Günther * * binfmt_misc detects binaries via a magic or filename extension and invokes - * a specified wrapper. See Documentation/binfmt_misc.txt for more details. + * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -387,8 +387,13 @@ static Node *create_entry(const char __user *buffer, size_t count) s = strchr(p, del); if (!s) goto einval; - *s++ = '\0'; - e->offset = simple_strtoul(p, &p, 10); + *s = '\0'; + if (p != s) { + int r = kstrtoint(p, 10, &e->offset); + if (r != 0 || e->offset < 0) + goto einval; + } + p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); @@ -428,7 +433,8 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; - if (e->size + e->offset > BINPRM_BUF_SIZE) + if (e->size > BINPRM_BUF_SIZE || + BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 7ec920e27065..0dd87aaeb39a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -205,7 +205,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { - vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL); + vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); if (!vecs) return -ENOMEM; } @@ -216,6 +217,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; + bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -272,7 +274,7 @@ struct blkdev_dio { struct bio bio; }; -static struct bio_set *blkdev_dio_pool __read_mostly; +static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { @@ -334,7 +336,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio = bio_alloc_bioset(GFP_KERNEL, 
nr_pages, &blkdev_dio_pool); bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); @@ -355,6 +357,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { @@ -432,10 +435,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); - if (!blkdev_dio_pool) - return -ENOMEM; - return 0; + return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); @@ -1322,27 +1322,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. + * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. 
*/ -void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, + bool verbose) { loff_t disk_size, bdev_size; disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); + if (verbose) { + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + disk->disk_name, bdev_size, disk_size); + } i_size_write(bdev->bd_inode, disk_size); if (bdev_size > disk_size) flush_disk(bdev, false); } } -EXPORT_SYMBOL(check_disk_size_change); /** * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back @@ -1364,7 +1367,7 @@ int revalidate_disk(struct gendisk *disk) return ret; mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, ret == 0); bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 234bae55b85d..7e075343daa5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -19,17 +19,17 @@ * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. 
*/ -#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 -#define BTRFS_INODE_ORPHAN_META_RESERVED 1 -#define BTRFS_INODE_DUMMY 2 -#define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 -#define BTRFS_INODE_NEEDS_FULL_SYNC 6 -#define BTRFS_INODE_COPY_EVERYTHING 7 -#define BTRFS_INODE_IN_DELALLOC_LIST 8 -#define BTRFS_INODE_READDIO_NEED_LOCK 9 -#define BTRFS_INODE_HAS_PROPS 10 +enum { + BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_DUMMY, + BTRFS_INODE_IN_DEFRAG, + BTRFS_INODE_HAS_ASYNC_EXTENT, + BTRFS_INODE_NEEDS_FULL_SYNC, + BTRFS_INODE_COPY_EVERYTHING, + BTRFS_INODE_IN_DELALLOC_LIST, + BTRFS_INODE_READDIO_NEED_LOCK, + BTRFS_INODE_HAS_PROPS, +}; /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index dc062b195c46..a3fdb4fe967d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1603,8 +1603,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; - block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + - sizeof(*block_ctx->pagev)) * + block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev), num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) return -ENOMEM; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1061575a7d25..d3e447b45bf7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace, btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: - /* - * Make sure counter is updated before we wake up waiters. 
- */ - smp_mb(); - if (waitqueue_active(ws_wait)) - wake_up(ws_wait); + cond_wake_up(ws_wait); } static void free_workspace(int type, struct list_head *ws) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index cc605f7b23fb..ddda9b80bf20 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,6 +6,8 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H +#include <linux/sizes.h> + /* * We want to make sure that amount of RAM required to uncompress an extent is * reasonable, so we limit the total size in ram of a compressed extent to diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8c68961925b1..4bc326df472e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, no_skips = 1; t = path->nodes[i]; - if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + if (i >= lowest_unlock && i > skip_level) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; if (write_lock_level && @@ -2432,7 +2432,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); btrfs_set_path_blocking(p); - free_extent_buffer(tmp); if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); @@ -2446,7 +2445,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * and give up so that our caller doesn't loop forever * on our EAGAINs. 
*/ - if (!btrfs_buffer_uptodate(tmp, 0, 0)) + if (!extent_buffer_uptodate(tmp)) ret = -EIO; free_extent_buffer(tmp); } else { @@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, return 0; } +static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, + struct btrfs_path *p, + int write_lock_level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *b; + int root_lock; + int level = 0; + + /* We try very hard to do read locks on the root */ + root_lock = BTRFS_READ_LOCK; + + if (p->search_commit_root) { + /* The commit roots are read only so we always do read locks */ + if (p->need_commit_sem) + down_read(&fs_info->commit_root_sem); + b = root->commit_root; + extent_buffer_get(b); + level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&fs_info->commit_root_sem); + /* + * Ensure that all callers have set skip_locking when + * p->search_commit_root = 1. + */ + ASSERT(p->skip_locking == 1); + + goto out; + } + + if (p->skip_locking) { + b = btrfs_root_node(root); + level = btrfs_header_level(b); + goto out; + } + + /* + * If the level is set to maximum, we can skip trying to get the read + * lock. + */ + if (write_lock_level < BTRFS_MAX_LEVEL) { + /* + * We don't know the level of the root node until we actually + * have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level > write_lock_level) + goto out; + + /* Whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + } + + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* The level might have changed, check again */ + level = btrfs_header_level(b); + +out: + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; + /* + * Callers are responsible for dropping b's references. 
+ */ + return b; +} + + /* * btrfs_search_slot - look for a key in a tree and perform necessary * modifications to preserve tree invariants. @@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err; int level; int lowest_unlock = 1; - int root_lock; /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0; u8 lowest_level = 0; @@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; - /* - * we try very hard to do read locks on the root - */ - root_lock = BTRFS_READ_LOCK; - level = 0; - if (p->search_commit_root) { - /* - * the commit roots are read only - * so we always do read locks - */ - if (p->need_commit_sem) - down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) - up_read(&fs_info->commit_root_sem); - if (!p->skip_locking) - btrfs_tree_read_lock(b); - } else { - if (p->skip_locking) { - b = btrfs_root_node(root); - level = btrfs_header_level(b); - } else { - /* we don't know the level of the root node - * until we actually have it read locked - */ - b = btrfs_read_lock_root_node(root); - level = btrfs_header_level(b); - if (level <= write_lock_level) { - /* whoops, must trade for write lock */ - btrfs_tree_read_unlock(b); - free_extent_buffer(b); - b = btrfs_lock_root_node(root); - root_lock = BTRFS_WRITE_LOCK; - - /* the level might have changed, check again */ - level = btrfs_header_level(b); - } - } - } - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = root_lock; + b = btrfs_search_slot_get_root(root, p, write_lock_level); while (b) { level = btrfs_header_level(b); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d422c9908b8..118346aceea9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,6 +739,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_NEED_ASYNC_COMMIT 17 +/* + * 
Indicate that balance has been set up from the ioctl and is in the main + * phase. The fs_info::balance_ctl is initialized. + */ +#define BTRFS_FS_BALANCE_RUNNING 18 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -838,7 +844,6 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; - struct mutex volume_mutex; /* * this is taken to make sure we don't set block groups ro after @@ -1004,7 +1009,6 @@ struct btrfs_fs_info { /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; - atomic_t balance_running; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; @@ -1219,9 +1223,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - spinlock_t orphan_lock; - atomic_t orphan_inodes; - struct btrfs_block_rsv *orphan_block_rsv; int orphan_cleanup_state; spinlock_t inode_lock; @@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode, void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -void btrfs_orphan_release_metadata(struct btrfs_inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, - int nitems, - u64 *qgroup_reserved, bool use_global_rsv); + int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, @@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, const u64 type); u64 add_new_free_space(struct btrfs_block_group_cache 
*block_group, - struct btrfs_fs_info *info, u64 start, u64 end); + u64 start, u64 end); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, @@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); -int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, int (*check_func)(struct btrfs_fs_info *, u8 *, u8, @@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct extent_map *em); /* inode.c */ -struct btrfs_delalloc_work { - struct inode *inode; - int delay_iput; - struct completion completion; - struct list_head list; - struct btrfs_work work; -}; - -struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int delay_iput); -void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); - struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); @@ -3193,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len); +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int 
btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, @@ -3204,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, - int nr); +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe); @@ -3218,7 +3197,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); -int btrfs_page_mkwrite(struct vm_fault *vmf); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); @@ -3240,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -3262,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int 
btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_update_iflags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff); @@ -3767,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) return 0; } +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc. 
+ */ + if (waitqueue_active(wq)) + wake_up(wq); +} + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a8d492dbd3e7..fe6caa7e698b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ + /* atomic_dec_return implies a barrier */ if ((atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)) + cond_wake_up_nomb(&delayed_root->wait); } static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e1b0651686f7..03dec673d12a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -286,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans, } void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; @@ -323,9 +323,7 @@ again: } } -int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { struct seq_list *elem; int ret = 0; @@ -336,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct seq_list, list); if (seq >= elem->seq) { btrfs_debug(fs_info, - "holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)", + "holding back delayed_ref %#x.%x, lowest is %#x.%x", (u32)(seq >> 32), (u32)seq, - (u32)(elem->seq >> 32), (u32)elem->seq, - 
delayed_refs); + (u32)(elem->seq >> 32), (u32)elem->seq); ret = 1; } } @@ -529,33 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, spin_unlock(&existing->lock); } -/* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. - */ -static noinline struct btrfs_delayed_ref_head * -add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int is_system, - int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) - +static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, u64 ref_root, + u64 reserved, int action, bool is_data, + bool is_system) { - struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; - int qrecord_inserted = 0; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); /* - * the head node stores the sum of all the mods, so dropping a ref + * The head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ if (action == BTRFS_UPDATE_DELAYED_HEAD) @@ -564,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, count_mod = -1; /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update - * the reserved accounting when the extent is finally added, or - * if a later modification deletes the delayed ref without ever - * inserting the extent into the extent allocation tree. - * ref->must_insert_reserved is the flag used to record - * that accounting mods are required. 
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved + * accounting when the extent is finally added, or if a later + * modification deletes the delayed ref without ever inserting the + * extent into the extent allocation tree. ref->must_insert_reserved + * is the flag used to record that accounting mods are required. * * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not required. @@ -579,8 +562,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, else must_insert_reserved = 0; - delayed_refs = &trans->transaction->delayed_refs; - refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; head_ref->num_bytes = num_bytes; @@ -598,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - /* Record qgroup extent info if provided */ if (qrecord) { if (ref_root && reserved) { head_ref->qgroup_ref_root = ref_root; @@ -608,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; + } +} + +/* + * helper function to actually insert a head node into the rbtree. + * this does all the dirty work in terms of maintaining the correct + * overall modification count. 
+ */ +static noinline struct btrfs_delayed_ref_head * +add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head_ref, + struct btrfs_qgroup_extent_record *qrecord, + int action, int *qrecord_inserted_ret, + int *old_ref_mod, int *new_ref_mod) +{ + struct btrfs_delayed_ref_head *existing; + struct btrfs_delayed_ref_root *delayed_refs; + int qrecord_inserted = 0; - if(btrfs_qgroup_trace_extent_nolock(fs_info, + delayed_refs = &trans->transaction->delayed_refs; + + /* Record qgroup extent info if provided */ + if (qrecord) { + if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, delayed_refs, qrecord)) kfree(qrecord); else qrecord_inserted = 1; } - trace_add_delayed_ref_head(fs_info, head_ref, action); + trace_add_delayed_ref_head(trans->fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - WARN_ON(ref_root && reserved && existing->qgroup_ref_root + WARN_ON(qrecord && head_ref->qgroup_ref_root + && head_ref->qgroup_reserved + && existing->qgroup_ref_root && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); @@ -634,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, } else { if (old_ref_mod) *old_ref_mod = 0; - if (is_data && count_mod < 0) - delayed_refs->pending_csums += num_bytes; + if (head_ref->is_data && head_ref->ref_mod < 0) + delayed_refs->pending_csums += head_ref->num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -645,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, *qrecord_inserted_ret = qrecord_inserted; if (new_ref_mod) *new_ref_mod = head_ref->total_ref_mod; - return head_ref; -} - -/* - * helper to insert a delayed tree ref into the rbtree. 
- */ -static noinline void -add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, int level, - int action) -{ - struct btrfs_delayed_tree_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; - int ret; - - if (action == BTRFS_ADD_DELAYED_EXTENT) - action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(ref_root)) - seq = atomic64_read(&fs_info->tree_mod_seq); - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = 1; - ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; - ref->seq = seq; - RB_CLEAR_NODE(&ref->ref_node); - INIT_LIST_HEAD(&ref->add_list); - - full_ref = btrfs_delayed_node_to_tree_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref->type = BTRFS_TREE_BLOCK_REF_KEY; - full_ref->level = level; - - trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - - ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); - - /* - * XXX: memory should be freed at the same level allocated. - * But bad practice is anywhere... Follow it now. Need cleanup. - */ - if (ret > 0) - kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); + return head_ref; } /* - * helper to insert a delayed data ref into the rbtree. + * init_delayed_ref_common - Initialize the structure which represents a + * modification to a an extent. + * + * @fs_info: Internal to the mounted filesystem mount structure. + * + * @ref: The structure which is going to be initialized. + * + * @bytenr: The logical address of the extent for which a modification is + * going to be recorded. + * + * @num_bytes: Size of the extent whose modification is being recorded. 
+ * + * @ref_root: The id of the root where this modification has originated, this + * can be either one of the well-known metadata trees or the + * subvolume id which references this extent. + * + * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or + * BTRFS_ADD_DELAYED_EXTENT + * + * @ref_type: Holds the type of the extent which is being recorded, can be + * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY + * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ + * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ -static noinline void -add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head_ref, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action) +static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 ref_root, + int action, u8 ref_type) { - struct btrfs_delayed_data_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; - int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - delayed_refs = &trans->transaction->delayed_refs; - if (is_fstree(ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); - /* first set the basic ref node struct up */ refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; @@ -737,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; + ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); - - full_ref = btrfs_delayed_node_to_data_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_DATA_REF_KEY; - else - ref->type = BTRFS_EXTENT_DATA_REF_KEY; - - full_ref->objectid = owner; - full_ref->offset = offset; - - trace_add_delayed_data_ref(fs_info, ref, 
full_ref, action); - - ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); - if (ret > 0) - kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } /* @@ -775,13 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; - int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + int ret; + u8 ref_type; BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; + if (parent) + ref_type = BTRFS_SHARED_BLOCK_REF_KEY; + else + ref_type = BTRFS_TREE_BLOCK_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); + ref->root = ref_root; + ref->parent = parent; + ref->level = level; + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) goto free_ref; @@ -793,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, goto free_head_ref; } + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, + ref_root, 0, action, false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -802,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, - bytenr, num_bytes, 0, 0, action, 0, - is_system, &qrecord_inserted, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted, old_ref_mod, new_ref_mod); - add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, + 
action == BTRFS_ADD_DELAYED_EXTENT ? + BTRFS_ADD_DELAYED_REF : action); + if (ret > 0) + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + if (qrecord_inserted) btrfs_qgroup_trace_extent_post(fs_info, record); @@ -839,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int ret; + u8 ref_type; ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; + if (parent) + ref_type = BTRFS_SHARED_DATA_REF_KEY; + else + ref_type = BTRFS_EXTENT_DATA_REF_KEY; + init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, + ref_root, action, ref_type); + ref->root = ref_root; + ref->parent = parent; + ref->objectid = owner; + ref->offset = offset; + + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -861,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, } } + init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, + reserved, action, true, false); head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; @@ -870,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, - bytenr, num_bytes, ref_root, reserved, - action, 1, 0, &qrecord_inserted, + head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted, old_ref_mod, new_ref_mod); - add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, owner, offset, - action); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, + action == BTRFS_ADD_DELAYED_EXTENT ? 
+ BTRFS_ADD_DELAYED_REF : action); + if (ret > 0) + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + + if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(fs_info, record); return 0; @@ -897,19 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, if (!head_ref) return -ENOMEM; + init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, + BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, + false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - /* - * extent_ops just modify the flags of an extent and they don't result - * in ref count changes, hence it's safe to pass false/0 for is_system - * argument - */ - add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, - num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, 0, NULL, NULL, NULL); + add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, + NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 7f00db50bd24..ea1aecb6a50d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); @@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) struct btrfs_delayed_ref_head * btrfs_select_ref_head(struct btrfs_trans_handle *trans); -int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* * helper functions to cast a node into its container diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c 
index f82be266ba4b..e2ba0419297a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); -static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); - int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { @@ -179,6 +177,105 @@ out: } /* + * Initialize a new device for device replace target from a given source dev + * and path. + * + * Return 0 and new device in @device_out, otherwise return < 0 + */ +static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + const char *device_path, + struct btrfs_device *srcdev, + struct btrfs_device **device_out) +{ + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *devices; + struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; + int ret = 0; + + *device_out = NULL; + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); + return -EINVAL; + } + + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + fs_info->bdev_holder); + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); + return PTR_ERR(bdev); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { + if (device->bdev == bdev) { + btrfs_err(fs_info, + "target device is in the filesystem!"); + ret = -EEXIST; + goto error; + } + } + + + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, + "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + name = rcu_string_strdup(device_path, GFP_KERNEL); + if (!name) { + 
btrfs_free_device(device); + ret = -ENOMEM; + goto error; + } + rcu_assign_pointer(device->name, name); + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + device->generation = 0; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; + device->fs_info = fs_info; + device->bdev = bdev; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + device->mode = FMODE_EXCL; + device->dev_stats_valid = 1; + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + device->fs_devices = fs_info->fs_devices; + list_add(&device->dev_list, &fs_info->fs_devices->devices); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + *device_out = device; + return 0; + +error: + blkdev_put(bdev, FMODE_EXCL); + return ret; +} + +/* * called from commit_transaction. Writes changed device replace state to * disk. 
*/ @@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - /* the disk copy procedure reuses the scrub code */ - mutex_lock(&fs_info->volume_mutex); ret = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name, &src_device); - if (ret) { - mutex_unlock(&fs_info->volume_mutex); + if (ret) return ret; - } ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) return ret; @@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; - WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; btrfs_info_in_rcu(fs_info, @@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ret = btrfs_commit_transaction(trans); WARN_ON(ret); - mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); @@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); @@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info, */ mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); @@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_write_unlock(dev_replace); - WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); + /* + * This could collide with a paused balance, but the exclusive op logic + * should never allow both to start and pause. We don't want to allow + * dev-replace to start anyway. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_info(fs_info, + "cannot resume dev-replace, other exclusive operation running"); + return 0; + } + task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data) struct btrfs_fs_info *fs_info = data; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; u64 progress; + int ret; progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); @@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_name(dev_replace->tgtdev), (unsigned int)progress); - btrfs_dev_replace_continue_on_mount(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); - - return 0; -} - -static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) -{ - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int ret; - ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } @@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking( ASSERT(atomic_read(&dev_replace->read_locks) > 0); 
ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); read_lock(&dev_replace->lock); - if (atomic_dec_and_test(&dev_replace->blocking_readers) && - waitqueue_active(&dev_replace->read_lock_wq)) - wake_up(&dev_replace->read_lock_wq); + /* Barrier implied by atomic_dec_and_test */ + if (atomic_dec_and_test(&dev_replace->blocking_readers)) + cond_wake_up_nomb(&dev_replace->read_lock_wq); } void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) @@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { percpu_counter_sub(&fs_info->bio_counter, amount); - - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + cond_wake_up_nomb(&fs_info->replace_wait); } void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c3504b4d281b..205092dc9390 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,7 +55,6 @@ static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -416,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, static int verify_level_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int level, - struct btrfs_key *first_key) + struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; @@ -454,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, if (ret) { WARN_ON(1); btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", - eb->start, first_key->objectid, 
first_key->type, - first_key->offset, found_key.objectid, - found_key.type, found_key.offset); +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); } #endif return ret; @@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, parent_transid, 0)) ret = -EIO; else if (verify_level_key(fs_info, eb, level, - first_key)) + first_key, parent_transid)) ret = -EUCLEAN; else break; @@ -1185,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; - root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1195,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); - spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); @@ -1216,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); - atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); root->log_transid = 0; @@ -2164,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; @@ -2442,6 +2438,211 
@@ out: return ret; } +/* + * Real super block validation + * NOTE: super csum type and incompat features will not be checked here. + * + * @sb: super block to check + * @mirror_num: the super block number to check its bytenr: + * 0 the primary (1st) sb + * 1, 2 2nd and 3rd backup copy + * -1 skip bytenr check + */ +static int validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) +{ + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); + int ret = 0; + + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + btrfs_err(fs_info, "no valid FS found"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "tree_root level too big: %d >= %d", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "chunk_root level too big: %d >= %d", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "log_root level too big: %d >= %d", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + + /* + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. 
+ */ + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_SIZE) { + btrfs_err(fs_info, + "sectorsize %llu not supported yet, only support %lu", + sectorsize, PAGE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid nodesize %llu", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + btrfs_err(fs_info, "invalid leafsize %u, should be %llu", + le32_to_cpu(sb->__unused_leafsize), nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { + btrfs_warn(fs_info, "tree_root block unaligned: %llu", + btrfs_super_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { + btrfs_warn(fs_info, "chunk_root block unaligned: %llu", + btrfs_super_chunk_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + btrfs_warn(fs_info, "log_root block unaligned: %llu", + btrfs_super_log_root(sb)); + ret = -EINVAL; + } + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match fsid: %pU != %pU", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_num_devices(sb) > (1UL << 31)) + btrfs_warn(fs_info, "suspicious 
number of devices: %llu", + btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + btrfs_err(fs_info, "number of devices is 0"); + ret = -EINVAL; + } + + if (mirror_num >= 0 && + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { + btrfs_err(fs_info, "super offset mismatch %llu != %u", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + btrfs_err(fs_info, "system chunk array too small %u < %zu", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + btrfs_warn(fs_info, + "suspicious: generation < chunk_root_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) + btrfs_warn(fs_info, + "suspicious: generation < cache_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_cache_generation(sb)); + + return ret; +} + +/* + * Validation of super block at mount time. + * Some checks already done early at mount time, like csum type and incompat + * flags will be skipped. 
+ */ +static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) +{ + return validate_super(fs_info, fs_info->super_copy, 0); +} + +/* + * Validation of super block at write time. + * Some checks like bytenr check will be skipped as their values will be + * overwritten soon. + * Extra checks like csum type and incompat flags will be done here. + */ +static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb) +{ + int ret; + + ret = validate_super(fs_info, sb, -1); + if (ret < 0) + goto out; + if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + ret = -EUCLEAN; + btrfs_err(fs_info, "invalid csum type, has %u want %u", + btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); + goto out; + } + if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "invalid incompat flags, has 0x%llx valid mask 0x%llx", + btrfs_super_incompat_flags(sb), + (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); + goto out; + } +out: + if (ret < 0) + btrfs_err(fs_info, + "super block corruption detected before writing it to disk"); + return ret; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2601,7 +2802,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); - mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); @@ -2668,7 +2868,7 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - ret = btrfs_check_super_valid(fs_info); + ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; @@ -3523,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) for (raid_type = 0; raid_type < 
BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE) continue; - if (!(flags & btrfs_raid_group[raid_type])) + if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; min_tolerated = min(min_tolerated, btrfs_raid_array[raid_type]. @@ -3603,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + ret = btrfs_validate_write_super(fs_info, sb); + if (ret < 0) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_handle_fs_error(fs_info, -EUCLEAN, + "unexpected superblock corruption detected"); + return -EUCLEAN; + } + ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; @@ -3674,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root) { iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); if (root->subv_writers) @@ -3766,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) void close_ctree(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); @@ -3862,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); - __btrfs_free_block_rsv(root->orphan_block_rsv); - root->orphan_block_rsv = NULL; - while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; @@ -3975,155 +4177,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, level, first_key); } -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *sb = fs_info->super_copy; - u64 nodesize = btrfs_super_nodesize(sb); - u64 sectorsize = btrfs_super_sectorsize(sb); - int ret = 0; - - if (btrfs_super_magic(sb) != BTRFS_MAGIC) { 
- btrfs_err(fs_info, "no valid FS found"); - ret = -EINVAL; - } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; - } - if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "tree_root level too big: %d >= %d", - btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "chunk_root level too big: %d >= %d", - btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "log_root level too big: %d >= %d", - btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - - /* - * Check sectorsize and nodesize first, other check will need it. - * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. - */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || - sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); - ret = -EINVAL; - } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { - btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", - sectorsize, PAGE_SIZE); - ret = -EINVAL; - } - if (!is_power_of_2(nodesize) || nodesize < sectorsize || - nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid nodesize %llu", nodesize); - ret = -EINVAL; - } - if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { - btrfs_err(fs_info, "invalid leafsize %u, should be %llu", - le32_to_cpu(sb->__unused_leafsize), nodesize); - ret = -EINVAL; - } - - /* Root alignment check */ - if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { - btrfs_warn(fs_info, "tree_root block unaligned: %llu", - btrfs_super_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { - btrfs_warn(fs_info, 
"chunk_root block unaligned: %llu", - btrfs_super_chunk_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { - btrfs_warn(fs_info, "log_root block unaligned: %llu", - btrfs_super_log_root(sb)); - ret = -EINVAL; - } - - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); - ret = -EINVAL; - } - - /* - * Hint to catch really bogus numbers, bitflips or so, more exact checks are - * done later - */ - if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { - btrfs_err(fs_info, "bytes_used is too small %llu", - btrfs_super_bytes_used(sb)); - ret = -EINVAL; - } - if (!is_power_of_2(btrfs_super_stripesize(sb))) { - btrfs_err(fs_info, "invalid stripesize %u", - btrfs_super_stripesize(sb)); - ret = -EINVAL; - } - if (btrfs_super_num_devices(sb) > (1UL << 31)) - btrfs_warn(fs_info, "suspicious number of devices: %llu", - btrfs_super_num_devices(sb)); - if (btrfs_super_num_devices(sb) == 0) { - btrfs_err(fs_info, "number of devices is 0"); - ret = -EINVAL; - } - - if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); - ret = -EINVAL; - } - - /* - * Obvious sys_chunk_array corruptions, it must hold at least one key - * and one chunk - */ - if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - btrfs_err(fs_info, "system chunk array too big %u > %u", - btrfs_super_sys_array_size(sb), - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); - ret = -EINVAL; - } - if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)) { - btrfs_err(fs_info, "system chunk array too small %u < %zu", - btrfs_super_sys_array_size(sb), - sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)); - ret = -EINVAL; - } - - /* - * The generation is a global counter, we'll trust it more 
than the others - * but it's still possible that it's the one that's wrong. - */ - if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) - btrfs_warn(fs_info, - "suspicious: generation < chunk_root_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_chunk_root_generation(sb)); - if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) - && btrfs_super_cache_generation(sb) != (u64)-1) - btrfs_warn(fs_info, - "suspicious: generation < cache_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_cache_generation(sb)); - - return ret; -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51b5e2da708c..3d9fe58c0080 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -66,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 flags, int force); @@ -256,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info, for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(fs_info, cache->key.objectid, - bytenr, 0, &logical, &nr, &stripe_len); + bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -343,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group) * since their free space will be released as soon as the transaction commits. 
*/ u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) + u64 start, u64 end) { + struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -489,8 +488,7 @@ next: if (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, - fs_info, last, + total_found += add_new_free_space(block_group, last, key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + @@ -508,7 +506,7 @@ next: } ret = 0; - total_found += add_new_free_space(block_group, fs_info, last, + total_found += add_new_free_space(block_group, last, block_group->key.objectid + block_group->key.offset); caching_ctl->progress = (u64)-1; @@ -744,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, } static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, - u64 owner, u64 root_objectid) + bool metadata, u64 root_objectid) { struct btrfs_space_info *space_info; u64 flags; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (metadata) { if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) flags = BTRFS_BLOCK_GROUP_SYSTEM; else @@ -2200,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid); + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); + } return ret; } @@ -2428,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_delayed_tree_ref *ref; - struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); 
trace_run_delayed_tree_ref(fs_info, node, ref, node->action); @@ -2440,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - ins.objectid = node->bytenr; - if (skinny_metadata) { - ins.offset = ref->level; - ins.type = BTRFS_METADATA_ITEM_KEY; - } else { - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - } - if (node->ref_mod != 1) { btrfs_err(fs_info, "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", @@ -2458,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); - ret = alloc_reserved_tree_block(trans, fs_info, - parent, ref_root, - extent_op->flags_to_set, - &extent_op->key, - ref->level, &ins); + ret = alloc_reserved_tree_block(trans, node, extent_op); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, ref_root, @@ -2594,8 +2580,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads--; rb_erase(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); - spin_unlock(&delayed_refs->lock); spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); atomic_dec(&delayed_refs->num_entries); trace_run_delayed_ref_head(fs_info, head, 0); @@ -2700,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * insert_inline_extent_backref()). 
*/ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; @@ -3291,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, path = btrfs_alloc_path(); if (!path) - return -ENOENT; + return -ENOMEM; do { ret = check_committed_ref(root, path, objectid, @@ -4026,8 +4007,7 @@ static const char *alloc_name(u64 flags) }; } -static int create_space_info(struct btrfs_fs_info *info, u64 flags, - struct btrfs_space_info **new) +static int create_space_info(struct btrfs_fs_info *info, u64 flags) { struct btrfs_space_info *space_info; @@ -4065,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - *new = space_info; list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -4122,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * returns target flags in extended format or 0 if restripe for this * chunk_type is not in progress * - * should be called with either volume_mutex or balance_lock held + * should be called with balance_lock held */ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { @@ -4178,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) /* First, mask out the RAID levels which aren't possible */ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (num_devices >= btrfs_raid_array[raid_type].devs_min) - allowed |= 
btrfs_raid_group[raid_type]; + allowed |= btrfs_raid_array[raid_type].bg_flag; } allowed &= flags; @@ -4341,7 +4320,7 @@ commit_trans: need_commit--; if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, 0, -1); + btrfs_start_delalloc_roots(fs_info, -1); btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } @@ -4678,12 +4657,14 @@ again: trans->allocating_chunk = false; spin_lock(&space_info->lock); - if (ret < 0 && ret != -ENOSPC) - goto out; - if (ret) - space_info->full = 1; - else + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { ret = 1; + } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: @@ -4792,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_roots(fs_info, 0, nr_items); + btrfs_start_delalloc_roots(fs_info, nr_items); if (!current->journal_info) btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); } @@ -5949,44 +5930,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) trans->chunk_bytes_reserved = 0; } -/* Can only return 0 or -ENOSPC */ -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - /* - * We always use trans->block_rsv here as we will have reserved space - * for our orphan when starting the transaction, using get_block_rsv() - * here will sometimes make us choose the wrong block rsv as we could be - * doing a reloc inode for a non refcounted root. - */ - struct btrfs_block_rsv *src_rsv = trans->block_rsv; - struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; - - /* - * We need to hold space in order to delete our orphan item once we've - * added it, so this takes the reservation so we can release it later - * when we are truly done with the orphan item. 
- */ - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 1); - return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -} - -void btrfs_orphan_release_metadata(struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 0); - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); -} - /* * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation * root: the root of the parent directory @@ -6004,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode) int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int items, - u64 *qgroup_reserved, bool use_global_rsv) { u64 num_bytes; @@ -6022,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, num_bytes = 0; } - *qgroup_reserved = num_bytes; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -6033,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); - if (ret && *qgroup_reserved) - btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved); + if (ret && num_bytes) + btrfs_qgroup_free_meta_prealloc(root, num_bytes); return ret; } @@ -6354,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&info->unused_bgs_lock); if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &info->unused_bgs); } @@ -6511,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 
struct btrfs_key key; int found_type; int i; + int ret = 0; if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) return 0; @@ -6527,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, continue; key.objectid = btrfs_file_extent_disk_bytenr(eb, item); key.offset = btrfs_file_extent_disk_num_bytes(eb, item); - __exclude_logged_extent(fs_info, key.objectid, key.offset); + ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); + if (ret) + break; } - return 0; + return ret; } static void @@ -7122,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); + ret = add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7266,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), + add_pinned_bytes(fs_info, buf->len, true, root->root_key.objectid); if (last_ref) { @@ -7320,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); + } return ret; } @@ -7373,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return ret; } -static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = "raid10", - [BTRFS_RAID_RAID1] = "raid1", - [BTRFS_RAID_DUP] = "dup", - [BTRFS_RAID_RAID0] = "raid0", - [BTRFS_RAID_SINGLE] = "single", - [BTRFS_RAID_RAID5] = "raid5", - [BTRFS_RAID_RAID6] = "raid6", -}; - -static const char *get_raid_name(enum btrfs_raid_types type) -{ - if (type >= BTRFS_NR_RAID_TYPES) - return NULL; - - return 
btrfs_raid_type_names[type]; -} - enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -7662,7 +7591,7 @@ have_block_group: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(fs_info, + trace_btrfs_reserve_extent_cluster( used_block_group, search_start, num_bytes); if (used_block_group != block_group) { @@ -7735,7 +7664,7 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(fs_info, + trace_btrfs_reserve_extent_cluster( block_group, search_start, num_bytes); goto checks; @@ -7835,8 +7764,7 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(fs_info, block_group, - search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, search_start, num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: @@ -8184,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, - ins->offset); + ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); if (ret) return ret; @@ -8200,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_key extent_key; struct btrfs_tree_block_info *block_info; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_delayed_tree_ref *ref; u32 size = 
sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes = ins->offset; + u64 num_bytes; + u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - if (!skinny_metadata) + ref = btrfs_delayed_node_to_tree_ref(node); + + extent_key.objectid = node->bytenr; + if (skinny_metadata) { + extent_key.offset = ref->level; + extent_key.type = BTRFS_METADATA_ITEM_KEY; + num_bytes = fs_info->nodesize; + } else { + extent_key.offset = node->num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); + num_bytes = node->num_bytes; + } path = btrfs_alloc_path(); if (!path) { - btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + btrfs_free_and_pin_reserved_extent(fs_info, + extent_key.objectid, fs_info->nodesize); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + &extent_key, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + btrfs_free_and_pin_reserved_extent(fs_info, + extent_key.objectid, fs_info->nodesize); return ret; } @@ -8245,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = fs_info->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); + btrfs_set_tree_block_level(leaf, block_info, ref->level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } - if (parent > 0) { + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_extent_inline_ref_offset(leaf, 
iref, ref->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); } btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ret = remove_from_free_space_tree(trans, extent_key.objectid, num_bytes); if (ret) return ret; - ret = update_block_group(trans, fs_info, ins->objectid, + ret = update_block_group(trans, fs_info, extent_key.objectid, fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); + extent_key.objectid, extent_key.offset); BUG(); } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, + trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, fs_info->nodesize); return ret; } @@ -10173,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } else if (btrfs_block_group_used(&cache->item) == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, info, - found_key.objectid, + add_new_free_space(cache, found_key.objectid, found_key.objectid + found_key.offset); free_excluded_extents(info, cache); @@ -10204,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) /* Should always be true but just in case. */ if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &info->unused_bgs); } @@ -10269,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) key.offset); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, fs_info, block_group); + add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. 
*/ next: list_del_init(&block_group->bg_list); @@ -10310,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } - add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); + add_new_free_space(cache, chunk_offset, chunk_offset + size); free_excluded_extents(fs_info, cache); @@ -10391,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group); BUG_ON(!block_group->ro); + trace_btrfs_remove_block_group(block_group); /* * Free the reserved super bytes from this block group before * remove it. @@ -10648,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, fs_info, block_group); + ret = remove_block_group_free_space(trans, block_group); if (ret) goto out; @@ -10755,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * the ro check in case balance is currently acting on * this block group. 
*/ + trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); up_write(&space_info->groups_sem); goto next; @@ -10877,7 +10820,6 @@ next: int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { - struct btrfs_space_info *space_info; struct btrfs_super_block *disk_super; u64 features; u64 flags; @@ -10893,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags, &space_info); + ret = create_space_info(fs_info, flags); } out: return ret; @@ -11092,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) void btrfs_end_write_no_snapshotting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); - /* - * Make sure counter is updated before we wake up waiters. 
- */ - smp_mb(); - if (waitqueue_active(&root->subv_writers->wait)) - wake_up(&root->subv_writers->wait); + cond_wake_up(&root->subv_writers->wait); } int btrfs_start_write_no_snapshotting(struct btrfs_root *root) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e99b329002cf..b3e45714d28f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,7 +26,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set *btrfs_bioset; +static struct bio_set btrfs_bioset; static inline bool extent_state_in_tree(const struct extent_state *state) { @@ -162,20 +162,18 @@ int __init extent_io_init(void) if (!extent_buffer_cache) goto free_state_cache; - btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), - BIOSET_NEED_BVECS); - if (!btrfs_bioset) + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS)) goto free_buffer_cache; - if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) goto free_bioset; return 0; free_bioset: - bioset_free(btrfs_bioset); - btrfs_bioset = NULL; + bioset_exit(&btrfs_bioset); free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); @@ -198,8 +196,7 @@ void __cold extent_io_exit(void) rcu_barrier(); kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); - if (btrfs_bioset) - bioset_free(btrfs_bioset); + bioset_exit(&btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); @@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct 
bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); btrfs_bio = btrfs_io_bio(new); btrfs_io_bio_init(btrfs_bio); btrfs_bio->iter = bio->bi_iter; @@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) struct bio *bio; /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } @@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) struct btrfs_io_bio *btrfs_bio; /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); btrfs_bio = btrfs_io_bio(bio); @@ -4109,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, return ret; } -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, +int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; struct extent_page_data epd = { .bio = NULL, - .tree = tree, + .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4126,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree, return ret; } -int extent_readpages(struct extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +int extent_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages) { struct bio *bio = NULL; unsigned page_idx; @@ -4136,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree, struct page *pagepool[16]; struct page *page; struct extent_map *em_cached = NULL; + struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 
prev_em_start = (u64)-1; @@ -4202,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -static int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, +static int try_release_extent_state(struct extent_io_tree *tree, struct page *page, gfp_t mask) { u64 start = page_offset(page); @@ -4238,13 +4233,14 @@ static int try_release_extent_state(struct extent_map_tree *map, * in the range corresponding to the page, both state records and extent * map records are removed */ -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +int try_release_extent_mapping(struct page *page, gfp_t mask) { struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &btrfs_inode->io_tree; + struct extent_map_tree *map = &btrfs_inode->extent_tree; if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > SZ_16M) { @@ -4267,6 +4263,8 @@ int try_release_extent_mapping(struct extent_map_tree *map, extent_map_end(em) - 1, EXTENT_LOCKED | EXTENT_WRITEBACK, 0, NULL)) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); @@ -4278,7 +4276,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, free_extent_map(em); } } - return try_release_extent_state(map, tree, page, mask); + return try_release_extent_state(tree, page, mask); } /* @@ -4547,8 +4545,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - disko = 0; flags = 0; + if (em->block_start < EXTENT_MAP_LAST_BYTE) + disko = em->block_start + offset_in_extent; + else + 
disko = 0; /* * bump off for our next call to get_extent @@ -4570,8 +4571,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 bytenr = em->block_start - (em->start - em->orig_start); - disko = em->block_start + offset_in_extent; - /* * As btrfs supports shared space, this information * can be exported to userspace tools via @@ -5620,46 +5619,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -void le_bitmap_set(u8 *map, unsigned int start, int len) -{ - u8 *p = map + BIT_BYTE(start); - const unsigned int size = start + len; - int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); - - while (len - bits_to_set >= 0) { - *p |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - p++; - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - *p |= mask_to_set; - } -} - -void le_bitmap_clear(u8 *map, unsigned int start, int len) -{ - u8 *p = map + BIT_BYTE(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - *p &= ~mask_to_clear; - } -} - /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the * given bit number diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a53009694b16..0bfd4aeb822d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,14 +79,6 @@ #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -static inline int le_test_bit(int nr, const u8 *addr) -{ - return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); -} - -void le_bitmap_set(u8 *map, unsigned int start, int len); -void 
le_bitmap_clear(u8 *map, unsigned int start, int len); - struct extent_state; struct btrfs_root; struct btrfs_inode; @@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int create); void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); +int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); @@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree, int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, +int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +int extent_readpages(struct address_space *mapping, struct list_head *pages, + unsigned nr_pages); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1b8a078f92eb..6648d55e5339 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, /** * btrfs_add_extent_mapping - add extent mapping into em_tree + * @fs_info - used for tracepoint * @em_tree - the extent tree into which we want to insert the extent mapping * @em_in - extent we are inserting * @start - start of the 
logical range btrfs_get_extent() is requesting @@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len) { int ret; @@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, existing = search_extent_mapping(em_tree, start, len); - trace_btrfs_handle_em_exist(existing, em, start, len); + trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); /* * existing will always be non-NULL, since there must be diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5fcb80a6ce37..25d985e7532a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f660ba1e5e58..51e77d72068a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1842,16 +1842,16 @@ out: static void update_time_for_write(struct inode *inode) { - struct timespec now; + struct timespec64 now; if (IS_NOCMTIME(inode)) return; now = current_time(inode); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, &now)) inode->i_mtime = now; - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, &now)) inode->i_ctime = now; if (IS_I_VERSION(inode)) diff --git a/fs/btrfs/free-space-cache.c 
b/fs/btrfs/free-space-cache.c index e5b569bebc73..d5f80cb300be 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, truncate_pagecache(inode, 0); /* - * We don't need an orphan item because truncating the free space cache - * will never be split across transactions. - * We don't need to check for -EAGAIN because we're a free space - * cache inode + * We skip the throttling logic for free space cache inodes, so we don't + * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 32a0f6cb5594..b5950aacd697 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -12,7 +12,6 @@ #include "transaction.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -45,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) } static int add_new_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key; struct extent_buffer *leaf; @@ -138,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static u8 *alloc_bitmap(u32 bitmap_size) +static unsigned long *alloc_bitmap(u32 bitmap_size) { - u8 *ret; + unsigned long *ret; unsigned int nofs_flag; + u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long)); /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse @@ -152,21 +151,42 @@ 
static u8 *alloc_bitmap(u32 bitmap_size) * know that recursion is unsafe. */ nofs_flag = memalloc_nofs_save(); - ret = kvzalloc(bitmap_size, GFP_KERNEL); + ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); return ret; } +static void le_bitmap_set(unsigned long *map, unsigned int start, int len) +{ + u8 *p = ((u8 *)map) + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - u8 *bitmap, *bitmap_cursor; + unsigned long *bitmap; + char *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -255,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = bitmap; + bitmap_cursor = (char *)bitmap; bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -296,21 +316,18 @@ out: } int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - u8 *bitmap; + unsigned long *bitmap; u64 start, 
end; - /* Initialize to silence GCC. */ - u64 extent_start = 0; - u64 offset; u32 bitmap_size, flags, expected_extent_count; - int prev_bit = 0, bit, bitnr; + unsigned long nrbits, start_bit, end_bit; u32 extent_count = 0; int done = 0, nr; int ret; @@ -348,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - u8 *bitmap_cursor; + char *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -358,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, fs_info->sectorsize * BITS_PER_BYTE); - bitmap_cursor = bitmap + bitmap_pos; + bitmap_cursor = ((char *)bitmap) + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, fs_info->sectorsize); @@ -392,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - offset = start; - bitnr = 0; - while (offset < end) { - bit = !!le_test_bit(bitnr, bitmap); - if (prev_bit == 0 && bit == 1) { - extent_start = offset; - } else if (prev_bit == 1 && bit == 0) { - key.objectid = extent_start; - key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = offset - extent_start; - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) - goto out; - btrfs_release_path(path); + nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize); + start_bit = find_next_bit_le(bitmap, nrbits, 0); - extent_count++; - } - prev_bit = bit; - offset += fs_info->sectorsize; - bitnr++; - } - if (prev_bit == 1) { - key.objectid = extent_start; + while (start_bit < nrbits) { + end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); + ASSERT(start_bit < end_bit); + + key.objectid = start + start_bit * block_group->fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = end - extent_start; + key.offset = 
(end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) @@ -425,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); extent_count++; + + start_bit = find_next_bit_le(bitmap, nrbits, end_bit); } if (extent_count != expected_extent_count) { @@ -446,7 +449,6 @@ out: } static int update_free_space_extent_count(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int new_extents) @@ -459,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, trans->fs_info, block_group, path, + 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -474,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { - ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, - path); + ret = convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { - ret = convert_free_space_to_extents(trans, fs_info, block_group, - path); + ret = convert_free_space_to_extents(trans, block_group, path); } out: @@ -576,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, * the bitmap. 
*/ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size, int remove) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = block_group->fs_info->free_space_root; struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; @@ -682,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -690,12 +690,11 @@ out: } static int remove_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_key key; u64 found_start, found_end; u64 end = start + size; @@ -769,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -777,7 +776,6 @@ out: } int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -786,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, int ret; if (block_group->needs_free_space) { - ret = __add_block_group_free_space(trans, fs_info, block_group, - path); + ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, 
trans->fs_info, block_group, path, + 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - return modify_free_space_bitmap(trans, fs_info, block_group, - path, start, size, 1); + return modify_free_space_bitmap(trans, block_group, path, + start, size, 1); } else { - return remove_free_space_extent(trans, fs_info, block_group, - path, start, size); + return remove_free_space_extent(trans, block_group, path, + start, size); } } int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size) { struct btrfs_block_group_cache *block_group; struct btrfs_path *path; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); @@ -824,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, goto out; } - block_group = btrfs_lookup_block_group(fs_info, start); + block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; @@ -832,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, - start, size); + ret = __remove_from_free_space_tree(trans, block_group, path, start, + size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); @@ -845,12 +842,11 @@ out: } static int add_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_key key, new_key; u64 found_start, found_end; u64 end = start + size; @@ -965,7 +961,7 @@ insert: goto 
out; btrfs_release_path(path); - ret = update_free_space_extent_count(trans, fs_info, block_group, path, + ret = update_free_space_extent_count(trans, block_group, path, new_extents); out: @@ -973,17 +969,16 @@ out: } int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_info *info; u32 flags; int ret; if (block_group->needs_free_space) { - ret = __add_block_group_free_space(trans, fs_info, block_group, - path); + ret = __add_block_group_free_space(trans, block_group, path); if (ret) return ret; } @@ -995,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - return modify_free_space_bitmap(trans, fs_info, block_group, - path, start, size, 0); + return modify_free_space_bitmap(trans, block_group, path, + start, size, 0); } else { - return add_free_space_extent(trans, fs_info, block_group, path, - start, size); + return add_free_space_extent(trans, block_group, path, start, + size); } } int add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size) { struct btrfs_block_group_cache *block_group; struct btrfs_path *path; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; path = btrfs_alloc_path(); @@ -1020,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, goto out; } - block_group = btrfs_lookup_block_group(fs_info, start); + block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { ASSERT(0); ret = -ENOENT; @@ -1028,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __add_to_free_space_tree(trans, fs_info, 
block_group, path, start, - size); + ret = __add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); btrfs_put_block_group(block_group); @@ -1046,10 +1039,9 @@ out: * through the normal add/remove hooks. */ static int populate_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = trans->fs_info->extent_root; struct btrfs_path *path, *path2; struct btrfs_key key; u64 start, end; @@ -1066,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = add_new_free_space_info(trans, fs_info, block_group, path2); + ret = add_new_free_space_info(trans, block_group, path2); if (ret) goto out; @@ -1099,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; if (start < key.objectid) { - ret = __add_to_free_space_tree(trans, fs_info, + ret = __add_to_free_space_tree(trans, block_group, path2, start, key.objectid - @@ -1109,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, } start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) - start += fs_info->nodesize; + start += trans->fs_info->nodesize; else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { @@ -1124,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; } if (start < end) { - ret = __add_to_free_space_tree(trans, fs_info, block_group, - path2, start, end - start); + ret = __add_to_free_space_tree(trans, block_group, path2, + start, end - start); if (ret) goto out_locked; } @@ -1165,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) while (node) { block_group = rb_entry(node, struct btrfs_block_group_cache, cache_node); - ret = populate_free_space_tree(trans, fs_info, block_group); + ret = 
populate_free_space_tree(trans, block_group); if (ret) goto abort; node = rb_next(node); @@ -1269,7 +1261,6 @@ abort: } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { @@ -1277,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, block_group->needs_free_space = 0; - ret = add_new_free_space_info(trans, fs_info, block_group, path); + ret = add_new_free_space_info(trans, block_group, path); if (ret) return ret; - return __add_to_free_space_tree(trans, fs_info, block_group, path, + return __add_to_free_space_tree(trans, block_group, path, block_group->key.objectid, block_group->key.offset); } int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path = NULL; int ret = 0; @@ -1306,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans, goto out; } - ret = __add_block_group_free_space(trans, fs_info, block_group, path); + ret = __add_block_group_free_space(trans, block_group, path); out: btrfs_free_path(path); @@ -1317,10 +1308,9 @@ out: } int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_path *path; struct btrfs_key key, found_key; struct extent_buffer *leaf; @@ -1328,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; if (block_group->needs_free_space) { @@ -1439,7 +1429,6 @@ static int load_free_space_bitmaps(struct 
btrfs_caching_control *caching_ctl, extent_start = offset; } else if (prev_bit == 1 && bit == 0) { total_found += add_new_free_space(block_group, - fs_info, extent_start, offset); if (total_found > CACHING_CTL_WAKE_UP) { @@ -1453,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - total_found += add_new_free_space(block_group, fs_info, - extent_start, end); + total_found += add_new_free_space(block_group, extent_start, + end); extent_count++; } @@ -1511,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, caching_ctl->progress = key.objectid; - total_found += add_new_free_space(block_group, fs_info, - key.objectid, + total_found += add_new_free_space(block_group, key.objectid, key.objectid + key.offset); if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 874b4feecad2..3133651d7d70 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); int add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size); int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 start, u64 size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int cow); int 
__add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size); int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size); int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); int free_space_test_bit(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e604e7071f1..eba61bcb9bb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1018,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode, ram_size, /* ram_bytes */ BTRFS_COMPRESS_NONE, /* compress_type */ BTRFS_ORDERED_REGULAR /* type */); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto out_reserve; + } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, @@ -1156,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * atomic_sub_return implies a barrier for waitqueue_active - */ + /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); if (async_cow->inode) submit_compressed_extents(async_cow->inode, async_cow); @@ -1373,6 +1372,13 @@ next_slot: btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; + /* + * 
Do the same check as in btrfs_cross_ref_exist but + * without the unnecessary search. + */ + if (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; if (btrfs_extent_readonly(fs_info, disk_bytenr)) @@ -1754,6 +1760,7 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root, &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { + ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); BUG_ON(list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); @@ -3158,6 +3165,9 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); + /* Try to release some metadata so we don't get an OOM but don't wait */ + btrfs_btree_balance_dirty_nodelay(fs_info); + return ret; } @@ -3300,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /* - * This is called in transaction commit time. If there are no orphan - * files in the subvolume, it removes orphan item and frees block_rsv - * structure. 
- */ -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - int ret; - - if (atomic_read(&root->orphan_inodes) || - root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) - return; - - spin_lock(&root->orphan_lock); - if (atomic_read(&root->orphan_inodes)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - - if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && - btrfs_root_refs(&root->root_item) > 0) { - ret = btrfs_del_orphan_item(trans, fs_info->tree_root, - root->root_key.objectid); - if (ret) - btrfs_abort_transaction(trans, ret); - else - clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, - &root->state); - } - - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(fs_info, block_rsv); - } -} - -/* - * This creates an orphan entry for the given inode in case something goes - * wrong in the middle of an unlink/truncate. - * - * NOTE: caller of this function should reserve 5 units of metadata for - * this function. + * This creates an orphan entry for the given inode in case something goes wrong + * in the middle of an unlink. 
*/ int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = NULL; - int reserve = 0; - bool insert = false; int ret; - if (!root->orphan_block_rsv) { - block_rsv = btrfs_alloc_block_rsv(fs_info, - BTRFS_BLOCK_RSV_TEMP); - if (!block_rsv) - return -ENOMEM; - } - - if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) - insert = true; - - if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags)) - reserve = 1; - - spin_lock(&root->orphan_lock); - /* If someone has created ->orphan_block_rsv, be happy to use it. */ - if (!root->orphan_block_rsv) { - root->orphan_block_rsv = block_rsv; - } else if (block_rsv) { - btrfs_free_block_rsv(fs_info, block_rsv); - block_rsv = NULL; - } - - if (insert) - atomic_inc(&root->orphan_inodes); - spin_unlock(&root->orphan_lock); - - /* grab metadata reservation from transaction handle */ - if (reserve) { - ret = btrfs_orphan_reserve_metadata(trans, inode); - ASSERT(!ret); - if (ret) { - /* - * dec doesn't need spin_lock as ->orphan_block_rsv - * would be released only if ->orphan_inodes is - * zero. - */ - atomic_dec(&root->orphan_inodes); - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags); - if (insert) - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags); - return ret; - } - } - - /* insert an orphan item to track this unlinked/truncated file */ - if (insert) { - ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - if (ret) { - if (reserve) { - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags); - btrfs_orphan_release_metadata(inode); - } - /* - * btrfs_orphan_commit_root may race with us and set - * ->orphan_block_rsv to zero, in order to avoid that, - * decrease ->orphan_inodes after everything is done. 
- */ - atomic_dec(&root->orphan_inodes); - if (ret != -EEXIST) { - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags); - btrfs_abort_transaction(trans, ret); - return ret; - } - } - ret = 0; + ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + return ret; } return 0; } /* - * We have done the truncate/delete so we can go ahead and remove the orphan - * item for this particular inode. + * We have done the delete so we can go ahead and remove the orphan item for + * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - int delete_item = 0; - int ret = 0; - - if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) - delete_item = 1; - - if (delete_item && trans) - ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - - if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, - &inode->runtime_flags)) - btrfs_orphan_release_metadata(inode); - - /* - * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv - * to zero, in order to avoid that, decrease ->orphan_inodes after - * everything is done. - */ - if (delete_item) - atomic_dec(&root->orphan_inodes); - - return ret; + return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* @@ -3486,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_trans_handle *trans; struct inode *inode; u64 last_objectid = 0; - int ret = 0, nr_unlink = 0, nr_truncate = 0; + int ret = 0, nr_unlink = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return 0; @@ -3586,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = found_key.objectid - 1; continue; } + } + /* - * Inode is already gone but the orphan item is still there, - * kill the orphan item. 
+ * If we have an inode with links, there are a couple of + * possibilities. Old kernels (before v3.12) used to create an + * orphan item for truncate indicating that there were possibly + * extent items past i_size that needed to be deleted. In v3.12, + * truncate was changed to update i_size in sync with the extent + * items, but the (useless) orphan item was still created. Since + * v4.18, we don't create the orphan item for truncate at all. + * + * So, this item could mean that we need to do a truncate, but + * only if this filesystem was last used on a pre-v3.12 kernel + * and was not cleanly unmounted. The odds of that are quite + * slim, and it's a pain to do the truncate now, so just delete + * the orphan item. + * + * It's also possible that this orphan item was supposed to be + * deleted but wasn't. The inode number may have been reused, + * but either way, we can delete the orphan item. */ - if (ret == -ENOENT) { + if (ret == -ENOENT || inode->i_nlink) { + if (!ret) + iput(inode); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3607,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); - atomic_inc(&root->orphan_inodes); - - /* if we have links, this was a truncate, lets do that */ - if (inode->i_nlink) { - if (WARN_ON(!S_ISREG(inode->i_mode))) { - iput(inode); - continue; - } - nr_truncate++; - - /* 1 for the orphan item deletion. 
*/ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - iput(inode); - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - btrfs_end_transaction(trans); - if (ret) { - iput(inode); - goto out; - } - - ret = btrfs_truncate(inode, false); - if (ret) - btrfs_orphan_del(NULL, BTRFS_I(inode)); - } else { - nr_unlink++; - } + nr_unlink++; /* this will do delete_inode and everything for us */ iput(inode); @@ -3654,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (root->orphan_block_rsv) - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, - (u64)-1); - - if (root->orphan_block_rsv || - test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); @@ -3667,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (nr_unlink) btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); - if (nr_truncate) - btrfs_debug(fs_info, "truncated %d orphans", nr_truncate); out: if (ret) @@ -3931,7 +3772,7 @@ cache_acl: break; } - btrfs_update_iflags(inode); + btrfs_sync_inode_flags_to_i_flags(inode); return 0; make_bad: @@ -4245,7 +4086,7 @@ out: return ret; } -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, +static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, u64 objectid, const char *name, int name_len) @@ -4326,6 +4167,262 @@ out: return ret; } +/* + * Helper to check if the subvolume references other subvolumes or if it's + * default. 
+ */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 dir_id; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -EPERM; + btrfs_err(fs_info, + "deleting default subvolume %llu is not allowed", + key.objectid); + goto out; + } + btrfs_release_path(path); + } + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +/* Delete all dentries for inodes belonging to the root */ +static void btrfs_prune_dentries(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) + node = node->rb_left; + else if (objectid > 
btrfs_ino(BTRFS_I(&entry->vfs_inode))) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); +} + +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = d_inode(dentry); + struct btrfs_root *dest = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_block_rsv block_rsv; + u64 root_flags; + int ret; + int err; + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the inode lock so the error handling that has to drop the bit + * again is not run concurrently. 
+ */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu during send", + dest->root_key.objectid); + return -EPERM; + } + + down_write(&fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, + * two for dir entries, + * two for root ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_release; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + if (ret) { + err = ret; + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { + ret = btrfs_insert_orphan_item(trans, + fs_info->tree_root, + dest->root_key.objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + } + + ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = 
btrfs_uuid_tree_remove(trans, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } + } + +out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; + ret = btrfs_end_transaction(trans); + if (ret && !err) + err = ret; + inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(fs_info, &block_rsv); +out_up_write: + up_write(&fs_info->subvol_sem); + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + d_invalidate(dentry); + btrfs_prune_dentries(dest); + ASSERT(dest->send_in_progress == 0); + + /* the last ref */ + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; + } + } + + return err; +} + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -4337,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) - return -EPERM; + return btrfs_delete_subvolume(dir, dentry); trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4449,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_slot = 0; int extent_type = -1; int ret; - int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; bool be_nice = false; @@ -4501,22 +4597,19 @@ search_again: * up a huge file in a single leaf. 
Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > SZ_32M) { - if (btrfs_should_end_transaction(trans)) { - err = -EAGAIN; - goto error; - } + if (be_nice && bytes_deleted > SZ_32M && + btrfs_should_end_transaction(trans)) { + ret = -EAGAIN; + goto out; } - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) { + ret = 0; /* there are no items in the tree for us to truncate, we're * done */ @@ -4627,7 +4720,7 @@ search_again: * We have to bail so the last_size is set to * just before this extent. */ - err = NEED_TRUNCATE_BLOCK; + ret = NEED_TRUNCATE_BLOCK; break; } @@ -4666,7 +4759,10 @@ delete: extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) btrfs_async_run_delayed_refs(fs_info, trans->delayed_ref_updates * 2, @@ -4694,7 +4790,7 @@ delete: pending_del_nr); if (ret) { btrfs_abort_transaction(trans, ret); - goto error; + break; } pending_del_nr = 0; } @@ -4705,8 +4801,8 @@ delete: trans->delayed_ref_updates = 0; ret = btrfs_run_delayed_refs(trans, updates * 2); - if (ret && !err) - err = ret; + if (ret) + break; } } /* @@ -4714,8 +4810,8 @@ delete: * and let the transaction restart */ if (should_end) { - err = -EAGAIN; - goto error; + ret = -EAGAIN; + break; } goto search_again; } else { @@ -4723,32 +4819,37 @@ delete: } } out: - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, pending_del_slot, + if (ret >= 0 && pending_del_nr) { + int err; + + err = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); - if (ret) - btrfs_abort_transaction(trans, ret); + if (err) { + btrfs_abort_transaction(trans, err); + ret = err; + } } -error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ASSERT(last_size >= 
new_size); - if (!err && last_size > new_size) + if (!ret && last_size > new_size) last_size = new_size; btrfs_ordered_update_i_size(inode, last_size, NULL); } btrfs_free_path(path); - if (be_nice && bytes_deleted > SZ_32M) { + if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { unsigned long updates = trans->delayed_ref_updates; + int err; + if (updates) { trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, updates * 2); - if (ret && !err) - err = ret; + err = btrfs_run_delayed_refs(trans, updates * 2); + if (err) + ret = err; } } - return err; + return ret; } /* @@ -5090,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); - /* - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion. - */ - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * We need to do this in case we fail at _any_ point during the - * actual truncate. Once we do the truncate_setsize we could - * invalidate pages which forces any outstanding ordered io to - * be instantly completed which will give us extents that need - * to be truncated. If we fail to get an orphan inode down we - * could have left over extents that were never meant to live, - * so we need to guarantee from this point on that everything - * will be consistent. 
- */ - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - btrfs_end_transaction(trans); - if (ret) - return ret; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); /* Disable nonlocked read DIO to avoid the end less truncate */ @@ -5125,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (ret && inode->i_nlink) { int err; - /* To get a stable disk_i_size */ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); - if (err) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - return err; - } - /* - * failed to truncate, disk_i_size is only adjusted down - * as we remove extents, so it should represent the true - * size of the inode, so reset the in memory size and - * delete our orphan entry. + * Truncate failed, so fix up the in-memory size. We + * adjusted disk_i_size down as we removed extents, so + * wait for disk_i_size to be stable and then update the + * in-memory size to match. */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - return ret; - } - i_size_write(inode, BTRFS_I(inode)->disk_i_size); - err = btrfs_orphan_del(trans, BTRFS_I(inode)); + err = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (err) - btrfs_abort_transaction(trans, err); - btrfs_end_transaction(trans); + return err; + i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5277,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); } +static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 min_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int failures = 0; + + for (;;) { + struct btrfs_trans_handle *trans; + int ret; + + ret = btrfs_block_rsv_refill(root, rsv, min_size, + BTRFS_RESERVE_FLUSH_LIMIT); + + if (ret && ++failures > 2) { + btrfs_warn(fs_info, + "could not 
allocate space for a delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans) || !ret) + return trans; + + /* + * Try to steal from the global reserve if there is space for + * it. + */ + if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) + return trans; + + /* If not, commit and try again. */ + ret = btrfs_commit_transaction(trans); + if (ret) + return ERR_PTR(ret); + } +} + void btrfs_evict_inode(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv, *global_rsv; - int steal_from_global = 0; + struct btrfs_block_rsv *rsv; u64 min_size; int ret; @@ -5304,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode) btrfs_is_free_space_inode(BTRFS_I(inode)))) goto no_delete; - if (is_bad_inode(inode)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (is_bad_inode(inode)) goto no_delete; - } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ if (!special_file(inode->i_mode)) btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)); + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; - } if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && @@ -5327,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode) } ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); - if (ret) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (ret) goto no_delete; - } rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + if (!rsv) goto no_delete; - } rsv->size = min_size; rsv->failfast = 1; - global_rsv = &fs_info->global_block_rsv; btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * This is a bit simpler than btrfs_truncate since we've already - * reserved our space for our orphan item in the unlink, so we just - * need to reserve some slack space in case we add bytes and update - * inode item when doing the truncate. - */ while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size, - BTRFS_RESERVE_FLUSH_LIMIT); - - /* - * Try and steal from the global reserve since we will - * likely not use this space anyway, we want to try as - * hard as possible to get this to work. - */ - if (ret) - steal_from_global++; - else - steal_from_global = 0; - ret = 0; - - /* - * steal_from_global == 0: we reserved stuff, hooray! - * steal_from_global == 1: we didn't reserve stuff, boo! - * steal_from_global == 2: we've committed, still not a lot of - * room but maybe we'll have room in the global reserve this - * time. - * steal_from_global == 3: abandon all hope! 
- */ - if (steal_from_global > 2) { - btrfs_warn(fs_info, - "Could not get space for a delete, will truncate on mount %d", - ret); - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - - /* - * We can't just steal from the global reserve, we need to make - * sure there is room to do it, if not we need to commit and try - * again. - */ - if (steal_from_global) { - if (!btrfs_check_space_for_delayed_refs(trans, fs_info)) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, - min_size, 0); - else - ret = -ENOSPC; - } - - /* - * Couldn't steal from the global reserve, we have too much - * pending stuff built up, commit the transaction and try it - * again. - */ - if (ret) { - ret = btrfs_commit_transaction(trans); - if (ret) { - btrfs_orphan_del(NULL, BTRFS_I(inode)); - btrfs_free_block_rsv(fs_info, rsv); - goto no_delete; - } - continue; - } else { - steal_from_global = 0; - } + trans = evict_refill_and_join(root, rsv, min_size); + if (IS_ERR(trans)) + goto free_rsv; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC && ret != -EAGAIN) - break; - trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); - trans = NULL; btrfs_btree_balance_dirty(fs_info); + if (ret && ret != -ENOSPC && ret != -EAGAIN) + goto free_rsv; + else if (!ret) + break; } - btrfs_free_block_rsv(fs_info, rsv); - /* - * Errors here aren't a big deal, it just means we leave orphan items - * in the tree. They will be cleaned up on the next mount. + * Errors here aren't a big deal, it just means we leave orphan items in + * the tree. They will be cleaned up on the next mount. If the inode + * number gets reused, cleanup deletes the orphan item without doing + * anything, and unlink reuses the existing orphan item. 
+ * + * If it turns out that we are dropping too many of these, we might want + * to add a mechanism for retrying these after a commit. */ - if (ret == 0) { - trans->block_rsv = root->orphan_block_rsv; + trans = evict_refill_and_join(root, rsv, min_size); + if (!IS_ERR(trans)) { + trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); - } else { - btrfs_orphan_del(NULL, BTRFS_I(inode)); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); } - trans->block_rsv = &fs_info->trans_block_rsv; if (!(root == fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); +free_rsv: + btrfs_free_block_rsv(fs_info, rsv); no_delete: + /* + * If we didn't successfully delete, the orphan item will still be in + * the tree and we'll retry on the next mount. Again, we might also want + * to retry these periodically in the future. + */ btrfs_remove_delayed_node(BTRFS_I(inode)); clear_inode(inode); } @@ -5626,69 +5657,6 @@ static void inode_tree_del(struct inode *inode) } } -void btrfs_invalidate_inodes(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; - - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - WARN_ON(btrfs_root_refs(&root->root_item) != 0); - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode))) - node = node->rb_left; - else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode))) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) { - 
node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from - * the inode cache when its usage count - * hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); -} static int btrfs_init_locked_inode(struct inode *inode, void *p) { @@ -5777,7 +5745,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); return inode; } @@ -5850,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry) return 0; } -static void btrfs_dentry_release(struct dentry *dentry) -{ - kfree(dentry->d_fsdata); -} - static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -6131,7 +6094,7 @@ static int btrfs_dirty_inode(struct inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. 
*/ -static int btrfs_update_time(struct inode *inode, struct timespec *now, +static int btrfs_update_time(struct inode *inode, struct timespec64 *now, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -6270,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; } - btrfs_update_iflags(inode); + btrfs_sync_inode_flags_to_i_flags(inode); } static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, @@ -6386,7 +6349,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -6586,8 +6549,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } else { btrfs_update_inode(trans, root, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); } out_unlock: @@ -6663,8 +6625,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); @@ -6707,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode + * 1 item for orphan item deletion if O_TMPFILE */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, inode->i_nlink ? 
5 : 6); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; @@ -6809,12 +6771,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - d_instantiate(dentry, inode); - /* - * mkdir is special. We're unlocking after we call d_instantiate - * to avoid a race with nfsd calling d_instantiate. - */ - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); drop_on_err = 0; out_fail: @@ -7090,7 +7047,7 @@ insert: err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(em_tree, &em, start, len); + err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: @@ -7375,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, btrfs_file_extent_other_encoding(leaf, fi)) goto out; + /* + * Do the same check as in btrfs_cross_ref_exist but without the + * unnecessary search. + */ + if (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + backref_offset = btrfs_file_extent_offset(leaf, fi); if (orig_start) { @@ -7575,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } + +static int btrfs_get_blocks_direct_read(struct extent_map *em, + struct buffer_head *bh_result, + struct inode *inode, + u64 start, u64 len) +{ + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return -ENOENT; + + len = min(len, em->len - (start - em->start)); + + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + + return 0; +} + +static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct buffer_head *bh_result, + struct inode *inode, + struct btrfs_dio_data *dio_data, + u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_map *em = 
*map; + int ret = 0; + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + u64 block_start, orig_start, orig_block_len, ram_bytes; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + if (can_nocow_extent(inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(fs_info, block_start); + if (type == BTRFS_ORDERED_PREALLOC) { + free_extent_map(em); + *map = em = em2; + } + + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto out; + } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. 
+ */ + btrfs_free_reserved_data_space_noquota(inode, start, + len); + goto skip_cow; + } + } + + /* this will cow the extent */ + len = bh_result->b_size; + free_extent_map(em); + *map = em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + len = min(len, em->len - (start - em->start)); + +skip_cow: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + /* + * Need to update the i_size under the extent lock so buffered + * readers will get the updated i_size when we unlock. + */ + if (!dio_data->overwrite && start + len > i_size_read(inode)) + i_size_write(inode, start + len); + + WARN_ON(dio_data->reserve < len); + dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; + current->journal_info = dio_data; +out: + return ret; +} + static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7643,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - /* Just a good old fashioned hole, return */ - if (!create && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - goto unlock_err; - } - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. 
- * - */ - if (!create) { - len = min(len, em->len - (start - em->start)); - lockstart = start + len; - goto unlock; - } - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) { - struct extent_map *em2; - - em2 = btrfs_create_dio_extent(inode, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - em = em2; - } - if (em2 && IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto unlock_err; - } - /* - * For inode marked NODATACOW or extent marked PREALLOC, - * use the existing or preallocated extent, so does not - * need to adjust btrfs_space_info's bytes_may_use. 
- */ - btrfs_free_reserved_data_space_noquota(inode, - start, len); - goto unlock; - } - } - - /* - * this will cow the extent, reset the len in case we changed - * it above - */ - len = bh_result->b_size; - free_extent_map(em); - em = btrfs_new_extent_direct(inode, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - len = min(len, em->len - (start - em->start)); -unlock: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = em->bdev; - set_buffer_mapped(bh_result); if (create) { - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); + ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, + dio_data, start, len); + if (ret < 0) + goto unlock_err; + /* clear and unlock the entire range */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, 1, 0, &cached_state); + } else { + ret = btrfs_get_blocks_direct_read(em, bh_result, inode, + start, len); + /* Can be negative only if we read from a hole */ + if (ret < 0) { + ret = 0; + free_extent_map(em); + goto unlock_err; + } /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. + * We need to unlock only the end area that we aren't using. + * The rest is going to be unlocked by the endio routine. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) - i_size_write(inode, start + len); - - WARN_ON(dio_data->reserve < len); - dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; - } - - /* - * In the case of write we need to clear and unlock the entire range, - * in the case of read we need to unlock only the end area that we - * aren't using if there is any left over space. 
- */ - if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); - } else { - free_extent_state(cached_state); + lockstart = start + bh_result->b_size; + if (lockstart < lockend) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state); + } else { + free_extent_state(cached_state); + } } free_extent_map(em); @@ -8138,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode, u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { wq = fs_info->endio_freespace_worker; @@ -8148,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode, func = btrfs_endio_write_helper; } -again: - last_offset = ordered_offset; - ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate); - if (!ret) - goto out_test; - - btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered->work); -out_test: - /* - * If btrfs_dec_test_ordered_pending does not find any ordered extent - * in the range, we can exit. - */ - if (ordered_offset == last_offset) - return; - /* - * our bio might span multiple ordered extents. If we haven't - * completed the accounting for the whole dio, go back and try again - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - goto again; + while (ordered_offset < offset + bytes) { + last_offset = ordered_offset; + if (btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes, + uptodate)) { + btrfs_init_work(&ordered->work, func, + finish_ordered_fn, + NULL, NULL); + btrfs_queue_work(wq, &ordered->work); + } + /* + * If btrfs_dec_test_ordered_pending does not find any ordered + * extent in the range, we can exit. 
+ */ + if (ordered_offset == last_offset) + return; + /* + * Our bio might span multiple ordered extents. In this case + * we keep goin until we have accounted the whole dio. + */ + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; + ordered = NULL; + } } } @@ -8712,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree; - - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, wbc); + return extent_writepages(mapping, wbc); } static int btrfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct extent_io_tree *tree; - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages); + return extent_readpages(mapping, pages, nr_pages); } + static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(map, tree, page, gfp_flags); + int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); @@ -8875,12 +8867,12 @@ again: * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the - * page lock we can determine safely if the page is beyond EOF. If it is not + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. 
If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_fault *vmf) +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -8892,7 +8884,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) char *kaddr; unsigned long zero_start; loff_t size; - int ret; + vm_fault_t ret; + int ret2; int reserved = 0; u64 reserved_space; u64 page_start; @@ -8914,17 +8907,14 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); - if (!ret) { - ret = file_update_time(vmf->vma->vm_file); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; } - if (ret) { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else /* -ENOSPC, -EIO, etc */ - ret = VM_FAULT_SIGBUS; + if (ret2) { + ret = vmf_error(ret2); if (reserved) goto out; goto out_noreserve; @@ -8983,15 +8973,15 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, + ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); - if (ret) { + if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret = 0; + ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) @@ -9015,13 +9005,14 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); -out_unlock: - if (!ret) { + if (!ret2) { btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } + +out_unlock: 
unlock_page(page); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); @@ -9038,8 +9029,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret = 0; - int err = 0; + int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); @@ -9052,39 +9042,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } /* - * Yes ladies and gentlemen, this is indeed ugly. The fact is we have - * 3 things going on here + * Yes ladies and gentlemen, this is indeed ugly. We have a couple of + * things going on here: * - * 1) We need to reserve space for our orphan item and the space to - * delete our orphan item. Lord knows we don't want to have a dangling - * orphan item because we didn't reserve space to remove it. + * 1) We need to reserve space to update our inode. * - * 2) We need to reserve space to update our inode. - * - * 3) We need to have something to cache all the space that is going to + * 2) We need to have something to cache all the space that is going to * be free'd up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be separate. The fact is we can use a lot of + * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it - * doesn't end up using space reserved for updating the inode or - * removing the orphan item. 
We also need to be able to stop the - * transaction and start a new one, which means we need to be able to - * update the inode several times, and we have no idea of knowing how - * many times that will be, so we can't just reserve 1 item for the - * entirety of the operation, so that has to be done separately as well. - * Then there is the orphan item, which does indeed need to be held on - * to for the whole operation, and we need nobody to touch this reserved - * space except the orphan code. + * doesn't end up using space reserved for updating the inode. We also + * need to be able to stop the transaction and start a new one, which + * means we need to be able to update the inode several times, and we + * have no idea of knowing how many times that will be, so we can't just + * reserve 1 item for the entirety of the operation, so that has to be + * done separately as well. * * So that leaves us with * - * 1) root->orphan_block_rsv - for the orphan deletion. - * 2) rsv - for the truncate reservation, which we will steal from the + * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. - * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. 
*/ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); @@ -9099,7 +9081,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out; } @@ -9123,23 +9105,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) inode->i_size, BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; - if (ret != -ENOSPC && ret != -EAGAIN) { - err = ret; + if (ret != -ENOSPC && ret != -EAGAIN) break; - } ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; + if (ret) break; - } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { - ret = err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; break; } @@ -9172,29 +9150,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_ordered_update_i_size(inode, inode->i_size, NULL); } - if (ret == 0 && inode->i_nlink > 0) { - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) - err = ret; - } - if (trans) { + int ret2; + trans->block_rsv = &fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + ret2 = btrfs_update_inode(trans, root, inode); + if (ret2 && !ret) + ret = ret2; - ret = btrfs_end_transaction(trans); + ret2 = btrfs_end_transaction(trans); + if (ret2 && !ret) + ret = ret2; btrfs_btree_balance_dirty(fs_info); } out: btrfs_free_block_rsv(fs_info, rsv); - if (ret && !err) - err = ret; - - return err; + return ret; } /* @@ -9330,13 +9302,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags)) { - btrfs_info(fs_info, "inode %llu still on the orphan list", - btrfs_ino(BTRFS_I(inode))); - atomic_dec(&root->orphan_inodes); - } - while (1) { ordered = 
btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -9471,7 +9436,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec ctime = current_time(old_inode); + struct timespec64 ctime = current_time(old_inode); struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); @@ -9479,6 +9444,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_idx = 0; u64 root_objectid; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; @@ -9675,7 +9641,8 @@ out_fail: dest_log_pinned = false; } } - ret = btrfs_end_transaction(trans); + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9970,6 +9937,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } +struct btrfs_delalloc_work { + struct inode *inode; + struct completion completion; + struct list_head list; + struct btrfs_work work; +}; + static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; @@ -9983,15 +9957,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (delalloc_work->delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + iput(inode); complete(&delalloc_work->completion); } -struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int delay_iput) +static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) { struct btrfs_delalloc_work *work; @@ -10002,7 +9972,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 
init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, btrfs_run_delalloc_work, NULL, NULL); @@ -10010,18 +9979,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, return work; } -void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) -{ - wait_for_completion(&work->completion); - kfree(work); -} - /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, - int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -10049,12 +10011,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, delay_iput); + work = btrfs_alloc_delalloc_work(inode); if (!work) { - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); + iput(inode); ret = -ENOMEM; goto out; } @@ -10072,10 +10031,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + wait_for_completion(&work->completion); + kfree(work); } - if (!list_empty_careful(&splice)) { + if (!list_empty(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); @@ -10084,7 +10044,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +int btrfs_start_delalloc_inodes(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10092,14 +10052,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int 
delay_iput) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput, -1); + ret = start_delalloc_inodes(root, -1); if (ret > 0) ret = 0; return ret; } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, - int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_root *root; struct list_head splice; @@ -10122,7 +10081,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput, nr); + ret = start_delalloc_inodes(root, nr); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10137,7 +10096,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, ret = 0; out: - if (!list_empty_careful(&splice)) { + if (!list_empty(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); @@ -10257,8 +10216,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); @@ -10676,5 +10634,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = { const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, - .d_release = btrfs_dentry_release, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 632e26d6f7ce..b077544b5232 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -93,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. 
*/ -static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, + unsigned int flags) { - if (S_ISDIR(mode)) + if (S_ISDIR(inode->i_mode)) return flags; - else if (S_ISREG(mode)) + else if (S_ISREG(inode->i_mode)) return flags & ~FS_DIRSYNC_FL; else return flags & (FS_NODUMP_FL | FS_NOATIME_FL); } /* - * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS + * ioctl. */ -static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) { unsigned int iflags = 0; @@ -136,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_update_iflags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) { - struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (ip->flags & BTRFS_INODE_SYNC) + if (binode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (ip->flags & BTRFS_INODE_IMMUTABLE) + if (binode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (ip->flags & BTRFS_INODE_APPEND) + if (binode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (ip->flags & BTRFS_INODE_NOATIME) + if (binode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (ip->flags & BTRFS_INODE_DIRSYNC) + if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; set_mask_bits(&inode->i_flags, @@ -159,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode) static int btrfs_ioctl_getflags(struct file *file, void __user *arg) { - struct btrfs_inode *ip = BTRFS_I(file_inode(file)); - unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + struct btrfs_inode *binode = BTRFS_I(file_inode(file)); + unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); 
if (copy_to_user(arg, &flags, sizeof(flags))) return -EFAULT; return 0; } -static int check_flags(unsigned int flags) +/* Check if @flags are a supported and valid set of FS_*_FL flags */ +static int check_fsflags(unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ @@ -186,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *ip = BTRFS_I(inode); - struct btrfs_root *root = ip->root; + struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; - unsigned int flags, oldflags; + unsigned int fsflags, old_fsflags; int ret; - u64 ip_oldflags; - unsigned int i_oldflags; + u64 old_flags; + unsigned int old_i_flags; umode_t mode; if (!inode_owner_or_capable(inode)) @@ -201,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (btrfs_root_readonly(root)) return -EROFS; - if (copy_from_user(&flags, arg, sizeof(flags))) + if (copy_from_user(&fsflags, arg, sizeof(fsflags))) return -EFAULT; - ret = check_flags(flags); + ret = check_fsflags(fsflags); if (ret) return ret; @@ -214,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode_lock(inode); - ip_oldflags = ip->flags; - i_oldflags = inode->i_flags; + old_flags = binode->flags; + old_i_flags = inode->i_flags; mode = inode->i_mode; - flags = btrfs_mask_flags(inode->i_mode, flags); - oldflags = btrfs_flags_to_ioctl(ip->flags); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { ret = -EPERM; goto out_unlock; } } - if (flags & FS_SYNC_FL) - ip->flags |= BTRFS_INODE_SYNC; + if 
(fsflags & FS_SYNC_FL) + binode->flags |= BTRFS_INODE_SYNC; else - ip->flags &= ~BTRFS_INODE_SYNC; - if (flags & FS_IMMUTABLE_FL) - ip->flags |= BTRFS_INODE_IMMUTABLE; + binode->flags &= ~BTRFS_INODE_SYNC; + if (fsflags & FS_IMMUTABLE_FL) + binode->flags |= BTRFS_INODE_IMMUTABLE; else - ip->flags &= ~BTRFS_INODE_IMMUTABLE; - if (flags & FS_APPEND_FL) - ip->flags |= BTRFS_INODE_APPEND; + binode->flags &= ~BTRFS_INODE_IMMUTABLE; + if (fsflags & FS_APPEND_FL) + binode->flags |= BTRFS_INODE_APPEND; else - ip->flags &= ~BTRFS_INODE_APPEND; - if (flags & FS_NODUMP_FL) - ip->flags |= BTRFS_INODE_NODUMP; + binode->flags &= ~BTRFS_INODE_APPEND; + if (fsflags & FS_NODUMP_FL) + binode->flags |= BTRFS_INODE_NODUMP; else - ip->flags &= ~BTRFS_INODE_NODUMP; - if (flags & FS_NOATIME_FL) - ip->flags |= BTRFS_INODE_NOATIME; + binode->flags &= ~BTRFS_INODE_NODUMP; + if (fsflags & FS_NOATIME_FL) + binode->flags |= BTRFS_INODE_NOATIME; else - ip->flags &= ~BTRFS_INODE_NOATIME; - if (flags & FS_DIRSYNC_FL) - ip->flags |= BTRFS_INODE_DIRSYNC; + binode->flags &= ~BTRFS_INODE_NOATIME; + if (fsflags & FS_DIRSYNC_FL) + binode->flags |= BTRFS_INODE_DIRSYNC; else - ip->flags &= ~BTRFS_INODE_DIRSYNC; - if (flags & FS_NOCOW_FL) { + binode->flags &= ~BTRFS_INODE_DIRSYNC; + if (fsflags & FS_NOCOW_FL) { if (S_ISREG(mode)) { /* * It's safe to turn csums off here, no extents exist. @@ -259,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * status of the file and will not set it. 
*/ if (inode->i_size == 0) - ip->flags |= BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM; + binode->flags |= BTRFS_INODE_NODATACOW + | BTRFS_INODE_NODATASUM; } else { - ip->flags |= BTRFS_INODE_NODATACOW; + binode->flags |= BTRFS_INODE_NODATACOW; } } else { /* @@ -270,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) */ if (S_ISREG(mode)) { if (inode->i_size == 0) - ip->flags &= ~(BTRFS_INODE_NODATACOW + binode->flags &= ~(BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM); } else { - ip->flags &= ~BTRFS_INODE_NODATACOW; + binode->flags &= ~BTRFS_INODE_NODATACOW; } } @@ -282,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * flag may be changed automatically if compression code won't make * things smaller. */ - if (flags & FS_NOCOMP_FL) { - ip->flags &= ~BTRFS_INODE_COMPRESS; - ip->flags |= BTRFS_INODE_NOCOMPRESS; + if (fsflags & FS_NOCOMP_FL) { + binode->flags &= ~BTRFS_INODE_COMPRESS; + binode->flags |= BTRFS_INODE_NOCOMPRESS; ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) goto out_drop; - } else if (flags & FS_COMPR_FL) { + } else if (fsflags & FS_COMPR_FL) { const char *comp; - ip->flags |= BTRFS_INODE_COMPRESS; - ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + binode->flags |= BTRFS_INODE_COMPRESS; + binode->flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) @@ -308,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) goto out_drop; - ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } trans = btrfs_start_transaction(root, 1); @@ -317,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) goto out_drop; } - btrfs_update_iflags(inode); + 
btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); @@ -325,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_end_transaction(trans); out_drop: if (ret) { - ip->flags = ip_oldflags; - inode->i_flags = i_oldflags; + binode->flags = old_flags; + inode->i_flags = old_i_flags; } out_unlock: @@ -335,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) return ret; } +/* + * Translate btrfs internal inode flags to xflags as expected by the + * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are + * silently dropped. + */ +static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) +{ + unsigned int xflags = 0; + + if (flags & BTRFS_INODE_APPEND) + xflags |= FS_XFLAG_APPEND; + if (flags & BTRFS_INODE_IMMUTABLE) + xflags |= FS_XFLAG_IMMUTABLE; + if (flags & BTRFS_INODE_NOATIME) + xflags |= FS_XFLAG_NOATIME; + if (flags & BTRFS_INODE_NODUMP) + xflags |= FS_XFLAG_NODUMP; + if (flags & BTRFS_INODE_SYNC) + xflags |= FS_XFLAG_SYNC; + + return xflags; +} + +/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ +static int check_xflags(unsigned int flags) +{ + if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | + FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) + return -EOPNOTSUPP; + return 0; +} + +/* + * Set the xflags from the internal inode flags. The remaining items of fsxattr + * are zeroed. 
+ */ +static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) +{ + struct btrfs_inode *binode = BTRFS_I(file_inode(file)); + struct fsxattr fa; + + memset(&fa, 0, sizeof(fa)); + fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; +} + +static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_root *root = binode->root; + struct btrfs_trans_handle *trans; + struct fsxattr fa; + unsigned old_flags; + unsigned old_i_flags; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (btrfs_root_readonly(root)) + return -EROFS; + + memset(&fa, 0, sizeof(fa)); + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + ret = check_xflags(fa.fsx_xflags); + if (ret) + return ret; + + if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(inode); + + old_flags = binode->flags; + old_i_flags = inode->i_flags; + + /* We need the capabilities to change append-only or immutable inode */ + if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) || + (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) && + !capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + + if (fa.fsx_xflags & FS_XFLAG_SYNC) + binode->flags |= BTRFS_INODE_SYNC; + else + binode->flags &= ~BTRFS_INODE_SYNC; + if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) + binode->flags |= BTRFS_INODE_IMMUTABLE; + else + binode->flags &= ~BTRFS_INODE_IMMUTABLE; + if (fa.fsx_xflags & FS_XFLAG_APPEND) + binode->flags |= BTRFS_INODE_APPEND; + else + binode->flags &= ~BTRFS_INODE_APPEND; + if (fa.fsx_xflags & FS_XFLAG_NODUMP) + binode->flags |= BTRFS_INODE_NODUMP; + else + binode->flags &= ~BTRFS_INODE_NODUMP; + if (fa.fsx_xflags & FS_XFLAG_NOATIME) 
+ binode->flags |= BTRFS_INODE_NOATIME; + else + binode->flags &= ~BTRFS_INODE_NOATIME; + + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + btrfs_sync_inode_flags_to_i_flags(inode); + inode_inc_iversion(inode); + inode->i_ctime = current_time(inode); + ret = btrfs_update_inode(trans, root, inode); + + btrfs_end_transaction(trans); + +out_unlock: + if (ret) { + binode->flags = old_flags; + inode->i_flags = old_i_flags; + } + + inode_unlock(inode); + mnt_drop_write_file(file); + + return ret; +} + static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -417,14 +562,13 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; - struct timespec cur_time = current_time(dir); + struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - u64 qgroup_reserved; uuid_le new_uuid; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -449,8 +593,7 @@ static noinline int create_subvol(struct inode *dir, * The same as the snapshot creation, please see the comment * of create_snapshot(). 
*/ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 8, &qgroup_reserved, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); if (ret) goto fail_free; @@ -573,7 +716,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_ino(BTRFS_I(dir)), index, name, namelen); BUG_ON(ret); - ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid, + ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, ret); @@ -640,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_inodes(root); if (ret) goto dec_and_free; @@ -658,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, 8, - &pending_snapshot->qgroup_reserved, false); if (ret) goto dec_and_free; @@ -1457,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1565,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; @@ -1832,60 +1972,6 @@ out: return ret; } -/* - * helper to check if the subvolume references other subvolumes - */ -static noinline int may_destroy_subvol(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct btrfs_key key; - u64 dir_id; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* Make sure this 
root isn't set as the default subvol */ - dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { - ret = -EPERM; - btrfs_err(fs_info, - "deleting default subvolume %llu is not allowed", - key.objectid); - goto out; - } - btrfs_release_path(path); - } - - key.objectid = root->root_key.objectid; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - ret = 0; - if (path->slots[0] > 0) { - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) - ret = -ENOTEMPTY; - } -out: - btrfs_free_path(path); - return ret; -} - static noinline int key_in_sk(struct btrfs_key *key, struct btrfs_ioctl_search_key *sk) { @@ -2066,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode, root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { btrfs_free_path(path); - return -ENOENT; + return PTR_ERR(root); } } @@ -2200,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - btrfs_err(info, "could not find root %llu", tree_id); - ret = -ENOENT; + ret = PTR_ERR(root); goto out; } @@ -2256,6 +2341,169 @@ out: return ret; } +static int btrfs_search_path_in_tree_user(struct inode *inode, + struct btrfs_ioctl_ino_lookup_user_args *args) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct super_block *sb = inode->i_sb; + struct btrfs_key upper_limit = BTRFS_I(inode)->location; + u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 dirid = args->dirid; + unsigned long item_off; + 
unsigned long item_len; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key, key2; + struct extent_buffer *leaf; + struct inode *temp_inode; + char *ptr; + int slot; + int len; + int total_len = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * If the bottom subvolume does not exist directly under upper_limit, + * construct the path in from the bottom up. + */ + if (dirid != upper_limit.objectid) { + ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; + + key.objectid = treeid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(leaf, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < args->path) { + ret = -ENAMETOOLONG; + goto out; + } + + *(ptr + len) = '/'; + read_extent_buffer(leaf, ptr, + (unsigned long)(iref + 1), len); + + /* Check the read+exec permission of this directory */ + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_ITEM_KEY); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key2, slot); + if (key2.objectid != dirid) { + ret = -ENOENT; + goto out; + } + + temp_inode = btrfs_iget(sb, &key2, root, NULL); + if 
(IS_ERR(temp_inode)) { + ret = PTR_ERR(temp_inode); + goto out; + } + ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); + iput(temp_inode); + if (ret) { + ret = -EACCES; + goto out; + } + + if (key.offset == upper_limit.objectid) + break; + if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { + ret = -EACCES; + goto out; + } + + btrfs_release_path(path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + } + + memmove(args->path, ptr, total_len); + args->path[total_len] = '\0'; + btrfs_release_path(path); + } + + /* Get the bottom subvolume's name from ROOT_REF */ + root = fs_info->tree_root; + key.objectid = treeid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = args->treeid; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + item_off = btrfs_item_ptr_offset(leaf, slot); + item_len = btrfs_item_size_nr(leaf, slot); + /* Check if dirid in ROOT_REF corresponds to passed dirid */ + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { + ret = -EINVAL; + goto out; + } + + /* Copy subvolume's name */ + item_off += sizeof(struct btrfs_root_ref); + item_len -= sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, args->name, item_off, item_len); + args->name[item_len] = 0; + +out: + btrfs_free_path(path); + return ret; +} + static noinline int btrfs_ioctl_ino_lookup(struct file *file, void __user *argp) { @@ -2298,6 +2546,265 @@ out: return ret; } +/* + * Version of ino_lookup ioctl (unprivileged) + * + * The main differences from ino_lookup ioctl are: + * + * 1. Read + Exec permission will be checked using inode_permission() during + * path construction. -EACCES will be returned in case of failure. + * 2. 
Path construction will be stopped at the inode number which corresponds + * to the fd with which this ioctl is called. If constructed path does not + * exist under fd's inode, -EACCES will be returned. + * 3. The name of bottom subvolume is also searched and filled. + */ +static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_user_args *args; + struct inode *inode; + int ret; + + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); + + inode = file_inode(file); + + if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && + BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { + /* + * The subvolume does not exist under fd with which this is + * called + */ + kfree(args); + return -EACCES; + } + + ret = btrfs_search_path_in_tree_user(inode, args); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + +/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ +static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_get_subvol_info_args *subvol_info; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root_item *root_item; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long item_off; + unsigned long item_len; + struct inode *inode; + int slot; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); + if (!subvol_info) { + btrfs_free_path(path); + return -ENOMEM; + } + + inode = file_inode(file); + fs_info = BTRFS_I(inode)->root->fs_info; + + /* Get root_item of inode's subvolume */ + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + ret = 
PTR_ERR(root); + goto out; + } + root_item = &root->root_item; + + subvol_info->treeid = key.objectid; + + subvol_info->generation = btrfs_root_generation(root_item); + subvol_info->flags = btrfs_root_flags(root_item); + + memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); + memcpy(subvol_info->parent_uuid, root_item->parent_uuid, + BTRFS_UUID_SIZE); + memcpy(subvol_info->received_uuid, root_item->received_uuid, + BTRFS_UUID_SIZE); + + subvol_info->ctransid = btrfs_root_ctransid(root_item); + subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); + subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); + + subvol_info->otransid = btrfs_root_otransid(root_item); + subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); + subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); + + subvol_info->stransid = btrfs_root_stransid(root_item); + subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); + subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); + + subvol_info->rtransid = btrfs_root_rtransid(root_item); + subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); + subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); + + if (key.objectid != BTRFS_FS_TREE_OBJECTID) { + /* Search root tree for ROOT_BACKREF of this subvolume */ + root = fs_info->tree_root; + + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == subvol_info->treeid && + key.type == BTRFS_ROOT_BACKREF_KEY) { + subvol_info->parent_id = key.offset; + + rref = 
btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); + + item_off = btrfs_item_ptr_offset(leaf, slot) + + sizeof(struct btrfs_root_ref); + item_len = btrfs_item_size_nr(leaf, slot) + - sizeof(struct btrfs_root_ref); + read_extent_buffer(leaf, subvol_info->name, + item_off, item_len); + } else { + ret = -ENOENT; + goto out; + } + } + + if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kzfree(subvol_info); + return ret; +} + +/* + * Return ROOT_REF information of the subvolume containing this inode + * except the subvolume name. + */ +static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; + struct btrfs_root_ref *rref; + struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + struct inode *inode; + u64 objectid; + int slot; + int ret; + u8 found; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rootrefs = memdup_user(argp, sizeof(*rootrefs)); + if (IS_ERR(rootrefs)) { + btrfs_free_path(path); + return PTR_ERR(rootrefs); + } + + inode = file_inode(file); + root = BTRFS_I(inode)->root->fs_info->tree_root; + objectid = BTRFS_I(inode)->root->root_key.objectid; + + key.objectid = objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = rootrefs->min_treeid; + found = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { + ret = 0; + goto out; + } + + if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { + ret 
= -EOVERFLOW; + goto out; + } + + rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); + rootrefs->rootref[found].treeid = key.offset; + rootrefs->rootref[found].dirid = + btrfs_root_ref_dirid(leaf, rref); + found++; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + +out: + if (!ret || ret == -EOVERFLOW) { + rootrefs->num_items = found; + /* update min_treeid for next search */ + if (found) + rootrefs->min_treeid = + rootrefs->rootref[found - 1].treeid + 1; + if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) + ret = -EFAULT; + } + + kfree(rootrefs); + btrfs_free_path(path); + + return ret; +} + static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { @@ -2309,12 +2816,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; - struct btrfs_block_rsv block_rsv; - u64 root_flags; - u64 qgroup_reserved; int namelen; - int ret; int err = 0; if (!S_ISDIR(dir->i_mode)) @@ -2398,133 +2900,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } inode_lock(inode); - - /* - * Don't allow to delete a subvolume with send in progress. This is - * inside the i_mutex so the error handling that has to drop the bit - * again is not run concurrently. 
- */ - spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { - spin_unlock(&dest->root_item_lock); - btrfs_warn(fs_info, - "Attempt to delete subvolume %llu during send", - dest->root_key.objectid); - err = -EPERM; - goto out_unlock_inode; - } - - down_write(&fs_info->subvol_sem); - - err = may_destroy_subvol(dest); - if (err) - goto out_up_write; - - btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * One for dir inode, two for dir entries, two for root - * ref/backref. - */ - err = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 5, &qgroup_reserved, true); - if (err) - goto out_up_write; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_release; - } - trans->block_rsv = &block_rsv; - trans->bytes_reserved = block_rsv.size; - - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); - - ret = btrfs_unlink_subvol(trans, root, dir, - dest->root_key.objectid, - dentry->d_name.name, - dentry->d_name.len); - if (ret) { - err = ret; - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } - - btrfs_record_root_in_trans(trans, dest); - - memset(&dest->root_item.drop_progress, 0, - sizeof(dest->root_item.drop_progress)); - dest->root_item.drop_level = 0; - btrfs_set_root_refs(&dest->root_item, 0); - - if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { - ret = btrfs_insert_orphan_item(trans, - fs_info->tree_root, - dest->root_key.objectid); - if (ret) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - } - - ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); - if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - if 
(!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { - ret = btrfs_uuid_tree_rem(trans, fs_info, - dest->root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); - if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } - } - -out_end_trans: - trans->block_rsv = NULL; - trans->bytes_reserved = 0; - ret = btrfs_end_transaction(trans); - if (ret && !err) - err = ret; - inode->i_flags |= S_DEAD; -out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); - if (err) { - spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - btrfs_set_root_flags(&dest->root_item, - root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } -out_unlock_inode: + err = btrfs_delete_subvolume(dir, dentry); inode_unlock(inode); - if (!err) { - d_invalidate(dentry); - btrfs_invalidate_inodes(dest); + if (!err) d_delete(dentry); - ASSERT(dest->send_in_progress == 0); - /* the last ref */ - if (dest->ino_cache_inode) { - iput(dest->ino_cache_inode); - dest->ino_cache_inode = NULL; - } - } out_dput: dput(dentry); out_unlock_dir: @@ -2613,7 +2993,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2628,7 +3007,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2654,8 +3032,10 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) - return 
-EOPNOTSUPP; + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { + ret = -EOPNOTSUPP; + goto out; + } if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2947,15 +3327,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) if (pg) { unlock_page(pg); put_page(pg); + cmp->src_pages[i] = NULL; } pg = cmp->dst_pages[i]; if (pg) { unlock_page(pg); put_page(pg); + cmp->dst_pages[i] = NULL; } } - kfree(cmp->src_pages); - kfree(cmp->dst_pages); } static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, @@ -2964,40 +3344,14 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, { int ret; int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - struct page **src_pgarr, **dst_pgarr; - /* - * We must gather up all the pages before we initiate our - * extent locking. We use an array for the page pointers. Size - * of the array is bounded by len, which is in turn bounded by - * BTRFS_MAX_DEDUPE_LEN. - */ - src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); - dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL); - if (!src_pgarr || !dst_pgarr) { - kfree(src_pgarr); - kfree(dst_pgarr); - return -ENOMEM; - } cmp->num_pages = num_pages; - cmp->src_pages = src_pgarr; - cmp->dst_pages = dst_pgarr; - - /* - * If deduping ranges in the same inode, locking rules make it mandatory - * to always lock pages in ascending order to avoid deadlocks with - * concurrent tasks (such as starting writeback/delalloc). 
- */ - if (src == dst && dst_loff < loff) { - swap(src_pgarr, dst_pgarr); - swap(loff, dst_loff); - } - ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff); + ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); if (ret) goto out; - ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff); + ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); out: if (ret) @@ -3067,31 +3421,23 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff, + struct cmp_pages *cmp) { int ret; u64 len = olen; - struct cmp_pages cmp; bool same_inode = (src == dst); u64 same_lock_start = 0; u64 same_lock_len = 0; - if (len == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) - goto out_unlock; + return ret; ret = extent_same_check_offsets(dst, dst_loff, &len, olen); if (ret) - goto out_unlock; + return ret; if (same_inode) { /* @@ -3108,32 +3454,21 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, * allow an unaligned length so long as it ends at * i_size. 
*/ - if (len != olen) { - ret = -EINVAL; - goto out_unlock; - } + if (len != olen) + return -EINVAL; /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) { - ret = -EINVAL; - goto out_unlock; - } + if (dst_loff + len > loff && dst_loff < loff + len) + return -EINVAL; same_lock_start = min_t(u64, loff, dst_loff); same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; } - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } - again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); if (ret) - goto out_unlock; + return ret; if (same_inode) ret = lock_extent_range(src, same_lock_start, same_lock_len, @@ -3154,7 +3489,7 @@ again: * Ranges in the io trees already unlocked. Now unlock all * pages before waiting for all IO to complete. 
*/ - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); if (same_inode) { btrfs_wait_ordered_range(src, same_lock_start, same_lock_len); @@ -3167,12 +3502,12 @@ again: ASSERT(ret == 0); if (WARN_ON(ret)) { /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); return ret; } /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, &cmp); + ret = btrfs_cmp_data(olen, cmp); if (ret == 0) ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); @@ -3182,7 +3517,82 @@ again: else btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); - btrfs_cmp_data_free(&cmp); + btrfs_cmp_data_free(cmp); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN SZ_16M + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, + struct inode *dst, u64 dst_loff) +{ + int ret; + struct cmp_pages cmp; + int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; + bool same_inode = (src == dst); + u64 i, tail_len, chunk_count; + + if (olen == 0) + return 0; + + if (same_inode) + inode_lock(src); + else + btrfs_double_inode_lock(src, dst); + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + tail_len = olen % BTRFS_MAX_DEDUPE_LEN; + chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); + if (chunk_count == 0) + num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; + + /* + * If deduping ranges in the same inode, locking rules make it + * mandatory to always lock pages in ascending order to avoid deadlocks + * with concurrent tasks (such as starting writeback/delalloc). + */ + if (same_inode && dst_loff < loff) + swap(loff, dst_loff); + + /* + * We must gather up all the pages before we initiate our extent + * locking. We use an array for the page pointers. Size of the array is + * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. 
+ */ + cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); + cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_ZERO); + if (!cmp.src_pages || !cmp.dst_pages) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < chunk_count; i++) { + ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, + dst, dst_loff, &cmp); + if (ret) + goto out_free; + + loff += BTRFS_MAX_DEDUPE_LEN; + dst_loff += BTRFS_MAX_DEDUPE_LEN; + } + + if (tail_len > 0) + ret = btrfs_extent_same_range(src, loff, tail_len, dst, + dst_loff, &cmp); + +out_free: + kvfree(cmp.src_pages); + kvfree(cmp.dst_pages); + out_unlock: if (same_inode) inode_unlock(src); @@ -3192,8 +3602,6 @@ out_unlock: return ret; } -#define BTRFS_MAX_DEDUPE_LEN SZ_16M - ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff) { @@ -3202,9 +3610,6 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; ssize_t res; - if (olen > BTRFS_MAX_DEDUPE_LEN) - olen = BTRFS_MAX_DEDUPE_LEN; - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* * Btrfs does not support blocksize < page_size. 
As a @@ -3826,11 +4231,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, src->i_sb != inode->i_sb) return -EXDEV; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) return -EISDIR; @@ -3840,6 +4240,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, inode_lock(src); } + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + /* determine range to clone */ ret = -EINVAL; if (off + len > src->i_size || off + len < off) @@ -4007,8 +4414,8 @@ out: return ret; } -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space) +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) { struct btrfs_block_group_cache *block_group; @@ -4124,8 +4531,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { - btrfs_get_block_group_info( - &info->block_groups[c], &space); + get_block_group_info(&info->block_groups[c], + &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; @@ -4490,14 +4897,14 @@ out_loi: return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; bargs->flags = bctl->flags; - if (atomic_read(&fs_info->balance_running)) + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if 
(atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; @@ -4508,13 +4915,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); - if (lock) { - spin_lock(&fs_info->balance_lock); - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - spin_unlock(&fs_info->balance_lock); - } else { - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - } + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); } static long btrfs_ioctl_balance(struct file *file, void __user *arg) @@ -4535,7 +4938,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) again: if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -4550,21 +4952,22 @@ again: mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { /* this is either (2) or (3) */ - if (!atomic_read(&fs_info->balance_running)) { + if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); - if (!mutex_trylock(&fs_info->volume_mutex)) - goto again; + /* + * Lock released to allow other waiters to continue, + * we'll reexamine the status again. 
+ */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && - !atomic_read(&fs_info->balance_running)) { + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* this is (3) */ need_unlock = false; goto locked; } mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); goto again; } else { /* this is (2) */ @@ -4617,7 +5020,6 @@ locked: goto out_bargs; } - bctl->fs_info = fs_info; if (arg) { memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); @@ -4636,14 +5038,14 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP - * goes to to btrfs_balance. bctl is freed in __cancel_balance, - * or, if restriper was paused all the way until unmount, in - * free_fs_info. The flag is cleared in __cancel_balance. + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to + * btrfs_balance. bctl is freed in reset_balance_state, or, if + * restriper was paused all the way until unmount, in free_fs_info. + * The flag should be cleared after reset_balance_state. 
*/ need_unlock = false; - ret = btrfs_balance(bctl, bargs); + ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; if (arg) { @@ -4657,7 +5059,6 @@ out_bargs: kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); if (need_unlock) clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: @@ -4701,7 +5102,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, goto out; } - update_ioctl_balance_args(fs_info, 1, bargs); + btrfs_update_ioctl_balance_args(fs_info, bargs); if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -4996,7 +5397,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; - struct timespec ct = current_time(inode); + struct timespec64 ct = current_time(inode); int ret = 0; int received_uuid_changed; @@ -5038,8 +5439,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_SIZE); if (received_uuid_changed && !btrfs_is_empty_uuid(root_item->received_uuid)) { - ret = btrfs_uuid_tree_rem(trans, fs_info, - root_item->received_uuid, + ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret && ret != -ENOENT) { @@ -5063,7 +5463,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, goto out; } if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid, + ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { @@ -5497,7 +5897,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); @@ -5565,6 +5965,16 @@ long 
btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_features(file, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); + case FS_IOC_FSGETXATTR: + return btrfs_ioctl_fsgetxattr(file, argp); + case FS_IOC_FSSETXATTR: + return btrfs_ioctl_fssetxattr(file, argp); + case BTRFS_IOC_GET_SUBVOL_INFO: + return btrfs_ioctl_get_subvol_info(file, argp); + case BTRFS_IOC_GET_SUBVOL_ROOTREF: + return btrfs_ioctl_get_subvol_rootref(file, argp); + case BTRFS_IOC_INO_LOOKUP_USER: + return btrfs_ioctl_ino_lookup_user(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index e4faefac9d16..1da768e5ef75 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -66,22 +66,16 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if (atomic_dec_and_test(&eb->blocking_writers) && - waitqueue_active(&eb->write_lock_wq)) - wake_up(&eb->write_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_writers)) + cond_wake_up_nomb(&eb->write_lock_wq); } else if (rw == BTRFS_READ_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if (atomic_dec_and_test(&eb->blocking_readers) && - waitqueue_active(&eb->read_lock_wq)) - wake_up(&eb->read_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); } } @@ -221,12 +215,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); - /* - * atomic_dec_and_test implies a barrier for waitqueue_active - */ - if 
(atomic_dec_and_test(&eb->blocking_readers) && - waitqueue_active(&eb->read_lock_wq)) - wake_up(&eb->read_lock_wq); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); atomic_dec(&eb->read_locks); } @@ -275,12 +266,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); - /* - * Make sure counter is updated before we wake up waiters. - */ + /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); - if (waitqueue_active(&eb->write_lock_wq)) - wake_up(&eb->write_lock_wq); + cond_wake_up_nomb(&eb->write_lock_wq); } else { WARN_ON(atomic_read(&eb->spinning_writers) != 1); atomic_dec(&eb->spinning_writers); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0667ea07f766..b6a4cc178bee 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -17,6 +17,43 @@ #define LZO_LEN 4 +/* + * Btrfs LZO compression format + * + * Regular and inlined LZO compressed data extents consist of: + * + * 1. Header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size (including the header) of compressed data. + * + * 2. Segment(s) + * Variable size. Each segment includes one segment header, followed by data + * payload. + * One regular LZO compressed extent can have one or more segments. + * For inlined LZO compressed extent, only one segment is allowed. + * One segment represents at most one page of uncompressed data. + * + * 2.1 Segment header + * Fixed size. LZO_LEN (4) bytes long, LE32. + * Records the total size of the segment (not including the header). + * Segment header never crosses page boundary, thus it's possible to + * have at most 3 padding zeros at the end of the page. + * + * 2.2 Data Payload + * Variable size. Upper size limit should be lzo1x_worst_compress(PAGE_SIZE) + * which is 4419 for a 4KiB page. 
+ * + * Example: + * Page 1: + * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 + * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | + * ... + * 0x0ff0 | SegHdr N | Data payload N ... |00| + * ^^ padding zeros + * Page 2: + * 0x1000 | SegHdr N+1| Data payload N+1 ... | + */ + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -258,6 +295,7 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long working_bytes; size_t in_len; size_t out_len; + const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); unsigned long in_offset; unsigned long in_page_bytes_left; unsigned long tot_in; @@ -271,10 +309,22 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); + /* + * Compressed data header check. + * + * The real compressed size can't exceed the maximum extent length, and + * all pages should be used (whole unused page with just the segment + * header is not possible). If this happens it means the compressed + * extent is corrupted. + */ + if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) || + tot_len < srclen - PAGE_SIZE) { + ret = -EUCLEAN; + goto done; + } tot_in = LZO_LEN; in_offset = LZO_LEN; - tot_len = min_t(size_t, srclen, tot_len); in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; @@ -285,6 +335,17 @@ static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) in_offset += LZO_LEN; tot_in += LZO_LEN; + /* + * Segment header check. + * + * The segment length must not exceed the maximum LZO + * compression size, nor the total compressed size. 
+ */ + if (in_len > max_segment_len || tot_in + in_len > tot_len) { + ret = -EUCLEAN; + goto done; + } + tot_in += in_len; working_bytes = in_len; may_late_unmap = need_unmap = false; @@ -335,7 +396,7 @@ cont: } } - out_len = lzo1x_worst_compress(PAGE_SIZE); + out_len = max_segment_len; ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); if (need_unmap) @@ -369,15 +430,24 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; + size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); int ret = 0; char *kaddr; unsigned long bytes; - BUG_ON(srclen < LZO_LEN); + if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) + return -EUCLEAN; + in_len = read_compress_length(data_in); + if (in_len != srclen) + return -EUCLEAN; data_in += LZO_LEN; in_len = read_compress_length(data_in); + if (in_len != srclen - LZO_LEN * 2) { + ret = -EUCLEAN; + goto out; + } data_in += LZO_LEN; out_len = PAGE_SIZE; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6db8bb2f2c28..2e1a1694a33d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -343,11 +343,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* - * Implicit memory barrier after test_and_set_bit - */ - if (waitqueue_active(&entry->wait)) - wake_up(&entry->wait); + /* test_and_set_bit implies a barrier */ + cond_wake_up_nomb(&entry->wait); } else { ret = 1; } @@ -410,11 +407,8 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* - * Implicit memory barrier after test_and_set_bit - */ - if (waitqueue_active(&entry->wait)) - wake_up(&entry->wait); + /* test_and_set_bit implies a barrier */ + cond_wake_up_nomb(&entry->wait); } else { ret = 1; } diff --git a/fs/btrfs/print-tree.c 
b/fs/btrfs/print-tree.c index 21a831d3d087..a4e11cf04671 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -166,6 +166,25 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, } } +/* + * Helper to output refs and locking status of extent buffer. Useful to debug + * race condition related problems. + */ +static void print_eb_refs_lock(struct extent_buffer *eb) +{ +#ifdef CONFIG_BTRFS_DEBUG + btrfs_info(eb->fs_info, +"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", + atomic_read(&eb->refs), atomic_read(&eb->write_locks), + atomic_read(&eb->read_locks), + atomic_read(&eb->blocking_writers), + atomic_read(&eb->blocking_readers), + atomic_read(&eb->spinning_writers), + atomic_read(&eb->spinning_readers), + eb->lock_owner, current->pid); +#endif +} + void btrfs_print_leaf(struct extent_buffer *l) { struct btrfs_fs_info *fs_info; @@ -193,6 +212,7 @@ void btrfs_print_leaf(struct extent_buffer *l) "leaf %llu gen %llu total ptrs %d free space %d owner %llu", btrfs_header_bytenr(l), btrfs_header_generation(l), nr, btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); + print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -347,6 +367,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_bytenr(c), level, btrfs_header_generation(c), nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, btrfs_header_owner(c)); + print_eb_refs_lock(c); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9fb758d5077a..c25dc47210a3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1882,8 +1882,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, 
qg->qgroupid, - cur_old_count, cur_new_count); + trace_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -2014,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, - nr_old_roots, nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, + num_bytes, nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -2580,6 +2580,21 @@ out: } /* + * Check if the leaf is the last leaf. Which means all node pointers + * are at their last position. + */ +static bool is_last_leaf(struct btrfs_path *path) +{ + int i; + + for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) + return false; + } + return true; +} + +/* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done. 
*/ @@ -2590,8 +2605,8 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; + bool done; int slot; int ret; @@ -2620,12 +2635,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); return ret; } + done = is_last_leaf(path); btrfs_item_key_to_cpu(path->nodes[0], &found, btrfs_header_nritems(path->nodes[0]) - 1); fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); if (!scratch_leaf) { ret = -ENOMEM; @@ -2664,8 +2679,11 @@ out: btrfs_tree_read_unlock_blocking(scratch_leaf); free_extent_buffer(scratch_leaf); } - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + if (done && !ret) { + ret = 1; + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + } return ret; } @@ -2681,6 +2699,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; + /* + * Rescan should only search for commit root, and any later difference + * should be recorded by qgroup + */ + path->search_commit_root = 1; + path->skip_locking = 1; err = 0; while (!err && !btrfs_fs_closing(fs_info)) { @@ -2760,26 +2784,43 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; - if (!init_flags && - (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || - !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { - ret = -EINVAL; - goto err; + if (!init_flags) { + /* we're resuming qgroup rescan at mount time */ + if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); + ret = -EINVAL; + } else if (!(fs_info->qgroup_flags & + 
BTRFS_QGROUP_STATUS_FLAG_ON)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup rescan is not queued"); + ret = -EINVAL; + } + + if (ret) + return ret; } mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); if (init_flags) { - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + btrfs_warn(fs_info, + "qgroup rescan is already in progress"); ret = -EINPROGRESS; - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { + btrfs_warn(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } if (ret) { spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - goto err; + return ret; } fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } @@ -2798,13 +2839,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_init_work(&fs_info->qgroup_rescan_work, btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); - - if (ret) { -err: - btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret); - return ret; - } - return 0; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9abd950e7f78..5e4ad134b9ad 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -163,6 +163,12 @@ struct btrfs_raid_bio { * bitmap to record which horizontal stripe has data */ unsigned long *dbitmap; + + /* allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; + + /* allocated with stripe_npages-many bits for finish_*() calls */ + unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -981,9 +987,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; - rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - 
DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * - sizeof(long), GFP_NOFS); + rbio = kzalloc(sizeof(*rbio) + + sizeof(*rbio->stripe_pages) * num_pages + + sizeof(*rbio->bio_pages) * num_pages + + sizeof(*rbio->finish_pointers) * real_stripes + + sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + + sizeof(*rbio->finish_pbitmap) * + BITS_TO_LONGS(stripe_npages), + GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1005,13 +1016,20 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages and bio_pages array point to the extra + * the stripe_pages, bio_pages, etc arrays point to the extra * memory we allocated past the end of the rbio */ p = rbio + 1; - rbio->stripe_pages = p; - rbio->bio_pages = p + sizeof(struct page *) * num_pages; - rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; +#define CONSUME_ALLOC(ptr, count) do { \ + ptr = p; \ + p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ + } while (0) + CONSUME_ALLOC(rbio->stripe_pages, num_pages); + CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->finish_pointers, real_stripes); + CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); + CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); +#undef CONSUME_ALLOC if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) nr_data = real_stripes - 1; @@ -1180,7 +1198,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[rbio->real_stripes]; + void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -2350,8 +2368,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[rbio->real_stripes]; - DECLARE_BITMAP(pbitmap, rbio->stripe_npages); + void **pointers = rbio->finish_pointers; + unsigned long *pbitmap = 
rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int pagenr; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b041b945a7ae..879b76fa881a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4299,7 +4299,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) +static struct reloc_control *alloc_reloc_control(void) { struct reloc_control *rc; @@ -4344,7 +4344,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, DESCRIBE_FLAG(RAID5, "raid5"); DESCRIBE_FLAG(RAID6, "raid6"); if (flags) - snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags); + snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); #undef DESCRIBE_FLAG } @@ -4366,7 +4366,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(fs_info); + rc = alloc_reloc_control(); if (!rc) return -ENOMEM; @@ -4562,7 +4562,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(fs_info); + rc = alloc_reloc_control(); if (!rc) { err = -ENOMEM; goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6db3bda44aa5..c451285976ac 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -485,9 +485,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct; + struct timespec64 ct; - ktime_get_real_ts(&ct); + ktime_get_real_ts64(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 52b39a0924e9..6702896cdb8f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1151,11 +1151,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) return ret; } - if (sctx->is_dev_replace && 
!is_metadata && !have_csum) { - sblocks_for_recheck = NULL; - goto nodatasum_case; - } - /* * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was @@ -1268,13 +1263,19 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - if (!is_metadata && !have_csum) { + /* + * NOTE: Even for nodatasum case, it's still possible that it's a + * compressed data extent, thus scrub_fixup_nodatasum(), which write + * inode page cache onto disk, could cause serious data corruption. + * + * So here we could only read from disk, and hope our recovery could + * reach disk before the newer write. + */ + if (0 && !is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; WARN_ON(sctx->is_dev_replace); -nodatasum_case: - /* * !is_metadata and !have_csum, this means that the data * might not be COWed, that it might be modified @@ -2799,7 +2800,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, have_csum = scrub_find_csum(sctx, logical, csum); if (have_csum == 0) ++sctx->stat.no_csum; - if (sctx->is_dev_replace && !have_csum) { + if (0 && sctx->is_dev_replace && !have_csum) { ret = copy_nocow_pages(sctx, logical, l, mirror_num, physical_for_dev_replace); @@ -3984,6 +3985,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&cache->bg_list)) { btrfs_get_block_group(cache); + trace_btrfs_add_unused_block_group(cache); list_add_tail(&cache->bg_list, &fs_info->unused_bgs); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c0074d2d7d6d..c47f62b19226 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -235,6 +235,7 @@ struct orphan_dir_info { struct rb_node node; u64 ino; u64 gen; + u64 last_dir_index_offset; }; struct name_cache_entry { @@ -2844,12 +2845,6 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) struct rb_node *parent = NULL; struct orphan_dir_info *entry, *odi; - odi = 
kmalloc(sizeof(*odi), GFP_KERNEL); - if (!odi) - return ERR_PTR(-ENOMEM); - odi->ino = dir_ino; - odi->gen = 0; - while (*p) { parent = *p; entry = rb_entry(parent, struct orphan_dir_info, node); @@ -2858,11 +2853,17 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) } else if (dir_ino > entry->ino) { p = &(*p)->rb_right; } else { - kfree(odi); return entry; } } + odi = kmalloc(sizeof(*odi), GFP_KERNEL); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + odi->last_dir_index_offset = 0; + rb_link_node(&odi->node, parent, p); rb_insert_color(&odi->node, &sctx->orphan_dirs); return odi; @@ -2917,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, struct btrfs_key found_key; struct btrfs_key loc; struct btrfs_dir_item *di; + struct orphan_dir_info *odi = NULL; /* * Don't try to rmdir the top/root subvolume dir. @@ -2931,6 +2933,11 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + + odi = get_orphan_dir_info(sctx, dir); + if (odi) + key.offset = odi->last_dir_index_offset; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -2958,30 +2965,33 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - struct orphan_dir_info *odi; - odi = add_orphan_dir_info(sctx, dir); if (IS_ERR(odi)) { ret = PTR_ERR(odi); goto out; } odi->gen = dir_gen; + odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; ret = 0; goto out; } if (loc.objectid > send_progress) { - struct orphan_dir_info *odi; - - odi = get_orphan_dir_info(sctx, dir); - free_orphan_dir_info(sctx, odi); + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + odi->last_dir_index_offset = found_key.offset; ret = 0; goto out; } path->slots[0]++; } + free_orphan_dir_info(sctx, odi); ret = 1; @@ 
-3259,13 +3269,16 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (rmdir_ino) { struct orphan_dir_info *odi; + u64 gen; odi = get_orphan_dir_info(sctx, rmdir_ino); if (!odi) { /* already deleted */ goto finish; } - ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino); + gen = odi->gen; + + ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino); if (ret < 0) goto out; if (!ret) @@ -3276,13 +3289,12 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + ret = get_cur_path(sctx, rmdir_ino, gen, name); if (ret < 0) goto out; ret = send_rmdir(sctx, name); if (ret < 0) goto out; - free_orphan_dir_info(sctx, odi); } finish: @@ -6454,7 +6466,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu", + "send_in_progress unbalanced %d root %llu", root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0628092b0b1b..81107ad49f3a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -323,6 +323,7 @@ enum { Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_nossd_spread, Opt_subvol, + Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, Opt_treelog, Opt_notreelog, @@ -388,6 +389,7 @@ static const match_table_t tokens = { {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd_spread, "nossd_spread"}, {Opt_subvol, "subvol=%s"}, + {Opt_subvol_empty, "subvol="}, {Opt_subvolid, "subvolid=%s"}, {Opt_thread_pool, "thread_pool=%u"}, {Opt_treelog, "treelog"}, @@ -461,6 +463,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: + case Opt_subvol_empty: case Opt_subvolid: case Opt_subvolrootid: case Opt_device: @@ -1782,10 +1785,8 @@ static int btrfs_remount(struct super_block *sb, 
int *flags, char *data) } ret = btrfs_parse_options(fs_info, data, *flags); - if (ret) { - ret = -EINVAL; + if (ret) goto restore; - } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4848a4318fb5..4a4e960c7c66 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -210,12 +210,42 @@ static struct attribute *btrfs_supported_feature_attrs[] = { NULL }; +/* + * Features which depend on feature bits and may differ between each fs. + * + * /sys/fs/btrfs/features lists all available features of this kernel while + * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or + * can be changed online. + */ static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, .attrs = btrfs_supported_feature_attrs, }; +static ssize_t rmdir_subvol_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0\n"); +} +BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); + +static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, rmdir_subvol), + NULL +}; + +/* + * Features which only depend on kernel version. 
+ * + * These are listed in /sys/fs/btrfs/features along with + * btrfs_feature_attr_group + */ +static const struct attribute_group btrfs_static_feature_attr_group = { + .name = "features", + .attrs = btrfs_supported_static_feature_attrs, +}; + static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; @@ -514,10 +544,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) } #define NUM_FEATURE_BITS 64 -static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; -static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; +#define BTRFS_FEATURE_NAME_MAX 13 +static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; +static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; -static const u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, @@ -589,7 +620,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) return; } - list_for_each_entry(fs_devs, fs_uuids, list) { + list_for_each_entry(fs_devs, fs_uuids, fs_list) { __btrfs_sysfs_remove_fsid(fs_devs); } } @@ -609,7 +640,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } -const char * const btrfs_feature_set_names[3] = { +const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", @@ -673,7 +704,7 @@ static void init_feature_attrs(void) if (fa->kobj_attr.attr.name) continue; - snprintf(name, 13, "%s:%u", + snprintf(name, BTRFS_FEATURE_NAME_MAX, "%s:%u", btrfs_feature_set_names[set], i); fa->kobj_attr.attr.name = name; @@ -900,8 +931,15 @@ int __init btrfs_init_sysfs(void) ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if 
(ret) goto out2; + ret = sysfs_merge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + if (ret) + goto out_remove_group; return 0; + +out_remove_group: + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: debugfs_remove_recursive(btrfs_debugfs_root_dentry); out1: @@ -912,6 +950,8 @@ out1: void __cold btrfs_exit_sysfs(void) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); debugfs_remove_recursive(btrfs_debugfs_root_dentry); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index b567560d9aa9..c6ee600aff89 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT, + FEAT_COMPAT = 0, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX @@ -77,7 +77,7 @@ attr_to_btrfs_feature_attr(struct attribute *attr) } char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -extern const char * const btrfs_feature_set_names[3]; +extern const char * const btrfs_feature_set_names[FEAT_MAX]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 30ed438da2a9..db72b3b6209e 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -219,11 +219,13 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) kfree(cache); } -void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; trans->type = __TRANS_DUMMY; + trans->fs_info = fs_info; } int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index a5a0b9500d3e..70ff9f9d86a1 100644 --- 
a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -9,7 +9,8 @@ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); -#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) +#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; @@ -28,7 +29,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); -void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) { diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 31e8a9ec228c..7d72eab6d32c 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -26,31 +26,31 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) u32 value_len = strlen(value); int ret = 0; - test_msg("Running btrfs_split_item tests\n"); + test_msg("running btrfs_split_item tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Could not allocate fs_info\n"); + test_err("could not allocate fs_info"); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Could not allocate root\n"); + test_err("could not allocate root"); ret = PTR_ERR(root); goto out; } path = btrfs_alloc_path(); if (!path) { - test_msg("Could not allocate path\n"); + test_err("could not allocate path"); ret = -ENOMEM; goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); if (!eb) { - test_msg("Could not allocate dummy buffer\n"); + test_err("could 
not allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -75,7 +75,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) */ ret = btrfs_split_item(NULL, root, path, &key, 17); if (ret) { - test_msg("Split item failed %d\n", ret); + test_err("split item failed %d", ret); goto out; } @@ -86,14 +86,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 0); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 0) { - test_msg("Invalid key at slot 0\n"); + test_err("invalid key at slot 0"); ret = -EINVAL; goto out; } item = btrfs_item_nr(0); if (btrfs_item_size(eb, item) != strlen(split1)) { - test_msg("Invalid len in the first split\n"); + test_err("invalid len in the first split"); ret = -EINVAL; goto out; } @@ -101,8 +101,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), strlen(split1)); if (memcmp(buf, split1, strlen(split1))) { - test_msg("Data in the buffer doesn't match what it should " - "in the first split have='%.*s' want '%s'\n", + test_err( +"data in the buffer doesn't match what it should in the first split have='%.*s' want '%s'", (int)strlen(split1), buf, split1); ret = -EINVAL; goto out; @@ -111,14 +111,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 1); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 3) { - test_msg("Invalid key at slot 1\n"); + test_err("invalid key at slot 1"); ret = -EINVAL; goto out; } item = btrfs_item_nr(1); if (btrfs_item_size(eb, item) != strlen(split2)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -126,8 +126,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), strlen(split2)); if (memcmp(buf, split2, strlen(split2))) { - test_msg("Data in 
the buffer doesn't match what it should " - "in the second split\n"); + test_err( + "data in the buffer doesn't match what it should in the second split"); ret = -EINVAL; goto out; } @@ -136,21 +136,21 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) /* Do it again so we test memmoving the other items in the leaf */ ret = btrfs_split_item(NULL, root, path, &key, 4); if (ret) { - test_msg("Second split item failed %d\n", ret); + test_err("second split item failed %d", ret); goto out; } btrfs_item_key_to_cpu(eb, &key, 0); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 0) { - test_msg("Invalid key at slot 0\n"); + test_err("invalid key at slot 0"); ret = -EINVAL; goto out; } item = btrfs_item_nr(0); if (btrfs_item_size(eb, item) != strlen(split3)) { - test_msg("Invalid len in the first split\n"); + test_err("invalid len in the first split"); ret = -EINVAL; goto out; } @@ -158,8 +158,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0), strlen(split3)); if (memcmp(buf, split3, strlen(split3))) { - test_msg("Data in the buffer doesn't match what it should " - "in the third split"); + test_err( + "data in the buffer doesn't match what it should in the third split"); ret = -EINVAL; goto out; } @@ -167,14 +167,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 1); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 1) { - test_msg("Invalid key at slot 1\n"); + test_err("invalid key at slot 1"); ret = -EINVAL; goto out; } item = btrfs_item_nr(1); if (btrfs_item_size(eb, item) != strlen(split4)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -182,8 +182,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1), strlen(split4)); if 
(memcmp(buf, split4, strlen(split4))) { - test_msg("Data in the buffer doesn't match what it should " - "in the fourth split\n"); + test_err( + "data in the buffer doesn't match what it should in the fourth split"); ret = -EINVAL; goto out; } @@ -191,14 +191,14 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) btrfs_item_key_to_cpu(eb, &key, 2); if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset != 3) { - test_msg("Invalid key at slot 2\n"); + test_err("invalid key at slot 2"); ret = -EINVAL; goto out; } item = btrfs_item_nr(2); if (btrfs_item_size(eb, item) != strlen(split2)) { - test_msg("Invalid len in the second split\n"); + test_err("invalid len in the second split"); ret = -EINVAL; goto out; } @@ -206,8 +206,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2), strlen(split2)); if (memcmp(buf, split2, strlen(split2))) { - test_msg("Data in the buffer doesn't match what it should " - "in the last chunk\n"); + test_err( + "data in the buffer doesn't match what it should in the last chunk"); ret = -EINVAL; goto out; } @@ -220,6 +220,6 @@ out: int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests\n"); + test_msg("running extent buffer operation tests"); return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 76aa5a678a96..d9269a531a4d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -46,7 +46,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, cond_resched(); loops++; if (loops > 100000) { - printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret); + printk(KERN_ERR + "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", + start, end, nr_pages, ret); break; } } @@ -66,11 
+68,11 @@ static int test_find_delalloc(u32 sectorsize) u64 found; int ret = -EINVAL; - test_msg("Running find delalloc tests\n"); + test_msg("running find delalloc tests"); inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Failed to allocate test inode\n"); + test_err("failed to allocate test inode"); return -ENOMEM; } @@ -84,7 +86,7 @@ static int test_find_delalloc(u32 sectorsize) for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { - test_msg("Failed to allocate test page\n"); + test_err("failed to allocate test page"); ret = -ENOMEM; goto out; } @@ -107,11 +109,11 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Should have found at least one delalloc\n"); + test_err("should have found at least one delalloc"); goto out_bits; } if (start != 0 || end != (sectorsize - 1)) { - test_msg("Expected start 0 end %u, got start %llu end %llu\n", + test_err("expected start 0 end %u, got start %llu end %llu", sectorsize - 1, start, end); goto out_bits; } @@ -129,7 +131,7 @@ static int test_find_delalloc(u32 sectorsize) locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Couldn't find the locked page\n"); + test_err("couldn't find the locked page"); goto out_bits; } set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); @@ -138,17 +140,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Couldn't find delalloc in our range\n"); + test_err("couldn't find delalloc in our range"); goto out_bits; } if (start != test_start || end != max_bytes - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu, end " - "%Lu\n", test_start, max_bytes - 1, start, end); + test_err("expected start %llu end %llu, got 
start %llu, end %llu", + test_start, max_bytes - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("There were unlocked pages in the range\n"); + test_err("there were unlocked pages in the range"); goto out_bits; } unlock_extent(&tmp, start, end); @@ -164,7 +166,7 @@ static int test_find_delalloc(u32 sectorsize) locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Couldn't find the locked page\n"); + test_err("couldn't find the locked page"); goto out_bits; } start = test_start; @@ -172,11 +174,11 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (found) { - test_msg("Found range when we shouldn't have\n"); + test_err("found range when we shouldn't have"); goto out_bits; } if (end != (u64)-1) { - test_msg("Did not return the proper end offset\n"); + test_err("did not return the proper end offset"); goto out_bits; } @@ -193,17 +195,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Didn't find our range\n"); + test_err("didn't find our range"); goto out_bits; } if (start != test_start || end != total_dirty - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", + test_err("expected start %llu end %llu, got start %llu end %llu", test_start, total_dirty - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("Pages in range were not all locked\n"); + test_err("pages in range were not all locked"); goto out_bits; } unlock_extent(&tmp, start, end); @@ -215,7 +217,7 @@ static int test_find_delalloc(u32 sectorsize) page = find_get_page(inode->i_mapping, (max_bytes + SZ_1M) >> PAGE_SHIFT); if (!page) { - test_msg("Couldn't find our page\n"); + test_err("couldn't 
find our page"); goto out_bits; } ClearPageDirty(page); @@ -234,18 +236,17 @@ static int test_find_delalloc(u32 sectorsize) found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, &end, max_bytes); if (!found) { - test_msg("Didn't find our range\n"); + test_err("didn't find our range"); goto out_bits; } if (start != test_start && end != test_start + PAGE_SIZE - 1) { - test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", - test_start, test_start + PAGE_SIZE - 1, start, - end); + test_err("expected start %llu end %llu, got start %llu end %llu", + test_start, test_start + PAGE_SIZE - 1, start, end); goto out_bits; } if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) { - test_msg("Pages in range were not all locked\n"); + test_err("pages in range were not all locked"); goto out_bits; } ret = 0; @@ -271,14 +272,14 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_msg("Bits do not match\n"); + test_err("bits do not match"); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_msg("Offset bits do not match\n"); + test_err("offset bits do not match"); return -EINVAL; } } @@ -295,7 +296,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, memset(bitmap, 0, len); memzero_extent_buffer(eb, 0, len); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Bitmap was not zeroed\n"); + test_err("bitmap was not zeroed"); return -EINVAL; } @@ -303,7 +304,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Setting all bits failed\n"); + test_err("setting all bits failed"); return ret; } @@ -311,7 +312,7 @@ static int __test_eb_bitmaps(unsigned long 
*bitmap, struct extent_buffer *eb, extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Clearing all bits failed\n"); + test_err("clearing all bits failed"); return ret; } @@ -324,7 +325,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Setting straddling pages failed\n"); + test_err("setting straddling pages failed"); return ret; } @@ -337,7 +338,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Clearing straddling pages failed\n"); + test_err("clearing straddling pages failed"); return ret; } } @@ -361,7 +362,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, ret = check_eb_bitmap(bitmap, eb, len); if (ret) { - test_msg("Random bit pattern failed\n"); + test_err("random bit pattern failed"); return ret; } @@ -376,7 +377,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) struct extent_buffer *eb; int ret; - test_msg("Running extent buffer bitmap tests\n"); + test_msg("running extent buffer bitmap tests"); /* * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than @@ -389,13 +390,13 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { - test_msg("Couldn't allocate test bitmap\n"); + test_err("couldn't allocate test bitmap"); return -ENOMEM; } eb = __alloc_dummy_extent_buffer(fs_info, 0, len); if (!eb) { - test_msg("Couldn't allocate test extent buffer\n"); + test_err("couldn't allocate test extent buffer"); kfree(bitmap); return -ENOMEM; } @@ -408,7 +409,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) free_extent_buffer(eb); eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { - test_msg("Couldn't allocate 
test extent buffer\n"); + test_err("couldn't allocate test extent buffer"); kfree(bitmap); return -ENOMEM; } @@ -424,7 +425,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; - test_msg("Running extent I/O tests\n"); + test_msg("running extent I/O tests"); ret = test_find_delalloc(sectorsize); if (ret) @@ -432,6 +433,6 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) ret = test_eb_bitmaps(sectorsize, nodesize); out: - test_msg("Extent I/O tests finished\n"); + test_msg("extent I/O tests finished"); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 79e0a5f4d9c9..385a5316e4bf 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -19,8 +19,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { - test_msg( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n", + test_err( +"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", em->start, em->len, em->block_start, em->block_len, refcount_read(&em->refs)); @@ -47,7 +47,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static void test_case_1(struct extent_map_tree *em_tree) +static void test_case_1(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { struct extent_map *em; u64 start = 0; @@ -90,14 +91,14 @@ static void test_case_1(struct extent_map_tree *em_tree) em->len = len; em->block_start = start; em->block_len = len; - ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); if (ret) - test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret); + test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); if (em && (em->start != 0 || extent_map_end(em) != SZ_16K || 
em->block_start != 0 || em->block_len != SZ_16K)) - test_msg( -"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + test_err( +"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -112,7 +113,8 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static void test_case_2(struct extent_map_tree *em_tree) +static void test_case_2(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { struct extent_map *em; int ret; @@ -153,14 +155,14 @@ static void test_case_2(struct extent_map_tree *em_tree) em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; - ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); if (ret) - test_msg("case2 [0 1K]: ret %d\n", ret); + test_err("case2 [0 1K]: ret %d", ret); if (em && (em->start != 0 || extent_map_end(em) != SZ_1K || em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) - test_msg( -"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n", + test_err( +"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -169,7 +171,8 @@ out: free_extent_map_tree(em_tree); } -static void __test_case_3(struct extent_map_tree *em_tree, u64 start) +static void __test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; u64 len = SZ_4K; @@ -198,9 +201,9 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start) em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; - ret = btrfs_add_extent_mapping(em_tree, &em, 
start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); if (ret) - test_msg("case3 [0x%llx 0x%llx): ret %d\n", + test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); /* * Since bytes within em are contiguous, em->block_start is identical to @@ -209,8 +212,8 @@ static void __test_case_3(struct extent_map_tree *em_tree, u64 start) if (em && (start < em->start || start + len > extent_map_end(em) || em->start != em->block_start || em->len != em->block_len)) - test_msg( -"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + test_err( +"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -235,14 +238,16 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static void test_case_3(struct extent_map_tree *em_tree) +static void test_case_3(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { - __test_case_3(em_tree, 0); - __test_case_3(em_tree, SZ_8K); - __test_case_3(em_tree, (12 * 1024ULL)); + __test_case_3(fs_info, em_tree, 0); + __test_case_3(fs_info, em_tree, SZ_8K); + __test_case_3(fs_info, em_tree, (12 * 1024ULL)); } -static void __test_case_4(struct extent_map_tree *em_tree, u64 start) +static void __test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; u64 len = SZ_4K; @@ -283,14 +288,14 @@ static void __test_case_4(struct extent_map_tree *em_tree, u64 start) em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; - ret = btrfs_add_extent_mapping(em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); if (ret) - test_msg("case4 [0x%llx 0x%llx): ret %d\n", + test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); if (em && (start < em->start || start + len > extent_map_end(em))) - test_msg( -"case4 
[0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n", + test_err( +"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, len, ret, em->start, em->len, em->block_start, em->block_len); free_extent_map(em); @@ -324,30 +329,45 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static void test_case_4(struct extent_map_tree *em_tree) +static void test_case_4(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree) { - __test_case_4(em_tree, 0); - __test_case_4(em_tree, SZ_4K); + __test_case_4(fs_info, em_tree, 0); + __test_case_4(fs_info, em_tree, SZ_4K); } int btrfs_test_extent_map(void) { + struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; - test_msg("Running extent_map tests\n"); + test_msg("running extent_map tests"); + + /* + * Note: the fs_info is not set up completely, we only need + * fs_info::fsid for the tracepoint. + */ + fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info"); + return -ENOMEM; + } em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); if (!em_tree) /* Skip the test on error. 
*/ - return 0; + goto out; extent_map_tree_init(em_tree); - test_case_1(em_tree); - test_case_2(em_tree); - test_case_3(em_tree); - test_case_4(em_tree); + test_case_1(fs_info, em_tree); + test_case_2(fs_info, em_tree); + test_case_3(fs_info, em_tree); + test_case_4(fs_info, em_tree); kfree(em_tree); +out: + btrfs_free_dummy_fs_info(fs_info); + return 0; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index d3c9f8a59ba5..5c2f77e9439b 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -20,63 +20,63 @@ static int test_extents(struct btrfs_block_group_cache *cache) { int ret = 0; - test_msg("Running extent only tests\n"); + test_msg("running extent only tests"); /* First just make sure we can remove an entire entry */ ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error adding initial extents %d\n", ret); + test_err("error adding initial extents %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error removing extent %d\n", ret); + test_err("error removing extent %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_4M)) { - test_msg("Full remove left some lingering space\n"); + test_err("full remove left some lingering space"); return -1; } /* Ok edge and middle cases now */ ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error adding half extent %d\n", ret); + test_err("error adding half extent %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { - test_msg("Error removing tail end %d\n", ret); + test_err("error removing tail end %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { - test_msg("Error removing front end %d\n", ret); + test_err("error removing front end %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { - test_msg("Error removing middle piece %d\n", ret); + test_err("error removing 
middle piece %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_1M)) { - test_msg("Still have space at the front\n"); + test_err("still have space at the front"); return -1; } if (test_check_exists(cache, SZ_2M, 4096)) { - test_msg("Still have space in the middle\n"); + test_err("still have space in the middle"); return -1; } if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { - test_msg("Still have space at the end\n"); + test_err("still have space at the end"); return -1; } @@ -92,34 +92,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, u64 next_bitmap_offset; int ret; - test_msg("Running bitmap only tests\n"); + test_msg("running bitmap only tests"); ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { - test_msg("Couldn't create a bitmap entry %d\n", ret); + test_err("couldn't create a bitmap entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { - test_msg("Error removing bitmap full range %d\n", ret); + test_err("error removing bitmap full range %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_4M)) { - test_msg("Left some space in bitmap\n"); + test_err("left some space in bitmap"); return -1; } ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { - test_msg("Couldn't add to our bitmap entry %d\n", ret); + test_err("couldn't add to our bitmap entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { - test_msg("Couldn't remove middle chunk %d\n", ret); + test_err("couldn't remove middle chunk %d", ret); return ret; } @@ -133,19 +133,19 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add space that straddles two bitmaps %d\n", + test_err("couldn't add space that straddles two bitmaps %d", ret); return ret; } ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { - 
test_msg("Couldn't remove overlapping space %d\n", ret); + test_err("couldn't remove overlapping space %d", ret); return ret; } if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { - test_msg("Left some space when removing overlapping\n"); + test_err("left some space when removing overlapping"); return -1; } @@ -161,7 +161,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; - test_msg("Running bitmap and extent tests\n"); + test_msg("running bitmap and extent tests"); /* * First let's do something simple, an extent at the same offset as the @@ -170,42 +170,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { - test_msg("Couldn't create bitmap entry %d\n", ret); + test_err("couldn't create bitmap entry %d", ret); return ret; } ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { - test_msg("Couldn't remove extent entry %d\n", ret); + test_err("couldn't remove extent entry %d", ret); return ret; } if (test_check_exists(cache, 0, SZ_1M)) { - test_msg("Left remnants after our remove\n"); + test_err("left remnants after our remove"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { - test_msg("Couldn't re-add extent entry %d\n", ret); + test_err("couldn't re-add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { - test_msg("Couldn't remove from bitmap %d\n", ret); + test_err("couldn't remove from bitmap %d", ret); return ret; } if (test_check_exists(cache, SZ_4M, SZ_1M)) { - test_msg("Left remnants in the bitmap\n"); + test_err("left remnants in 
the bitmap"); return -1; } @@ -215,18 +215,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add to a bitmap %d\n", ret); + test_err("couldn't add to a bitmap %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { - test_msg("Couldn't remove overlapping space %d\n", ret); + test_err("couldn't remove overlapping space %d", ret); return ret; } if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { - test_msg("Left over pieces after removing overlapping\n"); + test_err("left over pieces after removing overlapping"); return -1; } @@ -235,24 +235,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, /* Now with the extent entry offset into the bitmap */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add space to the bitmap %d\n", ret); + test_err("couldn't add space to the bitmap %d", ret); return ret; } ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { - test_msg("Couldn't add extent to the cache %d\n", ret); + test_err("couldn't add extent to the cache %d", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { - test_msg("Problem removing overlapping space %d\n", ret); + test_err("problem removing overlapping space %d", ret); return ret; } if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { - test_msg("Left something behind when removing space"); + test_err("left something behind when removing space"); return -1; } @@ -269,25 +269,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, __btrfs_remove_free_space_cache(cache->free_space_ctl); ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { - test_msg("Couldn't add bitmap %d\n", ret); + test_err("couldn't add bitmap %d", ret); return ret; } ret = test_add_free_space_entry(cache, 
bitmap_offset - SZ_1M, 5 * SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { - test_msg("Failed to free our space %d\n", ret); + test_err("failed to free our space %d", ret); return ret; } if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { - test_msg("Left stuff over\n"); + test_err("left stuff over"); return -1; } @@ -301,19 +301,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { - test_msg("Error removing bitmap and extent overlapping %d\n", ret); + test_err("error removing bitmap and extent overlapping %d", ret); return ret; } @@ -335,12 +335,14 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, const int num_bitmaps) { if (cache->free_space_ctl->free_extents != num_extents) { - test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", cache->free_space_ctl->free_extents, num_extents); return -EINVAL; } if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { - test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + test_err( + "incorrect # of extent entries in the cache: %d, expected %d", cache->free_space_ctl->total_bitmaps, num_bitmaps); return -EINVAL; } @@ -358,7 +360,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * allocate. 
*/ if (cache->free_space_ctl->free_space != 0) { - test_msg("Cache free space is not 0\n"); + test_err("cache free space is not 0"); return -EINVAL; } @@ -366,7 +368,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); if (offset != 0) { - test_msg("Space allocation did not fail, returned offset: %llu", + test_err("space allocation did not fail, returned offset: %llu", offset); return -EINVAL; } @@ -402,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, }; const struct btrfs_free_space_op *orig_free_space_ops; - test_msg("Running space stealing from bitmap to extent\n"); + test_msg("running space stealing from bitmap to extent"); /* * For this test, we want to ensure we end up with an extent entry @@ -430,7 +432,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } @@ -438,7 +440,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, SZ_128M - SZ_512K, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } @@ -457,17 +459,17 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, SZ_128M + 768 * SZ_1K, SZ_128M - 768 * SZ_1K); if (ret) { - test_msg("Failed to free part of bitmap space %d\n", ret); + test_err("failed to free part of bitmap space %d", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. 
*/ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } @@ -477,7 +479,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, SZ_128M - 768 * SZ_1K)) { - test_msg("Bitmap region not removed from space cache\n"); + test_err("bitmap region not removed from space cache"); return -EINVAL; } @@ -486,7 +488,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * covered by the bitmap, isn't marked as free. */ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -495,7 +497,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * by the bitmap too, isn't marked as free either. */ if (test_check_exists(cache, SZ_128M, SZ_256K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -506,12 +508,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. 
*/ if (!test_check_exists(cache, SZ_128M, SZ_512K)) { - test_msg("Bitmap region not marked as free\n"); + test_err("bitmap region not marked as free"); return -ENOENT; } @@ -531,7 +533,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } @@ -550,12 +552,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { - test_msg("Extent region not marked as free\n"); + test_err("extent region not marked as free"); return -ENOENT; } @@ -583,12 +585,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * allocate the whole free space at once. 
*/ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { - test_msg("Expected region not marked as free\n"); + test_err("expected region not marked as free"); return -ENOENT; } if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { - test_msg("Cache free space is not 1Mb + %u\n", sectorsize); + test_err("cache free space is not 1Mb + %u", sectorsize); return -EINVAL; } @@ -596,7 +598,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, SZ_1M, 0, &max_extent_size); if (offset != (SZ_128M - SZ_256K)) { - test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", offset); return -EINVAL; } @@ -610,7 +613,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, return ret; if (cache->free_space_ctl->free_space != sectorsize) { - test_msg("Cache free space is not %u\n", sectorsize); + test_err("cache free space is not %u", sectorsize); return -EINVAL; } @@ -618,7 +621,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate %u, returned offset : %llu\n", + test_err("failed to allocate %u, returned offset : %llu", sectorsize, offset); return -EINVAL; } @@ -640,14 +643,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); + test_err("couldn't add extent entry %d", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); + test_err("couldn't add bitmap entry %d", ret); return ret; } @@ -664,17 +667,17 @@ test_steal_space_from_bitmap_to_extent(struct 
btrfs_block_group_cache *cache, */ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { - test_msg("Failed to free part of bitmap space %d\n", ret); + test_err("failed to free part of bitmap space %d", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { - test_msg("Free space range missing\n"); + test_err("free space range missing"); return -ENOENT; } @@ -683,7 +686,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * as free anymore. */ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { - test_msg("Bitmap region not removed from space cache\n"); + test_err("bitmap region not removed from space cache"); return -EINVAL; } @@ -692,7 +695,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * covered by the bitmap, isn't marked as free. */ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { - test_msg("Invalid bitmap region marked as free\n"); + test_err("invalid bitmap region marked as free"); return -EINVAL; } @@ -703,12 +706,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. 
*/ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { - test_msg("Bitmap region not marked as free\n"); + test_err("bitmap region not marked as free"); return -ENOENT; } @@ -728,7 +731,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } @@ -739,12 +742,12 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, */ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { - test_msg("Error adding free space: %d\n", ret); + test_err("error adding free space: %d", ret); return ret; } /* Confirm the region is marked as free. */ if (!test_check_exists(cache, SZ_128M, SZ_128K)) { - test_msg("Extent region not marked as free\n"); + test_err("extent region not marked as free"); return -ENOENT; } @@ -772,19 +775,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, * allocate the whole free space at once. 
*/ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { - test_msg("Expected region not marked as free\n"); + test_err("expected region not marked as free"); return -ENOENT; } if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { - test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); + test_err("cache free space is not 1Mb + %u", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); if (offset != (SZ_128M - 768 * SZ_1K)) { - test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + test_err( + "failed to allocate 1Mb from space cache, returned offset is: %llu", offset); return -EINVAL; } @@ -798,7 +802,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, return ret; if (cache->free_space_ctl->free_space != 2 * sectorsize) { - test_msg("Cache free space is not %u\n", 2 * sectorsize); + test_err("cache free space is not %u", 2 * sectorsize); return -EINVAL; } @@ -806,9 +810,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate %u, offset: %llu\n", - 2 * sectorsize, - offset); + test_err("failed to allocate %u, offset: %llu", + 2 * sectorsize, offset); return -EINVAL; } @@ -829,7 +832,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) struct btrfs_root *root = NULL; int ret = -ENOMEM; - test_msg("Running btrfs free space cache tests\n"); + test_msg("running btrfs free space cache tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) return -ENOMEM; @@ -843,7 +846,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) cache = btrfs_alloc_dummy_block_group(fs_info, BITS_PER_BITMAP * sectorsize + PAGE_SIZE); if (!cache) { - test_msg("Couldn't run the tests\n"); + test_err("couldn't run the tests"); btrfs_free_dummy_fs_info(fs_info); return 0; } @@ 
-871,6 +874,6 @@ out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); - test_msg("Free space cache tests finished\n"); + test_msg("free space cache tests finished"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index e1f9666c4974..89346da890cf 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, info = search_free_space_info(trans, fs_info, cache, path, 0); if (IS_ERR(info)) { - test_msg("Could not find free space info\n"); + test_err("could not find free space info"); ret = PTR_ERR(info); goto out; } @@ -40,7 +40,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, extent_count = btrfs_free_space_extent_count(path->nodes[0], info); if (extent_count != num_extents) { - test_msg("Extent count is wrong\n"); + test_err("extent count is wrong"); ret = -EINVAL; goto out; } @@ -99,7 +99,7 @@ out: btrfs_release_path(path); return ret; invalid: - test_msg("Free space tree is invalid\n"); + test_err("free space tree is invalid"); ret = -EINVAL; goto out; } @@ -117,7 +117,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, info = search_free_space_info(trans, fs_info, cache, path, 0); if (IS_ERR(info)) { - test_msg("Could not find free space info\n"); + test_err("could not find free space info"); btrfs_release_path(path); return PTR_ERR(info); } @@ -131,15 +131,15 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, /* Flip it to the other format and check that for good measure. 
*/ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - ret = convert_free_space_to_extents(trans, fs_info, cache, path); + ret = convert_free_space_to_extents(trans, cache, path); if (ret) { - test_msg("Could not convert to extents\n"); + test_err("could not convert to extents"); return ret; } } else { - ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + ret = convert_free_space_to_bitmaps(trans, cache, path); if (ret) { - test_msg("Could not convert to bitmaps\n"); + test_err("could not convert to bitmaps"); return ret; } } @@ -170,11 +170,11 @@ static int test_remove_all(struct btrfs_trans_handle *trans, const struct free_space_extent extents[] = {}; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -194,10 +194,10 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, alignment); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -217,12 +217,12 @@ static int test_remove_end(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid + cache->key.offset - alignment, alignment); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -243,11 +243,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); 
if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } @@ -266,26 +266,26 @@ static int test_merge_left(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -304,27 +304,27 @@ static int test_merge_right(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -343,34 +343,34 @@ static int 
test_merge_both(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -391,34 +391,34 @@ static int test_merge_none(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + ret = __remove_from_free_space_tree(trans, cache, path, cache->key.objectid, cache->key.offset); if (ret) { - test_msg("Could not remove free space\n"); + test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, alignment); + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 4 * alignment, 
alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, fs_info, cache, path, + ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid + 2 * alignment, alignment); if (ret) { - test_msg("Could not add free space\n"); + test_err("could not add free space"); return ret; } @@ -444,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); ret = -ENOMEM; goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); + test_err("couldn't allocate dummy root"); ret = PTR_ERR(root); goto out; } @@ -463,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -473,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); if (!cache) { - test_msg("Couldn't allocate dummy block group cache\n"); + test_err("couldn't allocate dummy block group cache"); ret = -ENOMEM; goto out; } @@ -482,26 +482,25 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache->needs_free_space = 1; cache->fs_info = root->fs_info; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, root->fs_info); path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); ret = -ENOMEM; goto out; } - ret = add_block_group_free_space(&trans, root->fs_info, cache); + ret = add_block_group_free_space(&trans, cache); if (ret) { - 
test_msg("Could not add block group free space\n"); + test_err("could not add block group free space"); goto out; } if (bitmaps) { - ret = convert_free_space_to_bitmaps(&trans, root->fs_info, - cache, path); + ret = convert_free_space_to_bitmaps(&trans, cache, path); if (ret) { - test_msg("Could not convert block group to bitmaps\n"); + test_err("could not convert block group to bitmaps"); goto out; } } @@ -510,14 +509,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, if (ret) goto out; - ret = remove_block_group_free_space(&trans, root->fs_info, cache); + ret = remove_block_group_free_space(&trans, cache); if (ret) { - test_msg("Could not remove block group free space\n"); + test_err("could not remove block group free space"); goto out; } if (btrfs_header_nritems(root->node) != 0) { - test_msg("Free space tree has leftover items\n"); + test_err("free space tree has leftover items"); ret = -EINVAL; goto out; } @@ -539,14 +538,16 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { - test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_err( + "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { - test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_err( + "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -577,7 +578,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) */ bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; - test_msg("Running free space tree tests\n"); + test_msg("running free space tree tests"); for (i = 0; i < ARRAY_SIZE(tests); i++) { int ret; diff --git a/fs/btrfs/tests/inode-tests.c 
b/fs/btrfs/tests/inode-tests.c index e0ba799536b4..64043f028820 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -228,7 +228,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); return ret; } @@ -238,19 +238,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); goto out; } @@ -268,11 +268,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } free_extent_map(em); @@ -287,20 +287,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != 0 || em->len != 5) { - 
test_msg("Unexpected extent wanted start 0 len 5, got start " - "%llu len %llu\n", em->start, em->len); + test_err( + "unexpected extent wanted start 0 len 5, got start %llu len %llu", + em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -308,21 +309,22 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_INLINE) { - test_msg("Expected an inline, got %llu\n", em->block_start); + test_err("expected an inline, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != (sectorsize - 5)) { - test_msg("Unexpected extent wanted start %llu len 1, got start " - "%llu len %llu\n", offset, em->start, em->len); + test_err( + "unexpected extent wanted start %llu len 1, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } /* @@ -335,20 +337,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 4) { - test_msg("Unexpected extent wanted start %llu len 4, got start " - "%llu len %llu\n", offset, em->start, em->len); + 
test_err( + "unexpected extent wanted start %llu len 4, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -357,24 +360,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize - 1) { - test_msg("Unexpected extent wanted start %llu len 4095, got " - "start %llu len %llu\n", offset, em->start, em->len); + test_err( + "unexpected extent wanted start %llu len 4095, got start %llu len %llu", + offset, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -384,25 +388,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", 
em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -413,21 +417,21 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } offset = em->start + em->len; @@ -435,31 +439,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", 
em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", orig_start, em->orig_start); goto out; } disk_bytenr += (em->start - orig_start); if (em->block_start != disk_bytenr) { - test_msg("Wrong block start, want %llu, have %llu\n", + test_err("wrong block start, want %llu, have %llu", disk_bytenr, em->block_start); goto out; } @@ -469,26 +473,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig 
offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -498,26 +502,26 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -528,30 +532,30 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_HOLE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent 
wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Unexpected orig offset, wanted %llu, have %llu\n", + test_err("unexpected orig offset, wanted %llu, have %llu", orig_start, em->orig_start); goto out; } if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { - test_msg("Unexpected block start, wanted %llu, have %llu\n", + test_err("unexpected block start, wanted %llu, have %llu", disk_bytenr + (em->start - em->orig_start), em->block_start); goto out; @@ -561,31 +565,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", prealloc_only, em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start, + test_err("wrong orig offset, want %llu, have %llu", orig_start, em->orig_start); goto out; } if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) { - test_msg("Unexpected block 
start, wanted %llu, have %llu\n", + test_err("unexpected block start, wanted %llu, have %llu", disk_bytenr + (em->start - em->orig_start), em->block_start); goto out; @@ -596,31 +600,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -630,31 +634,31 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", 
em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -665,25 +669,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, 
+ test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -692,32 +696,32 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != disk_bytenr) { - test_msg("Block start does not match, want %llu got %llu\n", + test_err("block start does not match, want %llu got %llu", disk_bytenr, em->block_start); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", compressed_only, em->flags); goto out; } if (em->orig_start != orig_start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", + test_err("wrong orig offset, want %llu, have %llu", em->start, orig_start); goto out; } if (em->compress_type != BTRFS_COMPRESS_ZLIB) { - test_msg("Unexpected compress type, wanted %d, got %d\n", + test_err("unexpected compress type, wanted %d, got %d", BTRFS_COMPRESS_ZLIB, em->compress_type); goto out; } @@ -728,25 +732,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || 
em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -755,11 +759,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole extent, got %llu\n", em->block_start); + test_err("expected a hole extent, got %llu", em->block_start); goto out; } /* @@ -768,18 +772,18 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * test. 
*/ if (em->start != offset || em->len != 3 * sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { - test_msg("Unexpected flags set, want %lu have %lu\n", + test_err("unexpected flags set, want %lu have %lu", vacancy_only, em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -788,25 +792,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", em->block_start); goto out; } if (em->start != offset || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %llu len %u," - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %llu len %u, got start %llu len %llu", offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, want 0 have %lu\n", em->flags); + test_err("unexpected flags set, want 0 have %lu", em->flags); goto out; } if (em->orig_start != em->start) { - test_msg("Wrong orig offset, want %llu, have %llu\n", em->start, + test_err("wrong orig offset, want %llu, have %llu", em->start, em->orig_start); goto out; } @@ -830,7 +834,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); 
return ret; } @@ -840,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); goto out; } @@ -871,21 +875,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != EXTENT_MAP_HOLE) { - test_msg("Expected a hole, got %llu\n", em->block_start); + test_err("expected a hole, got %llu", em->block_start); goto out; } if (em->start != 0 || em->len != sectorsize) { - test_msg("Unexpected extent wanted start 0 len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start 0 len %u, got start %llu len %llu", sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { - test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only, + test_err("wrong flags, wanted %lu, have %lu", vacancy_only, em->flags); goto out; } @@ -894,21 +898,21 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { - test_msg("Got an error when we shouldn't have\n"); + test_err("got an error when we shouldn't have"); goto out; } if (em->block_start != sectorsize) { - test_msg("Expected a real extent, got %llu\n", em->block_start); + test_err("expected a real extent, got %llu", 
em->block_start); goto out; } if (em->start != sectorsize || em->len != sectorsize) { - test_msg("Unexpected extent wanted start %u len %u, " - "got start %llu len %llu\n", + test_err( + "unexpected extent wanted start %u len %u, got start %llu len %llu", sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { - test_msg("Unexpected flags set, wanted 0 got %lu\n", + test_err("unexpected flags set, wanted 0 got %lu", em->flags); goto out; } @@ -931,19 +935,19 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) inode = btrfs_new_test_inode(); if (!inode) { - test_msg("Couldn't allocate inode\n"); + test_err("couldn't allocate inode"); return ret; } fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); goto out; } @@ -954,12 +958,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 1) { ret = -EINVAL; - test_msg("Miscount, wanted 1, got %u\n", + test_err("miscount, wanted 1, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -969,12 +973,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); 
goto out; } @@ -986,12 +990,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1002,12 +1006,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) + sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 2) { ret = -EINVAL; - test_msg("Miscount, wanted 2, got %u\n", + test_err("miscount, wanted 2, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1020,12 +1024,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 4) { ret = -EINVAL; - test_msg("Miscount, wanted 4, got %u\n", + test_err("miscount, wanted 4, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1037,12 +1041,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 3) { ret = -EINVAL; - test_msg("Miscount, wanted 3, got %u\n", + test_err("miscount, wanted 3, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1054,12 +1058,12 @@ static int 
test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 4) { ret = -EINVAL; - test_msg("Miscount, wanted 4, got %u\n", + test_err("miscount, wanted 4, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1072,12 +1076,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { - test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents != 3) { ret = -EINVAL; - test_msg("Miscount, wanted 3, got %u\n", + test_err("miscount, wanted 3, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1087,12 +1091,12 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { - test_msg("clear_extent_bit returned %d\n", ret); + test_err("clear_extent_bit returned %d", ret); goto out; } if (BTRFS_I(inode)->outstanding_extents) { ret = -EINVAL; - test_msg("Miscount, wanted 0, got %u\n", + test_err("miscount, wanted 0, got %u", BTRFS_I(inode)->outstanding_extents); goto out; } @@ -1115,14 +1119,14 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); - test_msg("Running btrfs_get_extent tests\n"); + test_msg("running btrfs_get_extent tests"); ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; - test_msg("Running hole first btrfs_get_extent test\n"); + test_msg("running hole first btrfs_get_extent test"); ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; - test_msg("Running outstanding_extents tests\n"); + 
test_msg("running outstanding_extents tests"); return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 39b95783f736..ace94db09d29 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -24,7 +24,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -32,14 +32,14 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { - test_msg("Couldn't insert ref %d\n", ret); + test_err("couldn't insert ref %d", ret); btrfs_free_path(path); return ret; } @@ -74,7 +74,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -82,14 +82,14 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { - test_msg("Couldn't find extent ref\n"); + test_err("couldn't find extent ref"); btrfs_free_path(path); return ret; } @@ -111,7 +111,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); if (ret) - test_msg("Failed to insert backref\n"); + test_err("failed to insert backref"); btrfs_free_path(path); return 
ret; } @@ -124,7 +124,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -132,14 +132,14 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { - test_msg("Didn't find our key %d\n", ret); + test_err("didn't find our key %d", ret); btrfs_free_path(path); return ret; } @@ -158,7 +158,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, NULL); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -166,14 +166,14 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_msg("Couldn't allocate path\n"); + test_err("couldn't allocate path"); return -ENOMEM; } path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { - test_msg("Couldn't find extent ref\n"); + test_err("couldn't find extent ref"); btrfs_free_path(path); return ret; } @@ -195,7 +195,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { - test_msg("Couldn't find backref %d\n", ret); + test_err("couldn't find backref %d", ret); btrfs_free_path(path); return ret; } @@ -213,12 +213,12 @@ static int test_no_shared_qgroup(struct btrfs_root *root, struct ulist *new_roots = NULL; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, fs_info); - test_msg("Qgroup basic add\n"); + test_msg("qgroup basic add"); ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) 
{ - test_msg("Couldn't create a qgroup %d\n", ret); + test_err("couldn't create a qgroup %d", ret); return ret; } @@ -231,7 +231,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -245,20 +245,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } old_roots = NULL; @@ -268,7 +268,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -281,19 +281,19 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return 
-EINVAL; } @@ -314,9 +314,9 @@ static int test_multiple_refs(struct btrfs_root *root, struct ulist *new_roots = NULL; int ret; - btrfs_init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans, fs_info); - test_msg("Qgroup multiple refs test\n"); + test_msg("qgroup multiple refs test"); /* * We have BTRFS_FS_TREE_OBJECTID created already from the @@ -324,7 +324,7 @@ static int test_multiple_refs(struct btrfs_root *root, */ ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { - test_msg("Couldn't create a qgroup %d\n", ret); + test_err("couldn't create a qgroup %d", ret); return ret; } @@ -332,7 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -346,20 +346,20 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -367,7 +367,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -381,26 +381,26 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old 
roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, nodesize, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -408,7 +408,7 @@ static int test_multiple_refs(struct btrfs_root *root, false); if (ret) { ulist_free(old_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } @@ -422,26 +422,26 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) { ulist_free(old_roots); ulist_free(new_roots); - test_msg("Couldn't find old roots: %d\n", ret); + test_err("couldn't find old roots: %d", ret); return ret; } ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, nodesize, old_roots, new_roots); if (ret) { - test_msg("Couldn't account space for a qgroup %d\n", ret); + test_err("couldn't account space for a qgroup %d", ret); return ret; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, 0, 0)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { - test_msg("Qgroup counts didn't match expected values\n"); + test_err("qgroup counts didn't match expected values"); return -EINVAL; } @@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if 
(!fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + test_err("couldn't allocate dummy fs info"); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + test_err("couldn't allocate root"); ret = PTR_ERR(root); goto out; } @@ -485,7 +485,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { - test_msg("Couldn't allocate dummy buffer\n"); + test_err("couldn't allocate dummy buffer"); ret = -ENOMEM; goto out; } @@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_msg("Couldn't allocate a fs root\n"); + test_err("couldn't allocate a fs root"); ret = PTR_ERR(tmp_root); goto out; } @@ -504,13 +504,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { - test_msg("Couldn't insert fs root %d\n", ret); + test_err("couldn't insert fs root %d", ret); goto out; } tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_msg("Couldn't allocate a fs root\n"); + test_err("couldn't allocate a fs root"); ret = PTR_ERR(tmp_root); goto out; } @@ -518,11 +518,11 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { - test_msg("Couldn't insert fs root %d\n", ret); + test_err("couldn't insert fs root %d", ret); goto out; } - test_msg("Running qgroup tests\n"); + test_msg("running qgroup tests"); ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c944b4769e3c..ff5f6c719976 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -877,12 +877,7 @@ static int __btrfs_end_transaction(struct 
btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); - /* - * Make sure counter is updated before we wake up waiters. - */ - smp_mb(); - if (waitqueue_active(&cur_trans->writer_wait)) - wake_up(&cur_trans->writer_wait); + cond_wake_up(&cur_trans->writer_wait); btrfs_put_transaction(cur_trans); if (current->journal_info == trans) @@ -1250,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_orphan_commit_root(trans, root); btrfs_save_ino_cache(root, trans); @@ -1428,7 +1422,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; - struct timespec cur_time; + struct timespec64 cur_time; int ret = 0; u64 to_reserve = 0; u64 index = 0; @@ -1640,15 +1634,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b, - BTRFS_UUID_KEY_SUBVOL, objectid); + ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, + objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, - new_root_item->received_uuid, + ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8c0826bc2c7..94439482a0ec 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -139,7 +139,6 @@ struct btrfs_pending_snapshot { struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; - u64 qgroup_reserved; /* extra metadata reservation for relocation */ int error; bool readonly; diff --git 
a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f23a94dab77..f8220ec02036 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -222,11 +222,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&root->log_writer_wait)) - wake_up(&root->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&root->log_writer_wait); } } @@ -2988,11 +2985,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* - * Implicit memory barrier after atomic_dec_and_test - */ - if (waitqueue_active(&log_root_tree->log_writer_wait)) - wake_up(&log_root_tree->log_writer_wait); + /* atomic_dec_and_test implies a barrier */ + cond_wake_up_nomb(&log_root_tree->log_writer_wait); } if (ret) { @@ -3116,10 +3110,11 @@ out_wake_log_root: mutex_unlock(&log_root_tree->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. */ - if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) - wake_up(&log_root_tree->log_commit_wait[index2]); + cond_wake_up(&log_root_tree->log_commit_wait[index2]); out: mutex_lock(&root->log_mutex); btrfs_remove_all_log_ctxs(root, index1, ret); @@ -3128,10 +3123,11 @@ out: mutex_unlock(&root->log_mutex); /* - * The barrier before waitqueue_active is implied by mutex_unlock + * The barrier before waitqueue_active (in cond_wake_up) is needed so + * all the updates above are seen by the woken threads. It might not be + * necessary, but proving that seems to be hard. 
*/ - if (waitqueue_active(&root->log_commit_wait[index1])) - wake_up(&root->log_commit_wait[index1]); + cond_wake_up(&root->log_commit_wait[index1]); return ret; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 1ba7ca2a4200..3b2ae342e649 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -79,10 +79,10 @@ out: return ret; } -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid_cpu) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; @@ -144,10 +144,10 @@ out: return ret; } -int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; @@ -239,7 +239,7 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, goto out; } - ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid); + ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); btrfs_end_transaction(trans); out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index be3fc701f389..1da162928d1a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid10", + .bg_flag = BTRFS_BLOCK_GROUP_RAID10, + .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, @@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid1", + 
.bg_flag = BTRFS_BLOCK_GROUP_RAID1, + .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, @@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .raid_name = "dup", + .bg_flag = BTRFS_BLOCK_GROUP_DUP, + .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, @@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "raid0", + .bg_flag = BTRFS_BLOCK_GROUP_RAID0, + .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, @@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "single", + .bg_flag = 0, + .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, @@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 1, .ncopies = 2, + .raid_name = "raid5", + .bg_flag = BTRFS_BLOCK_GROUP_RAID5, + .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, @@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 2, .devs_increment = 1, .ncopies = 3, + .raid_name = "raid6", + .bg_flag = BTRFS_BLOCK_GROUP_RAID6, + .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; -const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, - [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, - [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, - [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, - [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, -}; +const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; -/* 
- * Table to convert BTRFS_RAID_* to the error code if minimum number of devices - * condition is not met. Zero means there's no corresponding - * BTRFS_ERROR_DEV_*_NOT_MET value. - */ -const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, - [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, - [BTRFS_RAID_DUP] = 0, - [BTRFS_RAID_RAID0] = 0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, - [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -}; + return btrfs_raid_array[type].raid_name; +} static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * - * volume_mutex - * ------------ - * coarse lock owned by a mounted filesystem; used to exclude some operations - * that cannot run in parallel and affect the higher-level properties of the - * filesystem like: device add/deleting/resize/replace, or balance - * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * device_list_mutex * chunk_mutex * balance_mutex + * + * + * Exclusive operations, BTRFS_FS_EXCL_OP + * ====================================== + * + * Maintains the exclusivity of the following operations that apply to the + * whole filesystem and cannot run in parallel. 
+ * + * - Balance (*) + * - Device add + * - Device remove + * - Device replace (*) + * - Resize + * + * The device operations (as above) can be in one of the following states: + * + * - Running state + * - Paused state + * - Completed state + * + * Only device operations marked with (*) can go into the Paused state for the + * following reasons: + * + * - ioctl (only Balance can be Paused through ioctl) + * - filesystem remounted as read-only + * - filesystem unmounted and mounted as read-only + * - system power-cycle and filesystem mounted as read-only + * - filesystem or device errors leading to forced read-only + * + * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. + * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * A device operation in Paused or Running state can be canceled or resumed + * either by ioctl (Balance only) or when remounted as read-write. + * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * completed. 
*/ DEFINE_MUTEX(uuid_mutex); @@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); - INIT_LIST_HEAD(&fs_devs->list); + INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); return fs_devs; } -static void free_device(struct btrfs_device *device) +void btrfs_free_device(struct btrfs_device *device) { rcu_string_free(device->name); bio_put(device->flush_bio); @@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - free_device(device); + btrfs_free_device(device); } kfree(fs_devices); } @@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void) while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, list); - list_del(&fs_devices->list); + struct btrfs_fs_devices, fs_list); + list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } @@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void) /* * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. * Returned struct is not linked onto any lists and must be destroyed using - * free_device. + * btrfs_free_device. 
*/ static struct btrfs_device *__alloc_device(void) { @@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void) static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, u64 devid, const u8 *uuid) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *dev; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (dev->devid == devid && (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; @@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) { struct btrfs_fs_devices *fs_devices; - list_for_each_entry(fs_devices, &fs_uuids, list) { + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) return fs_devices; } @@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path, struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; struct btrfs_device *dev, *tmp_dev; - list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) { if (fs_devs->opened) continue; @@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path, /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); - list_del(&fs_devs->list); + list_del(&fs_devs->fs_list); free_fs_devices(fs_devs); break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); - free_device(dev); + btrfs_free_device(dev); } } } @@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - list_add(&fs_devices->list, &fs_uuids); + list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { @@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - free_device(device); + btrfs_free_device(device); return ERR_PTR(-ENOMEM); } 
rcu_assign_pointer(device->name, name); @@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - free_device(device); + btrfs_free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -938,7 +970,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - free_device(device); + btrfs_free_device(device); } if (fs_devices->seed) { @@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head) struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - free_device(device); + btrfs_free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) @@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) new_device->fs_devices = device->fs_devices; } -static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +static int close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; struct list_head pending_put; @@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) int ret; mutex_lock(&uuid_mutex); - ret = __btrfs_close_devices(fs_devices); + ret = close_fs_devices(fs_devices); if (!fs_devices->opened) { seed_devices = fs_devices->seed; fs_devices->seed = NULL; @@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) while (seed_devices) { fs_devices = seed_devices; seed_devices = fs_devices->seed; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } return ret; } -static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, +static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; int ret = 0; flags |= FMODE_EXCL; - 
list_for_each_entry(device, head, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { /* Just open everything we can; ignore failures here */ if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; @@ -1116,14 +1147,17 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int ret; mutex_lock(&uuid_mutex); + mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; } else { list_sort(NULL, &fs_devices->devices, devid_cmp); - ret = __btrfs_open_devices(fs_devices, flags, holder); + ret = open_fs_devices(fs_devices, flags, holder); } + mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + return ret; } @@ -1201,31 +1235,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, */ bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; - mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { ret = -EINVAL; goto error_bdev_put; } + mutex_lock(&uuid_mutex); device = device_list_add(path, disk_super); if (IS_ERR(device)) ret = PTR_ERR(device); else *fs_devices_ret = device->fs_devices; + mutex_unlock(&uuid_mutex); btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); -error: - mutex_unlock(&uuid_mutex); + return ret; } @@ -1857,11 +1889,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - if (!(all_avail & btrfs_raid_group[i])) + if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) { - int ret = btrfs_raid_mindev_error[i]; + int ret = btrfs_raid_array[i].mindev_error; if (ret) return ret; @@ -1917,13 +1949,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const 
char *device_path, { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); - num_devices = fs_info->fs_devices->num_devices; + num_devices = fs_devices->num_devices; btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); @@ -1986,27 +2018,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * (super_copy) should hold the device list mutex. */ + /* + * In normal cases the cur_devices == fs_devices. But in case + * of deleting a seed device, the cur_devices should point to + * its own fs_devices listed under the fs_devices->seed. + */ cur_devices = device->fs_devices; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); - device->fs_devices->num_devices--; - device->fs_devices->total_devices--; + cur_devices->num_devices--; + cur_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - device->fs_devices->missing_devices--; + cur_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); if (device->bdev) { - device->fs_devices->open_devices--; + cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; btrfs_set_super_num_devices(fs_info->super_copy, num_devices); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * at this point, the device is zero sized and detached from @@ -2020,8 +2057,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, call_rcu(&device->rcu, free_device_rcu); if 
(cur_devices->open_devices == 0) { - struct btrfs_fs_devices *fs_devices; - fs_devices = fs_info->fs_devices; while (fs_devices) { if (fs_devices->seed == cur_devices) { fs_devices->seed = cur_devices->seed; @@ -2030,20 +2065,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - __btrfs_close_devices(cur_devices); + close_fs_devices(cur_devices); free_fs_devices(cur_devices); } out: mutex_unlock(&uuid_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, - &fs_info->fs_devices->alloc_list); + &fs_devices->alloc_list); device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } @@ -2112,7 +2146,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, tmp_fs_devices = tmp_fs_devices->seed; } fs_devices->seed = NULL; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } } @@ -2120,23 +2154,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - mutex_lock(&uuid_mutex); + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + WARN_ON(!tgtdev); - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_devices, tgtdev); if (tgtdev->bdev) - fs_info->fs_devices->open_devices--; + fs_devices->open_devices--; - fs_info->fs_devices->num_devices--; + fs_devices->num_devices--; btrfs_assign_next_active_device(fs_info, tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * The 
update_dev_time() with in btrfs_scratch_superblocks() @@ -2188,10 +2222,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, struct btrfs_device *tmp; devices = &fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held by the caller. - */ list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { @@ -2259,7 +2289,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) return PTR_ERR(old_devices); } - list_add(&old_devices->list, &fs_uuids); + list_add(&old_devices->fs_list, &fs_uuids); memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); seed_devices->opened = 1; @@ -2570,7 +2600,7 @@ error_trans: if (trans) btrfs_end_transaction(trans); error_free_device: - free_device(device); + btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2580,99 +2610,6 @@ error: return ret; } -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device *srcdev, - struct btrfs_device **device_out) -{ - struct btrfs_device *device; - struct block_device *bdev; - struct list_head *devices; - struct rcu_string *name; - u64 devid = BTRFS_DEV_REPLACE_DEVID; - int ret = 0; - - *device_out = NULL; - if (fs_info->fs_devices->seeding) { - btrfs_err(fs_info, "the filesystem is a seed filesystem!"); - return -EINVAL; - } - - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); - if (IS_ERR(bdev)) { - btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); - } - - filemap_write_and_wait(bdev->bd_inode->i_mapping); - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - if (device->bdev == bdev) { - btrfs_err(fs_info, - "target device is in the filesystem!"); - ret = -EEXIST; - goto error; - } - } - - - if (i_size_read(bdev->bd_inode) < - 
btrfs_device_get_total_bytes(srcdev)) { - btrfs_err(fs_info, - "target device is smaller than source device!"); - ret = -EINVAL; - goto error; - } - - - device = btrfs_alloc_device(NULL, &devid, NULL); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; - } - - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - - mutex_lock(&fs_info->fs_devices->device_list_mutex); - set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - device->generation = 0; - device->io_width = fs_info->sectorsize; - device->io_align = fs_info->sectorsize; - device->sector_size = fs_info->sectorsize; - device->total_bytes = btrfs_device_get_total_bytes(srcdev); - device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); - device->bytes_used = btrfs_device_get_bytes_used(srcdev); - device->commit_total_bytes = srcdev->commit_total_bytes; - device->commit_bytes_used = device->bytes_used; - device->fs_info = fs_info; - device->bdev = bdev; - set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; - device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - *device_out = device; - return ret; - -error: - blkdev_put(bdev, FMODE_EXCL); - return ret; -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -3273,24 +3210,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl) } /* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. 
Same goes for unset_balance_control. + * Clear the balance status in fs_info and delete the balance item from disk. */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) +static void reset_balance_state(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; + int ret; BUG_ON(!fs_info->balance_ctl); @@ -3299,6 +3224,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); kfree(bctl); + ret = del_balance_item(fs_info); + if (ret) + btrfs_handle_fs_error(fs_info, ret, NULL); } /* @@ -3835,18 +3763,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info); - if (ret) - btrfs_handle_fs_error(fs_info, ret, NULL); - - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); -} - /* Non-zero return value signifies invalidity */ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, u64 allowed) @@ -3857,12 +3773,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* - * Should be called with both balance and volume mutexes held + * Should be called with balance mutexe held */ -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { - struct btrfs_fs_info *fs_info = bctl->fs_info; u64 meta_target, data_target; u64 allowed; int mixed = 0; @@ -3891,7 +3807,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->flags & 
BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { btrfs_err(fs_info, - "with mixed groups data and metadata balance options must be the same"); + "balance: mixed groups data and metadata options must be the same"); ret = -EINVAL; goto out; } @@ -3913,23 +3829,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); if (validate_convert_profile(&bctl->data, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->data.target); + btrfs_err(fs_info, - "unable to start balance with target data profile %llu", - bctl->data.target); + "balance: invalid convert data profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->meta, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); + btrfs_err(fs_info, - "unable to start balance with target metadata profile %llu", - bctl->meta.target); + "balance: invalid convert metadata profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); + btrfs_err(fs_info, - "unable to start balance with target system profile %llu", - bctl->sys.target); + "balance: invalid convert system profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } @@ -3950,10 +3872,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "force reducing metadata integrity"); + "balance: force reducing metadata integrity"); } else { btrfs_err(fs_info, - "balance will reduce metadata integrity, use force if you want this"); + "balance: reduces metadata integrity, use --force if you want this"); ret = -EINVAL; goto out; } @@ -3967,9 +3889,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, bctl->data.target : fs_info->avail_data_alloc_bits; if 
(btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { + int meta_index = btrfs_bg_flags_to_raid_index(meta_target); + int data_index = btrfs_bg_flags_to_raid_index(data_target); + btrfs_warn(fs_info, - "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - meta_target, data_target); + "balance: metadata profile %s has lower redundancy than data profile %s", + get_raid_name(meta_index), get_raid_name(data_index)); } ret = insert_balance_item(fs_info, bctl); @@ -3978,7 +3903,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { BUG_ON(ret == -EEXIST); - set_balance_control(bctl); + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); } else { BUG_ON(ret != -EEXIST); spin_lock(&fs_info->balance_lock); @@ -3986,22 +3914,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl, spin_unlock(&fs_info->balance_lock); } - atomic_inc(&fs_info->balance_running); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); + btrfs_update_ioctl_balance_args(fs_info, bargs); } if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { - __cancel_balance(fs_info); + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } wake_up(&fs_info->balance_wait_q); @@ -4009,11 +3939,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, return ret; out: if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else { + reset_balance_state(fs_info); + 
else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); - } + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; } @@ -4022,16 +3952,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "continuing balance"); - ret = btrfs_balance(fs_info->balance_ctl, NULL); + btrfs_info(fs_info, "balance: resuming"); + ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); } - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; } @@ -4040,15 +3966,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) { struct task_struct *tsk; - spin_lock(&fs_info->balance_lock); + mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); return 0; } - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { - btrfs_info(fs_info, "force skipping balance"); + btrfs_info(fs_info, "balance: resume skipped"); return 0; } @@ -4100,7 +4026,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - bctl->fs_info = fs_info; bctl->flags = btrfs_balance_flags(leaf, item); bctl->flags |= BTRFS_BALANCE_RESUME; @@ -4111,15 +4036,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); + /* + * This should never happen, as the paused balance state is recovered + * during mount without any chance of other exclusive ops to collide. 
+ * + * This gives the exclusive op status to balance and keeps in paused + * state until user intervention (cancel or umount). If the ownership + * cannot be assigned, show a message but do not fail. The balance + * is in a paused state and must have fs_info::balance_ctl properly + * set up. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + btrfs_warn(fs_info, + "balance: cannot set exclusive op status, resume manually"); - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); out: btrfs_free_path(path); return ret; @@ -4135,16 +4071,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return -ENOTCONN; } - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); + BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; @@ -4156,38 +4092,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { - if (sb_rdonly(fs_info->sb)) - return -EROFS; - mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } + /* + * A paused balance with the item stored on disk can be resumed at + * mount time if the mount is read-write. 
Otherwise it's still paused + * and we must not allow cancelling as it deletes the item. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_unlock(&fs_info->balance_mutex); + return -EROFS; + } + atomic_inc(&fs_info->balance_cancel_req); /* * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { - /* __cancel_balance needs volume_mutex */ mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); + /* + * Lock released to allow other waiters to continue, we'll + * reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); + if (fs_info->balance_ctl) { + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_info(fs_info, "balance: canceled"); + } } - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + BUG_ON(fs_info->balance_ctl || + test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; @@ -4264,8 +4211,7 @@ static int btrfs_uuid_scan_kthread(void *data) } update_tree: if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, - root_item.uuid, + ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { @@ -4276,7 +4222,7 @@ update_tree: } if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, + ret = btrfs_uuid_tree_add(trans, root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); @@ -4482,7 +4428,7 
@@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = READA_FORWARD; + path->reada = READA_BACK; mutex_lock(&fs_info->chunk_mutex); @@ -6043,9 +5989,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len) +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -6077,8 +6022,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { - if (devid && map->stripes[i].dev->devid != devid) - continue; if (map->stripes[i].physical > physical || map->stripes[i].physical + length <= physical) continue; @@ -6410,7 +6353,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be - * destroyed with free_device. + * destroyed with btrfs_free_device. 
*/ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6433,7 +6376,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - free_device(dev); + btrfs_free_device(dev); return ERR_PTR(ret); } } @@ -6684,8 +6627,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = __btrfs_open_devices(fs_devices, FMODE_READ, - fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); fs_devices = ERR_PTR(ret); @@ -6693,7 +6635,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, } if (!fs_devices->seeding) { - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); fs_devices = ERR_PTR(-EINVAL); goto out; @@ -7002,6 +6944,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; + /* + * uuid_mutex is needed only if we are mounting a sprout FS + * otherwise we don't need it. 
+ */ mutex_lock(&uuid_mutex); mutex_lock(&fs_info->chunk_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 79096884654f..5139ec8daf4c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -208,6 +208,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + struct list_head fs_list; u64 num_devices; u64 open_devices; @@ -229,7 +230,6 @@ struct btrfs_fs_devices { struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; - struct list_head list; struct btrfs_fs_devices *seed; int seeding; @@ -329,11 +329,12 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int mindev_error; /* error code if min devs requisite is unmet */ + const char raid_name[8]; /* name of the raid */ + u64 bg_flag; /* block group flag of the raid */ }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; -extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; @@ -351,8 +352,6 @@ struct map_lookup { struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { - struct btrfs_fs_info *fs_info; - struct btrfs_balance_args data; struct btrfs_balance_args meta; struct btrfs_balance_args sys; @@ -393,9 +392,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len); int 
btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, @@ -421,6 +419,7 @@ int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); +void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid); void __exit btrfs_cleanup_fs_uuids(void); @@ -431,11 +430,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device *srcdev, - struct btrfs_device **device_out); -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); @@ -553,6 +549,8 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } +const char *get_raid_name(enum btrfs_raid_types type); + void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); diff --git a/fs/buffer.c b/fs/buffer.c index 249b83fafe48..cabc045f483d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3427,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. 
- * - * Returns the offset within the file on success, and -ENOENT otherwise. - */ -static loff_t -page_seek_hole_data(struct page *page, loff_t lastoff, int whence) -{ - loff_t offset = page_offset(page); - struct buffer_head *bh, *head; - bool seek_data = whence == SEEK_DATA; - - if (lastoff < offset) - lastoff = offset; - - bh = head = page_buffers(page); - do { - offset += bh->b_size; - if (lastoff >= offset) - continue; - - /* - * Unwritten extents that have data in the page cache covering - * them can be identified by the BH_Unwritten state flag. - * Pages with multiple buffers might have a mix of holes, data - * and unwritten extents - any buffer with valid data in it - * should have BH_Uptodate flag set on it. - */ - - if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) - return lastoff; - - lastoff = offset; - } while ((bh = bh->b_this_page) != head); - return -ENOENT; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: unwritten and uptodate buffer heads count as data; - * everything else counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ -loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. 
However, page->index will not change - * because we have a reference on the page. - * - * If current page offset is beyond where we've ended, - * we've found a hole. - */ - if (whence == SEEK_HOLE && - lastoff < page_offset(page)) - goto check_range; - - lock_page(page); - if (likely(page->mapping == inode->i_mapping) && - page_has_buffers(page)) { - lastoff = page_seek_hole_data(page, lastoff, whence); - if (lastoff >= 0) { - unlock_page(page); - goto check_range; - } - } - unlock_page(page); - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - void __init buffer_init(void) { unsigned long nrpages; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index d9f001078e08..4a717d400807 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -218,7 +218,8 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) "%s", fsdef->dentry->d_sb->s_id); - fscache_object_init(&fsdef->fscache, NULL, &cache->cache); + fscache_object_init(&fsdef->fscache, &fscache_fsdef_index, + &cache->cache); ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag); if (ret < 0) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 0daa1e3fe0df..af2b17b21b94 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -186,12 +186,12 @@ try_again: * need to wait for it to be destroyed */ wait_for_old_object: trace_cachefiles_wait_active(object, dentry, xobject); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); if (fscache_object_is_live(&xobject->fscache)) { pr_err("\n"); pr_err("Error: Unexpected object collision\n"); cachefiles_printk_object(object, xobject); - BUG(); } atomic_inc(&xobject->usage); 
write_unlock(&cache->active_lock); @@ -248,7 +248,6 @@ wait_for_old_object: goto try_again; requeue: - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; @@ -572,6 +571,11 @@ lookup_again: if (ret < 0) goto create_error; + if (unlikely(d_unhashed(next))) { + dput(next); + inode_unlock(d_inode(dir)); + goto lookup_again; + } ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", @@ -764,6 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, /* search the current directory for the element name */ inode_lock(d_inode(dir)); +retry: start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); cachefiles_hist(cachefiles_lookup_histogram, start); @@ -793,6 +798,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, if (ret < 0) goto mkdir_error; + if (unlikely(d_unhashed(subdir))) { + dput(subdir); + goto retry; + } ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c index 125b90f6c796..0ce1aa56b67f 100644 --- a/fs/cachefiles/proc.c +++ b/fs/cachefiles/proc.c @@ -85,21 +85,6 @@ static const struct seq_operations cachefiles_histogram_ops = { }; /* - * open "/proc/fs/cachefiles/XXX" which provide statistics summaries - */ -static int cachefiles_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &cachefiles_histogram_ops); -} - -static const struct file_operations cachefiles_histogram_fops = { - .open = cachefiles_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* * initialise the /proc/fs/cachefiles/ directory */ int __init cachefiles_proc_init(void) @@ -109,8 +94,8 @@ int __init cachefiles_proc_init(void) if (!proc_mkdir("fs/cachefiles", NULL)) goto error_dir; - if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - 
&cachefiles_histogram_fops)) + if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_ops)) goto error_histogram; _leave(" = 0"); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 5082c8a49686..40f7595aad10 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -27,6 +27,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_one_read *monitor = container_of(wait, struct cachefiles_one_read, monitor); struct cachefiles_object *object; + struct fscache_retrieval *op = monitor->op; struct wait_bit_key *key = _key; struct page *page = wait->private; @@ -51,16 +52,22 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ - ASSERT(monitor->op); + ASSERT(op); - object = container_of(monitor->op->op.object, - struct cachefiles_object, fscache); + /* We need to temporarily bump the usage count as we don't own a ref + * here otherwise cachefiles_read_copier() may free the op between the + * monitor being enqueued on the op->to_do list and the op getting + * enqueued on the work queue. 
+ */ + fscache_get_retrieval(op); + object = container_of(op->op.object, struct cachefiles_object, fscache); spin_lock(&object->work_lock); - list_add_tail(&monitor->op_link, &monitor->op->to_do); + list_add_tail(&monitor->op_link, &op->to_do); spin_unlock(&object->work_lock); - fscache_enqueue_retrieval(monitor->op); + fscache_enqueue_retrieval(op); + fscache_put_retrieval(op); return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5f7ad3d0df2e..292b3d72d725 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -370,7 +370,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* build page vector */ nr_pages = calc_pages_for(0, len); - pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); + pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_put; @@ -574,6 +574,7 @@ static u64 get_writepages_data_length(struct inode *inode, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { + struct timespec ts; struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; @@ -624,11 +625,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); + ts = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - &inode->i_mtime, &page, 1); + &ts, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) @@ -966,8 +968,9 @@ get_more_pages: BUG_ON(pages); max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc(max_pages * sizeof (*pages), - GFP_NOFS); + pages = kmalloc_array(max_pages, + sizeof(*pages), + GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; pages = mempool_alloc(pool, GFP_NOFS); @@ -1113,8 +1116,8 @@ new_request: /* allocate new pages array for next request */ data_pages = pages; - pages = 
kmalloc(locked_pages * sizeof (*pages), - GFP_NOFS); + pages = kmalloc_array(locked_pages, sizeof(*pages), + GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; pages = mempool_alloc(pool, GFP_NOFS); @@ -1131,7 +1134,7 @@ new_request: pages = NULL; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1731,7 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1773,7 +1776,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1934,8 +1937,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = ci->vfs_inode.i_mtime; - wr_req->r_abort_on_full = true; + wr_req->r_mtime = timespec64_to_timespec(ci->vfs_inode.i_mtime); err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bb524c880b1e..362900e42424 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -130,7 +130,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = inode->i_mtime; + aux.mtime = timespec64_to_timespec(inode->i_mtime); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -163,7 +163,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!ci->fscache) { 
memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = inode->i_mtime; + aux.mtime = timespec64_to_timespec(inode->i_mtime); ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, &ci->i_vino, sizeof(ci->i_vino), diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 23dbfae16156..990258cbd836 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -69,6 +69,8 @@ static char *gcap_string(char *s, int c) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; + if (c & CEPH_CAP_GWREXTEND) + *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; @@ -1358,9 +1360,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.xattr_buf = NULL; } - arg.mtime = inode->i_mtime; - arg.atime = inode->i_atime; - arg.ctime = inode->i_ctime; + arg.mtime = timespec64_to_timespec(inode->i_mtime); + arg.atime = timespec64_to_timespec(inode->i_atime); + arg.ctime = timespec64_to_timespec(inode->i_ctime); arg.op = op; arg.caps = cap->implemented; @@ -3022,30 +3024,41 @@ static void invalidate_aliases(struct inode *inode) dput(prev); } +struct cap_extra_info { + struct ceph_string *pool_ns; + /* inline data */ + u64 inline_version; + void *inline_data; + u32 inline_len; + /* dirstat */ + bool dirstat_valid; + u64 nfiles; + u64 nsubdirs; + /* currently issued */ + int issued; +}; + /* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. 
*/ -static void handle_cap_grant(struct ceph_mds_client *mdsc, - struct inode *inode, struct ceph_mds_caps *grant, - struct ceph_string **pns, u64 inline_version, - void *inline_data, u32 inline_len, - struct ceph_buffer *xattr_buf, +static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued) + struct ceph_cap *cap, + struct ceph_mds_caps *grant, + struct ceph_buffer *xattr_buf, + struct cap_extra_info *extra_info) __releases(ci->i_ceph_lock) - __releases(mdsc->snap_rwsem) + __releases(session->s_mdsc->snap_rwsem) { struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); - struct timespec mtime, atime, ctime; int check_caps = 0; bool wake = false; bool writeback = false; @@ -3055,7 +3068,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool fill_inline = false; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", - inode, cap, mds, seq, ceph_cap_string(newcaps)); + inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -3101,7 +3114,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, __check_cap_issue(ci, cap, newcaps); if ((newcaps & CEPH_CAP_AUTH_SHARED) && - (issued & CEPH_CAP_AUTH_EXCL) == 0) { + (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); @@ -3110,15 +3123,16 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, from_kgid(&init_user_ns, inode->i_gid)); } - if ((newcaps & CEPH_CAP_AUTH_SHARED) && - (issued & CEPH_CAP_LINK_EXCL) == 0) { + if ((newcaps & CEPH_CAP_LINK_SHARED) && + (extra_info->issued & 
CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0 && (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) deleted_inode = true; } - if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && + grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); @@ -3134,15 +3148,21 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } if (newcaps & CEPH_CAP_ANY_RD) { + struct timespec mtime, atime, ctime; /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); ceph_decode_timespec(&atime, &grant->atime); ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, + ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); } + if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { + ci->i_files = extra_info->nfiles; + ci->i_subdirs = extra_info->nsubdirs; + } + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ s64 old_pool = ci->i_layout.pool_id; @@ -3151,15 +3171,16 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, lockdep_is_held(&ci->i_ceph_lock)); - rcu_assign_pointer(ci->i_layout.pool_ns, *pns); + rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); - if (ci->i_layout.pool_id != old_pool || *pns != old_ns) + if (ci->i_layout.pool_id != old_pool || + extra_info->pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; - *pns = old_ns; + extra_info->pool_ns = old_ns; /* size/truncate_seq? 
*/ - queue_trunc = ceph_fill_file_size(inode, issued, + queue_trunc = ceph_fill_file_size(inode, extra_info->issued, le32_to_cpu(grant->truncate_seq), le64_to_cpu(grant->truncate_size), size); @@ -3238,24 +3259,26 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } BUG_ON(cap->issued & ~cap->implemented); - if (inline_version > 0 && inline_version >= ci->i_inline_version) { - ci->i_inline_version = inline_version; + if (extra_info->inline_version > 0 && + extra_info->inline_version >= ci->i_inline_version) { + ci->i_inline_version = extra_info->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) fill_inline = true; } if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - if (newcaps & ~issued) + if (newcaps & ~extra_info->issued) wake = true; - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); + kick_flushing_inode_caps(session->s_mdsc, session, inode); + up_read(&session->s_mdsc->snap_rwsem); } else { spin_unlock(&ci->i_ceph_lock); } if (fill_inline) - ceph_fill_inline_data(inode, NULL, inline_data, inline_len); + ceph_fill_inline_data(inode, NULL, extra_info->inline_data, + extra_info->inline_len); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -3720,31 +3743,25 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; struct ceph_snap_realm *realm = NULL; - struct ceph_string *pool_ns = NULL; - int mds = session->s_mds; - int op, issued; + int op; + int msg_version = le16_to_cpu(msg->hdr.version); u32 seq, mseq; struct ceph_vino vino; - u64 tid; - u64 inline_version = 0; - void *inline_data = NULL; - u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; void *p, *end; + struct cap_extra_info extra_info = 
{}; - dout("handle_caps from mds%d\n", mds); + dout("handle_caps from mds%d\n", session->s_mds); /* decode */ end = msg->front.iov_base + msg->front.iov_len; - tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; @@ -3758,7 +3775,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); p = snaptrace + snaptrace_len; - if (le16_to_cpu(msg->hdr.version) >= 2) { + if (msg_version >= 2) { u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) @@ -3766,7 +3783,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += flock_len; } - if (le16_to_cpu(msg->hdr.version) >= 3) { + if (msg_version >= 3) { if (op == CEPH_CAP_OP_IMPORT) { if (p + sizeof(*peer) > end) goto bad; @@ -3778,16 +3795,16 @@ void ceph_handle_caps(struct ceph_mds_session *session, } } - if (le16_to_cpu(msg->hdr.version) >= 4) { - ceph_decode_64_safe(&p, end, inline_version, bad); - ceph_decode_32_safe(&p, end, inline_len, bad); - if (p + inline_len > end) + if (msg_version >= 4) { + ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); + ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); + if (p + extra_info.inline_len > end) goto bad; - inline_data = p; - p += inline_len; + extra_info.inline_data = p; + p += extra_info.inline_len; } - if (le16_to_cpu(msg->hdr.version) >= 5) { + if (msg_version >= 5) { struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; u32 epoch_barrier; @@ -3795,7 +3812,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); } - if (le16_to_cpu(msg->hdr.version) >= 8) { + if (msg_version >= 8) { u64 flush_tid; u32 caller_uid, caller_gid; u32 pool_ns_len; @@ -3809,13 +3826,33 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { ceph_decode_need(&p, end, pool_ns_len, bad); - pool_ns = 
ceph_find_or_create_string(p, pool_ns_len); + extra_info.pool_ns = + ceph_find_or_create_string(p, pool_ns_len); p += pool_ns_len; } } + if (msg_version >= 11) { + struct ceph_timespec *btime; + u64 change_attr; + u32 flags; + + /* version >= 9 */ + if (p + sizeof(*btime) > end) + goto bad; + btime = p; + p += sizeof(*btime); + ceph_decode_64_safe(&p, end, change_attr, bad); + /* version >= 10 */ + ceph_decode_32_safe(&p, end, flags, bad); + /* version >= 11 */ + extra_info.dirstat_valid = true; + ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); + ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); + } + /* lookup ino */ - inode = ceph_find_inode(sb, vino); + inode = ceph_find_inode(mdsc->fsc->sb, vino); ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); @@ -3848,7 +3885,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: - handle_cap_flushsnap_ack(inode, tid, h, session); + handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session); goto done; case CEPH_CAP_OP_EXPORT: @@ -3867,10 +3905,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, down_read(&mdsc->snap_rwsem); } handle_cap_import(mdsc, inode, h, peer, session, - &cap, &issued); - handle_cap_grant(mdsc, inode, h, &pool_ns, - inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + &cap, &extra_info.issued); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3878,10 +3915,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); - cap = __get_cap_for_mds(ceph_inode(inode), mds); + cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { dout(" no cap on %p ino %llx.%llx from mds%d\n", - inode, ceph_ino(inode), 
ceph_snap(inode), mds); + inode, ceph_ino(inode), ceph_snap(inode), + session->s_mds); spin_unlock(&ci->i_ceph_lock); goto flush_cap_releases; } @@ -3890,15 +3928,15 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - __ceph_caps_issued(ci, &issued); - issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, &pool_ns, - inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + __ceph_caps_issued(ci, &extra_info.issued); + extra_info.issued |= __ceph_caps_dirty(ci); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: - handle_cap_flush_ack(inode, tid, h, session, cap); + handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session, cap); break; case CEPH_CAP_OP_TRUNC: @@ -3925,7 +3963,7 @@ done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); - ceph_put_string(pool_ns); + ceph_put_string(extra_info.pool_ns); return; bad: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1a78dd6f8bf2..036ac0f3a393 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1486,6 +1486,8 @@ const struct file_operations ceph_dir_fops = { .release = ceph_release, .unlocked_ioctl = ceph_ioctl, .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, }; const struct file_operations ceph_snapdir_fops = { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cf0e45b10121..ad0bed99b1d5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -895,7 +895,6 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; - req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: @@ -924,7 +923,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int num_pages = 0; int flags; int ret; - struct timespec mtime = current_time(inode); + struct timespec mtime = 
timespec64_to_timespec(current_time(inode)); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; @@ -1132,7 +1131,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, int flags; int ret; bool check_caps = false; - struct timespec mtime = current_time(inode); + struct timespec mtime = timespec64_to_timespec(current_time(inode)); size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -1664,7 +1663,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ae056927080d..a866be999216 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -662,6 +662,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, struct timespec *mtime, struct timespec *atime) { struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 ctime64 = timespec_to_timespec64(*ctime); + struct timespec64 mtime64 = timespec_to_timespec64(*mtime); + struct timespec64 atime64 = timespec_to_timespec64(*atime); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -670,39 +673,39 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec_compare(ctime, &inode->i_ctime) > 0) { - dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + timespec64_compare(&ctime64, &inode->i_ctime) > 0) { + dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", + (long long)inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + (long long)ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = ctime64; } if (ci->i_version == 0 || 
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ - dout("mtime %ld.%09ld -> %ld.%09ld " + dout("mtime %lld.%09ld -> %lld.%09ld " "tw %d -> %d\n", - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec, + (long long)inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + (long long)mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode->i_mtime = mtime64; + inode->i_atime = atime64; ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec_compare(mtime, &inode->i_mtime) > 0) { - dout("mtime %ld.%09ld -> %ld.%09ld inc\n", - inode->i_mtime.tv_sec, + if (timespec64_compare(&mtime64, &inode->i_mtime) > 0) { + dout("mtime %lld.%09ld -> %lld.%09ld inc\n", + (long long)inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + (long long)mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = mtime64; } - if (timespec_compare(atime, &inode->i_atime) > 0) { - dout("atime %ld.%09ld -> %ld.%09ld inc\n", - inode->i_atime.tv_sec, + if (timespec64_compare(&atime64, &inode->i_atime) > 0) { + dout("atime %lld.%09ld -> %lld.%09ld inc\n", + (long long)inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + (long long)atime->tv_sec, atime->tv_nsec); + inode->i_atime = atime64; } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -712,9 +715,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode->i_ctime = ctime64; + inode->i_mtime = mtime64; + inode->i_atime = atime64; ci->i_time_warp_seq = time_warp_seq; 
} else { warn = 1; @@ -739,7 +742,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int issued = 0, implemented, new_issued; + int issued, new_issued, info_caps; struct timespec mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; struct ceph_string *pool_ns = NULL; @@ -754,8 +757,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + info_caps = le32_to_cpu(info->cap.caps); + /* prealloc new cap struct */ - if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) new_cap = ceph_get_cap(mdsc, caps_reservation); /* @@ -792,9 +797,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, le64_to_cpu(info->version) > (ci->i_version & ~1))) new_version = true; - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - new_issued = ~issued & le32_to_cpu(info->cap.caps); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + new_issued = ~issued & info_caps; /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); @@ -826,6 +831,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page, &ctime, &mtime, &atime); } + if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) { + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + } + if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { s64 old_pool = ci->i_layout.pool_id; @@ -854,6 +864,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } } + /* layout and rstat are not tracked by capability, update them if + * the inode info is from auth mds */ + if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) { + if 
(S_ISDIR(inode->i_mode)) { + ci->i_dir_layout = iinfo->dir_layout; + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ceph_decode_timespec(&ci->i_rctime, &info->rctime); + } + } + /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && @@ -870,7 +892,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } /* finally update i_version */ - ci->i_version = le64_to_cpu(info->version); + if (le64_to_cpu(info->version) > ci->i_version) + ci->i_version = le64_to_cpu(info->version); inode->i_mapping->a_ops = &ceph_aops; @@ -918,15 +941,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, case S_IFDIR: inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; - - ci->i_dir_layout = iinfo->dir_layout; - - ci->i_files = le64_to_cpu(info->files); - ci->i_subdirs = le64_to_cpu(info->subdirs); - ci->i_rbytes = le64_to_cpu(info->rbytes); - ci->i_rfiles = le64_to_cpu(info->rfiles); - ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); - ceph_decode_timespec(&ci->i_rctime, &info->rctime); break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -934,12 +948,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } /* were we issued a capability? */ - if (info->cap.caps) { + if (info_caps) { if (ceph_snap(inode) == CEPH_NOSNAP) { - unsigned caps = le32_to_cpu(info->cap.caps); ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, caps, + cap_fmode, info_caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), @@ -949,7 +962,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* set dir completion flag? 
*/ if (S_ISDIR(inode->i_mode) && ci->i_files == 0 && ci->i_subdirs == 0 && - (caps & CEPH_CAP_FILE_SHARED) && + (info_caps & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); @@ -962,8 +975,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page, wake = true; } else { dout(" %p got snap_caps %s\n", inode, - ceph_cap_string(le32_to_cpu(info->cap.caps))); - ci->i_snap_caps |= le32_to_cpu(info->cap.caps); + ceph_cap_string(info_caps)); + ci->i_snap_caps |= info_caps; if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); } @@ -978,8 +991,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; ci->i_inline_version = iinfo->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && - (locked_page || - (le32_to_cpu(info->cap.caps) & cache_caps))) + (locked_page || (info_caps & cache_caps))) fill_inline = true; } @@ -1123,6 +1135,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); + dput(dn); dn = realdn; /* note realdn contains the error */ goto out; } else if (realdn) { @@ -1941,6 +1954,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + struct timespec ts; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) @@ -2015,44 +2029,44 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } if (ia_valid & ATTR_ATIME) { - dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode, - inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, + (long long)inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + (long long)attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); 
if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; inode->i_atime = attr->ia_atime; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec_compare(&inode->i_atime, + timespec64_compare(&inode->i_atime, &attr->ia_atime) < 0) { inode->i_atime = attr->ia_atime; dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec_equal(&inode->i_atime, &attr->ia_atime)) { - ceph_encode_timespec(&req->r_args.setattr.atime, - &attr->ia_atime); + !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + ts = timespec64_to_timespec(attr->ia_atime); + ceph_encode_timespec(&req->r_args.setattr.atime, &ts); mask |= CEPH_SETATTR_ATIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } if (ia_valid & ATTR_MTIME) { - dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode, - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, + (long long)inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + (long long)attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; inode->i_mtime = attr->ia_mtime; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec_compare(&inode->i_mtime, + timespec64_compare(&inode->i_mtime, &attr->ia_mtime) < 0) { inode->i_mtime = attr->ia_mtime; dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { - ceph_encode_timespec(&req->r_args.setattr.mtime, - &attr->ia_mtime); + !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + ts = timespec64_to_timespec(attr->ia_mtime); + ceph_encode_timespec(&req->r_args.setattr.mtime, &ts); mask |= CEPH_SETATTR_MTIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2082,9 +2096,9 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (ia_valid & ATTR_CTIME) { 
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; - dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, + (long long)inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + (long long)attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); if (only) { /* @@ -2126,7 +2140,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - req->r_stamp = attr->ia_ctime; + req->r_stamp = timespec64_to_timespec(attr->ia_ctime); err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, @@ -2178,6 +2192,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int mode; int err; if (ceph_snap(inode) == CEPH_SNAPDIR) { @@ -2190,7 +2205,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; @@ -2261,6 +2277,14 @@ int ceph_getattr(const struct path *path, struct kstat *stat, stat->size = ci->i_files + ci->i_subdirs; stat->blocks = 0; stat->blksize = 65536; + /* + * Some applications rely on the number of st_nlink + * value on directories to be either 0 (if unlinked) + * or 2 + number of subdirectories. + */ + if (stat->nlink == 1) + /* '.' + '..' 
+ subdirs */ + stat->nlink = 1 + 1 + ci->i_subdirs; } } return err; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5ece2e6ad154..dc8bc664a871 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2958,12 +2958,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { + struct timespec ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + ts = timespec64_to_timespec(inode->i_mtime); + ceph_encode_timespec(&rec.v1.mtime, &ts); + ts = timespec64_to_timespec(inode->i_atime); + ceph_encode_timespec(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } @@ -2992,8 +2995,9 @@ encode_again: num_flock_locks = 0; } if (num_fcntl_locks + num_flock_locks > 0) { - flocks = kmalloc((num_fcntl_locks + num_flock_locks) * - sizeof(struct ceph_filelock), GFP_NOFS); + flocks = kmalloc_array(num_fcntl_locks + num_flock_locks, + sizeof(struct ceph_filelock), + GFP_NOFS); if (!flocks) { err = -ENOMEM; goto out_free; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 041c27ea8de1..af81555c14fd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -594,9 +594,9 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, BUG_ON(capsnap->writing); capsnap->size = inode->i_size; - capsnap->mtime = inode->i_mtime; - capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + capsnap->mtime = timespec64_to_timespec(inode->i_mtime); + capsnap->atime = timespec64_to_timespec(inode->i_atime); + capsnap->ctime = timespec64_to_timespec(inode->i_ctime); capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; 
capsnap->truncate_seq = ci->i_truncate_seq; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b33082e6878f..95a3b3ac9b6e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -45,7 +45,7 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); - struct ceph_monmap *monmap = fsc->client->monc.monmap; + struct ceph_mon_client *monc = &fsc->client->monc; struct ceph_statfs st; u64 fsid; int err; @@ -58,7 +58,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) } dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); + err = ceph_monc_do_statfs(monc, data_pool, &st); if (err < 0) return err; @@ -94,8 +94,11 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; /* Must convert the fsid, for consistent values across arches */ - fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ - le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); + mutex_lock(&monc->mutex); + fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^ + le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1)); + mutex_unlock(&monc->mutex); + buf->f_fsid.val[0] = fsid & 0xffffffff; buf->f_fsid.val[1] = fsid >> 32; @@ -256,19 +259,19 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ case Opt_wsize: - if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) + if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) return -EINVAL; fsopt->wsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rsize: - if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) + if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) return -EINVAL; fsopt->rsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rasize: if (intval < 0) return -EINVAL; - fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); + fsopt->rasize = ALIGN(intval, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: if (intval < 1) 
@@ -286,7 +289,7 @@ static int parse_fsopt_token(char *c, void *private) fsopt->max_readdir = intval; break; case Opt_readdir_max_bytes: - if (intval < PAGE_SIZE && intval != 0) + if (intval < (int)PAGE_SIZE && intval != 0) return -EINVAL; fsopt->max_readdir_bytes = intval; break; @@ -534,6 +537,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noasyncreaddir"); if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_INO32) + seq_puts(m, ",ino32"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { seq_show_option(m, "fsc", fsopt->fscache_uniq); } @@ -551,7 +556,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); - if (fsopt->wsize) + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) seq_printf(m, ",rsize=%d", fsopt->rsize); @@ -616,7 +621,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, err = PTR_ERR(fsc->client); goto fail; } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->osdc.abort_on_full = true; if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, @@ -674,6 +681,13 @@ fail: return ERR_PTR(err); } +static void flush_fs_workqueues(struct ceph_fs_client *fsc) +{ + flush_workqueue(fsc->wb_wq); + flush_workqueue(fsc->pg_inv_wq); + flush_workqueue(fsc->trunc_wq); +} + static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); @@ -793,6 +807,7 @@ static void ceph_umount_begin(struct super_block *sb) if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); return; } @@ -1088,6 +1103,8 @@ static void ceph_kill_sb(struct super_block *s) dout("kill_sb %p\n", s); 
ceph_mdsc_pre_umount(fsc->mdsc); + flush_fs_workqueues(fsc); + generic_shutdown_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 315f7e63e7cc..5bc8edb4c2a6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -50,10 +50,14 @@ struct ceph_vxattr { size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); - bool readonly, hidden; bool (*exists_cb)(struct ceph_inode_info *ci); + unsigned int flags; }; +#define VXATTR_FLAG_READONLY (1<<0) +#define VXATTR_FLAG_HIDDEN (1<<1) +#define VXATTR_FLAG_RSTAT (1<<2) + /* layouts */ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) @@ -262,32 +266,31 @@ static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 -#define XATTR_NAME_CEPH(_type, _name) \ +#define XATTR_NAME_CEPH(_type, _name, _flags) \ { \ .name = CEPH_XATTR_NAME(_type, _name), \ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ - .readonly = true, \ - .hidden = false, \ - .exists_cb = NULL, \ + .exists_cb = NULL, \ + .flags = (VXATTR_FLAG_READONLY | _flags), \ } +#define XATTR_RSTAT_FIELD(_type, _name) \ + XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ - .readonly = false, \ - .hidden = true, \ .exists_cb = ceph_vxattrcb_layout_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ } #define XATTR_QUOTA_FIELD(_type, _name) \ { \ .name = CEPH_XATTR_NAME(_type, _name), \ .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ - .readonly = false, \ - .hidden = true, \ .exists_cb = 
ceph_vxattrcb_quota_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ } static struct ceph_vxattr ceph_dir_vxattrs[] = { @@ -295,30 +298,28 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .name = "ceph.dir.layout", .name_size = sizeof("ceph.dir.layout"), .getxattr_cb = ceph_vxattrcb_layout, - .readonly = false, - .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, }, XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), XATTR_LAYOUT_FIELD(dir, layout, stripe_count), XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), - XATTR_NAME_CEPH(dir, entries), - XATTR_NAME_CEPH(dir, files), - XATTR_NAME_CEPH(dir, subdirs), - XATTR_NAME_CEPH(dir, rentries), - XATTR_NAME_CEPH(dir, rfiles), - XATTR_NAME_CEPH(dir, rsubdirs), - XATTR_NAME_CEPH(dir, rbytes), - XATTR_NAME_CEPH(dir, rctime), + XATTR_NAME_CEPH(dir, entries, 0), + XATTR_NAME_CEPH(dir, files, 0), + XATTR_NAME_CEPH(dir, subdirs, 0), + XATTR_RSTAT_FIELD(dir, rentries), + XATTR_RSTAT_FIELD(dir, rfiles), + XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rbytes), + XATTR_RSTAT_FIELD(dir, rctime), { .name = "ceph.quota", .name_size = sizeof("ceph.quota"), .getxattr_cb = ceph_vxattrcb_quota, - .readonly = false, - .hidden = true, .exists_cb = ceph_vxattrcb_quota_exists, + .flags = VXATTR_FLAG_HIDDEN, }, XATTR_QUOTA_FIELD(quota, max_bytes), XATTR_QUOTA_FIELD(quota, max_files), @@ -333,9 +334,8 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .name = "ceph.file.layout", .name_size = sizeof("ceph.file.layout"), .getxattr_cb = ceph_vxattrcb_layout, - .readonly = false, - .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, }, XATTR_LAYOUT_FIELD(file, layout, stripe_unit), XATTR_LAYOUT_FIELD(file, layout, stripe_count), @@ -374,9 +374,10 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) struct ceph_vxattr *vxattr; size_t size = 0; - for (vxattr = vxattrs; 
vxattr->name; vxattr++) - if (!vxattr->hidden) + for (vxattr = vxattrs; vxattr->name; vxattr++) { + if (!(vxattr->flags & VXATTR_FLAG_HIDDEN)) size += vxattr->name_size; + } return size; } @@ -809,7 +810,10 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { - err = ceph_do_getattr(inode, 0, true); + int mask = 0; + if (vxattr->flags & VXATTR_FLAG_RSTAT) + mask |= CEPH_STAT_RSTAT; + err = ceph_do_getattr(inode, mask, true); if (err) return err; err = -ENODATA; @@ -919,7 +923,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) err = namelen; if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - if (!vxattrs[i].hidden && + if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) && !(vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))) { len = sprintf(names, "%s", vxattrs[i].name); @@ -1024,7 +1028,7 @@ int __ceph_setxattr(struct inode *inode, const char *name, vxattr = ceph_match_vxattr(inode, name); if (vxattr) { - if (vxattr->readonly) + if (vxattr->flags & VXATTR_FLAG_READONLY) return -EOPNOTSUPP; if (value && !strncmp(vxattr->name, "ceph.quota", 10)) check_realm = true; diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 7e4a1e2f0696..85817991ee68 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -1,11 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 # -# Makefile for Linux CIFS VFS client +# Makefile for Linux CIFS/SMB2/SMB3 VFS client # +ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_CIFS) += cifs.o -cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ - link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ +cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ + inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o 
smb2transport.o \ diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index a3b56544c21b..3d19595eb352 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -428,7 +428,7 @@ asn1_oid_decode(struct asn1_ctx *ctx, if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) return 0; - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + *oid = kmalloc_array(size, sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) return 0; diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index edf5f40898bf..e1553d1e0e50 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -128,8 +128,8 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, memset(&auxdata, 0, sizeof(auxdata)); auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; - auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + auxdata.last_write_time = timespec64_to_timespec(cifsi->vfs_inode.i_mtime); + auxdata.last_change_time = timespec64_to_timespec(cifsi->vfs_inode.i_ctime); if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9d69ea433330..bfe999505815 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -42,7 +42,7 @@ cifs_dump_mem(char *label, void *data, int length) data, length, true); } -void cifs_dump_detail(void *buf) +void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = (struct smb_hdr *)buf; @@ -50,7 +50,8 @@ void cifs_dump_detail(void *buf) cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb)); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, + server->ops->calc_smb_size(smb, server)); #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -83,7 +84,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", mid_entry->multiRsp, 
mid_entry->multiEnd); if (mid_entry->resp_buf) { - cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_detail(mid_entry->resp_buf, server); cifs_dump_mem("existing buf: ", mid_entry->resp_buf, 62); } @@ -113,6 +114,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, " type: %d ", dev_type); if (tcon->seal) seq_printf(m, " Encrypted"); + if (tcon->nocase) + seq_printf(m, " nocase"); if (tcon->unix_ext) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) @@ -123,6 +126,25 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_putc(m, '\n'); } +static void +cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) +{ + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + + seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed); + seq_puts(m, "\t\tCapabilities: "); + if (iface->rdma_capable) + seq_puts(m, "rdma "); + if (iface->rss_capable) + seq_puts(m, "rss "); + seq_putc(m, '\n'); + if (iface->sockaddr.ss_family == AF_INET) + seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); + else if (iface->sockaddr.ss_family == AF_INET6) + seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -237,6 +259,10 @@ skip_rdma: server->credits, server->dialect); if (server->sign) seq_printf(m, " signed"); +#ifdef CONFIG_CIFS_SMB311 + if (server->posix_ext_supported) + seq_printf(m, " posix"); +#endif /* 3.1.1 */ i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, @@ -305,6 +331,16 @@ skip_rdma: mid_entry->mid); } spin_unlock(&GlobalMid_Lock); + + spin_lock(&ses->iface_lock); + if (ses->iface_count) + seq_printf(m, "\n\tServer interfaces: %zu\n", + ses->iface_count); + for (j = 0; j < ses->iface_count; j++) { + seq_printf(m, "\t%d)\n", j); + 
cifs_dump_iface(m, &ses->iface_list[j]); + } + spin_unlock(&ses->iface_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -314,18 +350,6 @@ skip_rdma: return 0; } -static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cifs_debug_data_proc_show, NULL); -} - -static const struct file_operations cifs_debug_data_proc_fops = { - .open = cifs_debug_data_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_CIFS_STATS static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -497,35 +521,36 @@ cifs_proc_init(void) if (proc_fs_cifs == NULL) return; - proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + proc_create_single("DebugData", 0, proc_fs_cifs, + cifs_debug_data_proc_show); #ifdef CONFIG_CIFS_STATS - proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); #endif /* STATS */ - proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); - proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, &cifs_linux_ext_proc_fops); - proc_create("SecurityFlags", 0, proc_fs_cifs, + proc_create("SecurityFlags", 0644, proc_fs_cifs, &cifs_security_flags_proc_fops); - proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_fops); #ifdef CONFIG_CIFS_SMB_DIRECT - proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs, + proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs, &cifs_rdma_readwrite_threshold_proc_fops); - proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs, + proc_create("smbd_max_frmr_depth", 0644, 
proc_fs_cifs, &cifs_smbd_max_frmr_depth_proc_fops); - proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs, + proc_create("smbd_keep_alive_interval", 0644, proc_fs_cifs, &cifs_smbd_keep_alive_interval_proc_fops); - proc_create("smbd_max_receive_size", 0, proc_fs_cifs, + proc_create("smbd_max_receive_size", 0644, proc_fs_cifs, &cifs_smbd_max_receive_size_proc_fops); - proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs, + proc_create("smbd_max_fragmented_recv_size", 0644, proc_fs_cifs, &cifs_smbd_max_fragmented_recv_size_proc_fops); - proc_create("smbd_max_send_size", 0, proc_fs_cifs, + proc_create("smbd_max_send_size", 0644, proc_fs_cifs, &cifs_smbd_max_send_size_proc_fops); - proc_create("smbd_send_credit_target", 0, proc_fs_cifs, + proc_create("smbd_send_credit_target", 0644, proc_fs_cifs, &cifs_smbd_send_credit_target_proc_fops); - proc_create("smbd_receive_credit_max", 0, proc_fs_cifs, + proc_create("smbd_receive_credit_max", 0644, proc_fs_cifs, &cifs_smbd_receive_credit_max_proc_fops); #endif } @@ -583,6 +608,8 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ + else + return -EINVAL; return count; } diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 0e74690d11bc..f4f3f0853c6e 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -23,7 +23,7 @@ #define _H_CIFS_DEBUG void cifs_dump_mem(char *label, void *data, int length); -void cifs_dump_detail(void *); +void cifs_dump_detail(void *buf, struct TCP_Server_Info *ptcp_info); void cifs_dump_mids(struct TCP_Server_Info *); extern bool traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 350fa55a1bf7..9731d0d891e7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -50,6 +50,7 @@ * root mountable */ #define CIFS_MOUNT_UID_FROM_ACL 
0x2000000 /* try to get UID via special SID */ +#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 13a8a77322c9..1d377b7f2860 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -747,8 +747,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) return; - ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), - GFP_KERNEL); + ppace = kmalloc_array(num_aces, sizeof(struct cifs_ace *), + GFP_KERNEL); if (!ppace) return; diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 4f3884835267..dd95a6fa24bf 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -98,4 +98,18 @@ struct cifs_ace { struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); +/* + * Minimum security identifier can be one for system defined Users + * and Groups such as NULL SID and World or Built-in accounts such + * as Administrator and Guest and consists of + * Revision + Num (Sub)Auths + Authority + Domain (one Subauthority) + */ +#define MIN_SID_LEN (1 + 1 + 6 + 4) /* in bytes */ + +/* + * Minimum security descriptor can be one without any SACL and DACL and can + * consist of revision, type, and two sids of minimum size for owner and group + */ +#define MIN_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + (2 * MIN_SID_LEN)) + #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index a6ef088e057b..ee2a8ec70056 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -44,19 +44,27 @@ int __cifs_calc_signature(struct smb_rqst *rqst, int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; + int is_smb2 = server->vals->header_preamble_size == 0; - if (n_vec < 2 || iov[0].iov_len != 4) - return -EIO; + /* iov[0] is actual data and not the rfc1002 length for SMB2+ */ + if (is_smb2) { + if (iov[0].iov_len <= 
4) + return -EIO; + i = 0; + } else { + if (n_vec < 2 || iov[0].iov_len != 4) + return -EIO; + i = 1; /* skip rfc1002 length */ + } - for (i = 1; i < n_vec; i++) { + for (; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } - if (i == 1 && iov[1].iov_len <= 4) - break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(shash, iov[i].iov_base, iov[i].iov_len); if (rc) { @@ -68,11 +76,12 @@ int __cifs_calc_signature(struct smb_rqst *rqst, /* now hash over the rq_pages array */ for (i = 0; i < rqst->rq_npages; i++) { - void *kaddr = kmap(rqst->rq_pages[i]); - size_t len = rqst->rq_pagesz; + void *kaddr; + unsigned int len, offset; + + rqst_page_get_length(rqst, i, &len, &offset); - if (i == rqst->rq_npages - 1) - len = rqst->rq_tailsz; + kaddr = (char *) kmap(rqst->rq_pages[i]) + offset; crypto_shash_update(shash, kaddr, len); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5a5a0158cc8f..d5aa7ae917bf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -58,13 +58,15 @@ bool traceSMB; bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; +bool disable_legacy_dialects; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, uint, 0444); -MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " +MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header) " + "for CIFS requests. " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; module_param(cifs_min_rcv, uint, 0444); @@ -76,11 +78,21 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. 
Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, uint, 0444); -MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " +MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " + "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(disable_legacy_dialects, bool, 0644); +MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " + "helpful to restrict the ability to " + "override the default dialects (SMB2.1, " + "SMB3 and SMB3.02) on mount with old " + "dialects (CIFS/SMB1 and SMB2) since " + "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker" + " and less secure. Default: n/N/0"); + extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -469,10 +481,20 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",persistenthandles"); else if (tcon->use_resilient) seq_puts(s, ",resilienthandles"); + +#ifdef CONFIG_CIFS_SMB311 + if (tcon->posix_extensions) + seq_puts(s, ",posix"); + else if (tcon->unix_ext) + seq_puts(s, ",unix"); + else + seq_puts(s, ",nounix"); +#else if (tcon->unix_ext) seq_puts(s, ",unix"); else seq_puts(s, ",nounix"); +#endif /* SMB311 */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) @@ -495,6 +517,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) seq_puts(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE) + seq_puts(s, ",nohandlecache"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) @@ -674,8 +698,8 @@ static int cifs_set_super(struct super_block *sb, 
void *data) } static struct dentry * -cifs_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +cifs_smb3_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, bool is_smb3) { int rc; struct super_block *sb; @@ -686,7 +710,7 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); - volume_info = cifs_get_volume_info((char *)data, dev_name); + volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3); if (IS_ERR(volume_info)) return ERR_CAST(volume_info); @@ -766,6 +790,20 @@ out_nls: goto out; } +static struct dentry * +smb3_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return cifs_smb3_do_mount(fs_type, flags, dev_name, data, true); +} + +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return cifs_smb3_do_mount(fs_type, flags, dev_name, data, false); +} + static ssize_t cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) { @@ -897,6 +935,17 @@ struct file_system_type cifs_fs_type = { /* .fs_flags */ }; MODULE_ALIAS_FS("cifs"); + +static struct file_system_type smb3_fs_type = { + .owner = THIS_MODULE, + .name = "smb3", + .mount = smb3_do_mount, + .kill_sb = cifs_kill_sb, + /* .fs_flags */ +}; +MODULE_ALIAS_FS("smb3"); +MODULE_ALIAS("smb3"); + const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .atomic_open = cifs_atomic_open, @@ -1435,6 +1484,12 @@ init_cifs(void) if (rc) goto out_init_cifs_idmap; + rc = register_filesystem(&smb3_fs_type); + if (rc) { + unregister_filesystem(&cifs_fs_type); + goto out_init_cifs_idmap; + } + return 0; out_init_cifs_idmap: @@ -1465,8 +1520,9 @@ out_clean_proc: static void __exit exit_cifs(void) { - cifs_dbg(NOISY, "exit_cifs\n"); + cifs_dbg(NOISY, "exit_smb3\n"); unregister_filesystem(&cifs_fs_type); + unregister_filesystem(&smb3_fs_type); 
cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL exit_cifs_idmap(); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 013ba2aed8d9..5f0231803431 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.11" +#define CIFS_VERSION "2.12" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cb950a5fa078..c923c7854027 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -33,6 +33,9 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + /* * The sizes of various internal tables and strings */ @@ -176,6 +179,7 @@ struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ struct page **rq_pages; /* pointer to array of page ptrs */ + unsigned int rq_offset; /* the offset to the 1st page */ unsigned int rq_npages; /* number pages in array */ unsigned int rq_pagesz; /* page size to use */ unsigned int rq_tailsz; /* length of last page */ @@ -244,7 +248,7 @@ struct smb_version_operations { int (*map_error)(char *, bool); /* find mid corresponding to the response message */ struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *); - void (*dump_detail)(void *); + void (*dump_detail)(void *buf, struct TCP_Server_Info *ptcp_info); void (*clear_stats)(struct cifs_tcon *); void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); @@ -311,6 +315,10 @@ struct smb_version_operations { /* send echo request */ int (*echo)(struct TCP_Server_Info *); /* create directory */ + int (*posix_mkdir)(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info 
*cifs_sb); int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, struct cifs_sb_info *); /* set info on created directory */ @@ -372,7 +380,7 @@ struct smb_version_operations { int (*close_dir)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* calculate a size of SMB message */ - unsigned int (*calc_smb_size)(void *); + unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi); /* check for STATUS_PENDING and process it in a positive case */ bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); /* check for STATUS_NETWORK_SESSION_EXPIRED */ @@ -415,9 +423,9 @@ struct smb_version_operations { void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, bool *); /* create lease context buffer for CREATE request */ - char * (*create_lease_buf)(u8 *, u8); + char * (*create_lease_buf)(u8 *lease_key, u8 oplock); /* parse lease context buffer and return oplock/epoch info */ - __u8 (*parse_lease_buf)(void *, unsigned int *); + __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, @@ -457,7 +465,7 @@ struct smb_version_operations { struct mid_q_entry **); enum securityEnum (*select_sectype)(struct TCP_Server_Info *, enum securityEnum); - + int (*next_header)(char *); }; struct smb_version_values { @@ -521,10 +529,12 @@ struct smb_vol { bool sfu_remap:1; /* remap seven reserved chars ala SFU */ bool posix_paths:1; /* unset to not ask for posix pathnames. 
*/ bool no_linux_ext:1; + bool linux_ext:1; bool sfu_emul:1; bool nullauth:1; /* attempt to authenticate with null user */ bool nocase:1; /* request case insensitive filenames */ bool nobrl:1; /* disable sending byte range locks to srv */ + bool nohandlecache:1; /* disable caching dir handles if srvr probs */ bool mand_lock:1; /* send mandatory not posix byte range lock reqs */ bool seal:1; /* request transport encryption on share */ bool nodfs:1; /* Do not request DFS, even if available */ @@ -630,7 +640,7 @@ struct TCP_Server_Info { bool oplocks:1; /* enable oplocks */ unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ - /* multiplexed reads or writes */ + /* multiplexed reads or writes (for SMB1/CIFS only, not SMB2/SMB3) */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ @@ -681,6 +691,7 @@ struct TCP_Server_Info { __le16 cipher_type; /* save initital negprot hash */ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + bool posix_ext_supported; #endif /* 3.1.1 */ struct delayed_work reconnect; /* reconnect workqueue job */ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ @@ -834,6 +845,13 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif +struct cifs_server_iface { + size_t speed; + unsigned int rdma_capable : 1; + unsigned int rss_capable : 1; + struct sockaddr_storage sockaddr; +}; + /* * Session structure. One of these for each uid session with a particular host */ @@ -871,6 +889,20 @@ struct cifs_ses { #ifdef CONFIG_CIFS_SMB311 __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ + + /* + * Network interfaces available on the server this session is + * connected to. + * + * Other channels can be opened by connecting and binding this + * session to interfaces from this list. 
+ * + * iface_lock should be taken when accessing any of these fields + */ + spinlock_t iface_lock; + struct cifs_server_iface *iface_list; + size_t iface_count; + unsigned long iface_last_update; /* jiffies */ }; static inline bool @@ -879,6 +911,14 @@ cap_unix(struct cifs_ses *ses) return ses->server->vals->cap_unix & ses->capabilities; } +struct cached_fid { + bool is_valid:1; /* Do we have a useable root fid */ + struct cifs_fid *fid; + struct mutex fid_mutex; + struct cifs_tcon *tcon; + struct work_struct lease_break; +}; + /* * there is one of these for each connection to a resource on a particular * session @@ -953,9 +993,13 @@ struct cifs_tcon { bool print:1; /* set if connection to printer share */ bool retry:1; bool nocase:1; + bool nohandlecache:1; /* if strange server resource prob can turn off */ bool seal:1; /* transport encryption for this mounted share */ bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ +#ifdef CONFIG_CIFS_SMB311 + bool posix_extensions; /* if true SMB3.11 posix extensions enabled */ +#endif /* CIFS_311 */ bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool broken_sparse_sup; /* if server or share does not support sparse */ @@ -979,6 +1023,7 @@ struct cifs_tcon { struct fscache_cookie *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ + struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? 
*/ }; @@ -1008,6 +1053,12 @@ tlink_tcon(struct tcon_link *tlink) return tlink->tl_tcon; } +static inline struct tcon_link * +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} + extern void cifs_put_tlink(struct tcon_link *tlink); static inline struct tcon_link * @@ -1071,6 +1122,7 @@ struct cifs_open_parms { int create_options; const char *path; struct cifs_fid *fid; + umode_t mode; bool reconnect:1; }; @@ -1169,10 +1221,11 @@ struct cifs_readdata { struct smbd_mr *mr; #endif unsigned int pagesz; + unsigned int page_offset; unsigned int tailsz; unsigned int credits; unsigned int nr_pages; - struct page *pages[]; + struct page **pages; }; struct cifs_writedata; @@ -1194,10 +1247,11 @@ struct cifs_writedata { struct smbd_mr *mr; #endif unsigned int pagesz; + unsigned int page_offset; unsigned int tailsz; unsigned int credits; unsigned int nr_pages; - struct page *pages[]; + struct page **pages; }; /* @@ -1362,6 +1416,7 @@ typedef int (mid_handle_t)(struct TCP_Server_Info *server, /* one of these for every pending CIFS request to the server */ struct mid_q_entry { struct list_head qhead; /* mids waiting on reply from this server */ + struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ __u32 pid; /* process id */ @@ -1692,16 +1747,17 @@ GLOBAL_EXTERN atomic_t smBufAllocCount; GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ -GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ -GLOBAL_EXTERN bool lookupCacheEnabled; -GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent +extern bool enable_oplocks; /* enable or disable oplocks */ +extern bool lookupCacheEnabled; +extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ -GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ -GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ 
-GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ -GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ -GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ -GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ +extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ +extern unsigned int cifs_min_small; /* min size of small buf pool */ +extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ #ifdef CONFIG_CIFS_ACL GLOBAL_EXTERN struct rb_root uidtree; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 365a414a75e9..1890f534c88b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -21,6 +21,7 @@ #ifndef _CIFSPROTO_H #define _CIFSPROTO_H #include <linux/nls.h> +#include "trace.h" struct statfs; struct smb_vol; @@ -47,6 +48,7 @@ extern void _free_xid(unsigned int); cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ __func__, __xid, \ from_kuid(&init_user_ns, current_fsuid())); \ + trace_smb3_enter(__xid, __func__); \ __xid; \ }) @@ -54,7 +56,11 @@ extern void _free_xid(unsigned int); do { \ _free_xid(curr_xid); \ cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ - __func__, curr_xid, (int)rc); \ + __func__, curr_xid, (int)rc); \ + if (rc) \ + trace_smb3_exit_err(curr_xid, __func__, (int)rc); \ + else \ + trace_smb3_exit_done(curr_xid, __func__); \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); @@ -76,6 +82,7 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); extern void DeleteMidQEntry(struct mid_q_entry *midEntry); 
extern void cifs_delete_mid(struct mid_q_entry *mid); +extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); @@ -106,10 +113,6 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, struct kvec * /* resp vec */); -extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses, - struct kvec *pkvec, int nvec_to_send, - int *pbuftype, const int flags, - struct kvec *presp); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , @@ -124,7 +127,7 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -extern unsigned int smbCalcSize(void *buf); +extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); @@ -197,13 +200,15 @@ extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, - struct page *page, unsigned int to_read); + struct page *page, + unsigned int page_offset, + unsigned int to_read); extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); extern struct smb_vol *cifs_get_volume_info(char 
*mount_data, - const char *devname); + const char *devname, bool is_smb3); extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); @@ -525,6 +530,8 @@ int cifs_async_writev(struct cifs_writedata *wdata, void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); +struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages, + work_func_t complete); void cifs_writedata_release(struct kref *refcount); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -542,9 +549,13 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); +void smb2_cached_lease_break(struct work_struct *work); int cifs_alloc_hash(const char *name, struct crypto_shash **shash, struct sdesc **sdesc); void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); +extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); + #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1529a088383d..93408eab92e7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -106,6 +106,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) open_file->oplock_break_cancelled = true; } spin_unlock(&tcon->open_file_lock); + + mutex_lock(&tcon->crfid.fid_mutex); + tcon->crfid.is_valid = false; + memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); + /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. 
@@ -151,8 +157,14 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * greater than cifs socket timeout which is 7 seconds */ while (server->tcpStatus == CifsNeedReconnect) { - wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + 10 * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to a received" + " signal by the process\n", __func__); + return -ERESTARTSYS; + } /* are we still trying to reconnect? */ if (server->tcpStatus != CifsNeedReconnect) @@ -1946,6 +1958,7 @@ cifs_writedata_release(struct kref *refcount) if (wdata->cfile) cifsFileInfo_put(wdata->cfile); + kvfree(wdata->pages); kfree(wdata); } @@ -2069,12 +2082,22 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata * cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { + struct page **pages = + kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (pages) + return cifs_writedata_direct_alloc(pages, complete); + + return NULL; +} + +struct cifs_writedata * +cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +{ struct cifs_writedata *wdata; - /* writedata + number of page pointers */ - wdata = kzalloc(sizeof(*wdata) + - sizeof(struct page *) * nr_pages, GFP_NOFS); + wdata = kzalloc(sizeof(*wdata), GFP_NOFS); if (wdata != NULL) { + wdata->pages = pages; kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); init_completion(&wdata->done); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7a10a5d0731f..5df2c0698cda 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -57,10 +57,8 @@ #include "smb2proto.h" #include "smbdirect.h" -#define CIFS_PORT 445 -#define RFC1001_PORT 139 - extern mempool_t *cifs_req_poolp; +extern bool disable_legacy_dialects; /* FIXME: should these be tunable? 
*/ #define TLINK_ERROR_EXPIRE (1 * HZ) @@ -76,9 +74,10 @@ enum { Opt_mapposix, Opt_nomapposix, Opt_mapchars, Opt_nomapchars, Opt_sfu, Opt_nosfu, Opt_nodfs, Opt_posixpaths, - Opt_noposixpaths, Opt_nounix, + Opt_noposixpaths, Opt_nounix, Opt_unix, Opt_nocase, Opt_brl, Opt_nobrl, + Opt_handlecache, Opt_nohandlecache, Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, @@ -144,10 +143,16 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noposixpaths, "noposixpaths" }, { Opt_nounix, "nounix" }, { Opt_nounix, "nolinux" }, + { Opt_nounix, "noposix" }, + { Opt_unix, "unix" }, + { Opt_unix, "linux" }, + { Opt_unix, "posix" }, { Opt_nocase, "nocase" }, { Opt_nocase, "ignorecase" }, { Opt_brl, "brl" }, { Opt_nobrl, "nobrl" }, + { Opt_handlecache, "handlecache" }, + { Opt_nohandlecache, "nohandlecache" }, { Opt_nobrl, "nolock" }, { Opt_forcemandatorylock, "forcemandatorylock" }, { Opt_forcemandatorylock, "forcemand" }, @@ -312,7 +317,7 @@ static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, - const char *devname); + const char *devname, bool is_smb3); /* * cifs tcp session reconnection @@ -591,10 +596,11 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, - unsigned int to_read) + unsigned int page_offset, unsigned int to_read) { struct msghdr smb_msg; - struct bio_vec bv = {.bv_page = page, .bv_len = to_read}; + struct bio_vec bv = { + .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -848,6 +854,7 @@ cifs_demultiplex_thread(void *p) int 
length; struct TCP_Server_Info *server = p; unsigned int pdu_length; + unsigned int next_offset; char *buf = NULL; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; @@ -874,24 +881,29 @@ cifs_demultiplex_thread(void *p) length = cifs_read_from_socket(server, buf, pdu_length); if (length < 0) continue; - server->total_read = length; + + if (server->vals->header_preamble_size == 0) + server->total_read = 0; + else + server->total_read = length; /* * The right amount was read from socket - 4 bytes, * so we can now interpret the length field. */ pdu_length = get_rfc1002_length(buf); - server->pdu_size = pdu_length; cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; +next_pdu: + server->pdu_size = pdu_length; /* make sure we have enough to get to the MID */ - if (pdu_length < HEADER_SIZE(server) - 1 - + if (server->pdu_size < HEADER_SIZE(server) - 1 - server->vals->header_preamble_size) { cifs_dbg(VFS, "SMB response too short (%u bytes)\n", - pdu_length); + server->pdu_size); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -906,6 +918,13 @@ cifs_demultiplex_thread(void *p) continue; server->total_read += length; + if (server->ops->next_header) { + next_offset = server->ops->next_header(buf); + if (next_offset) + server->pdu_size = next_offset; + } + + mid_entry = NULL; if (server->ops->is_transform_hdr && server->ops->receive_transform && server->ops->is_transform_hdr(buf)) { @@ -920,8 +939,11 @@ cifs_demultiplex_thread(void *p) length = mid_entry->receive(server, mid_entry); } - if (length < 0) + if (length < 0) { + if (mid_entry) + cifs_mid_q_entry_release(mid_entry); continue; + } if (server->large_buf) buf = server->bigbuf; @@ -938,6 +960,8 @@ cifs_demultiplex_thread(void *p) if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); + + cifs_mid_q_entry_release(mid_entry); } else if (server->ops->is_oplock_break && server->ops->is_oplock_break(buf, 
server)) { cifs_dbg(FYI, "Received oplock break\n"); @@ -948,10 +972,18 @@ cifs_demultiplex_thread(void *p) HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 if (server->ops->dump_detail) - server->ops->dump_detail(buf); + server->ops->dump_detail(buf, server); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ - + } + if (pdu_length > server->pdu_size) { + if (!allocate_buffers(server)) + continue; + pdu_length -= server->pdu_size; + server->total_read = 0; + server->large_buf = false; + buf = server->smallbuf; + goto next_pdu; } } /* end while !EXITING */ @@ -1137,16 +1169,32 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol) } static int -cifs_parse_smb_version(char *value, struct smb_vol *vol) +cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, cifs_smb_version_tokens, args)) { case Smb_1: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + return 1; + } vol->ops = &smb1_operations; vol->vals = &smb1_values; break; case Smb_20: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + return 1; + } vol->ops = &smb20_operations; vol->vals = &smb20_values; break; @@ -1235,7 +1283,7 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) static int cifs_parse_mount_options(const char *mountdata, const char *devname, - struct smb_vol *vol) + struct smb_vol *vol, bool is_smb3) { char *data, *end; char *mountdata_copy = NULL, *options; @@ -1426,8 +1474,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->posix_paths = 0; break; case Opt_nounix: + if (vol->linux_ext) + cifs_dbg(VFS, + "conflicting unix mount options\n"); vol->no_linux_ext = 1; break; + case Opt_unix: + 
if (vol->no_linux_ext) + cifs_dbg(VFS, + "conflicting unix mount options\n"); + vol->linux_ext = 1; + break; case Opt_nocase: vol->nocase = 1; break; @@ -1445,6 +1502,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, (S_IALLUGO & ~(S_ISUID | S_IXGRP))) vol->file_mode = S_IALLUGO; break; + case Opt_nohandlecache: + vol->nohandlecache = 1; + break; + case Opt_handlecache: + vol->nohandlecache = 0; + break; case Opt_forcemandatorylock: vol->mand_lock = 1; break; @@ -1933,7 +1996,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (cifs_parse_smb_version(string, vol) != 0) + if (cifs_parse_smb_version(string, vol, is_smb3) != 0) goto cifs_parse_mount_err; got_version = true; break; @@ -2967,6 +3030,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } +#ifdef CONFIG_CIFS_SMB311 + if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) { + if (ses->server->vals->protocol_id == SMB311_PROT_ID) { + tcon->posix_extensions = true; + printk_once(KERN_WARNING + "SMB3.11 POSIX Extensions are experimental\n"); + } + } +#endif /* 311 */ + /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? 
@@ -3022,6 +3095,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) */ tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; + tcon->nohandlecache = volume_info->nohandlecache; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -3056,12 +3130,6 @@ cifs_put_tlink(struct tcon_link *tlink) return; } -static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) -{ - return cifs_sb->master_tlink; -} - static int compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) { @@ -3580,6 +3648,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; if (pvolume_info->nobrl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; + if (pvolume_info->nohandlecache) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE; if (pvolume_info->nostrictsync) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; if (pvolume_info->mand_lock) @@ -3741,7 +3811,7 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, } else { cleanup_volume_info_contents(volume_info); rc = cifs_setup_volume_info(volume_info, mdata, - fake_devname); + fake_devname, false); } kfree(fake_devname); kfree(cifs_sb->mountdata); @@ -3754,11 +3824,11 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, - const char *devname) + const char *devname, bool is_smb3) { int rc = 0; - if (cifs_parse_mount_options(mount_data, devname, volume_info)) + if (cifs_parse_mount_options(mount_data, devname, volume_info, is_smb3)) return -EINVAL; if (volume_info->nullauth) { @@ -3792,7 +3862,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, } struct smb_vol * -cifs_get_volume_info(char *mount_data, const char *devname) +cifs_get_volume_info(char *mount_data, const char *devname, bool is_smb3) { int rc; struct smb_vol *volume_info; @@ -3801,7 +3871,7 @@ 
cifs_get_volume_info(char *mount_data, const char *devname) if (!volume_info) return ERR_PTR(-ENOMEM); - rc = cifs_setup_volume_info(volume_info, mount_data, devname); + rc = cifs_setup_volume_info(volume_info, mount_data, devname, is_smb3); if (rc) { cifs_cleanup_volume_info(volume_info); volume_info = ERR_PTR(rc); @@ -3922,6 +3992,12 @@ try_mount_again: goto remote_path_check; } +#ifdef CONFIG_CIFS_SMB311 + /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ + if (tcon->posix_extensions) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#endif /* SMB3.11 */ + /* tell server which Unix caps we support */ if (cap_unix(tcon->ses)) { /* reset of caps checks mount to see if unix extensions @@ -4353,6 +4429,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) vol_info->UNC = master_tcon->treeName; vol_info->retry = master_tcon->retry; vol_info->nocase = master_tcon->nocase; + vol_info->nohandlecache = master_tcon->nohandlecache; vol_info->local_lease = master_tcon->local_lease; vol_info->no_linux_ext = !master_tcon->unix_ext; vol_info->sectype = master_tcon->ses->sectype; @@ -4382,8 +4459,14 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } +#ifdef CONFIG_CIFS_SMB311 + /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ + if (tcon->posix_extensions) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#endif /* SMB3.11 */ if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, vol_info); + out: kfree(vol_info->username); kzfree(vol_info->password); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 925844343038..ddae52bd1993 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -369,7 +369,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, oparms.path = full_path; oparms.fid = fid; oparms.reconnect = false; - + oparms.mode = mode; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); @@ -780,21 
+780,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { free_xid(xid); - return (struct dentry *)tlink; + return ERR_CAST(tlink); } pTcon = tlink_tcon(tlink); rc = check_name(direntry, pTcon); - if (rc) - goto lookup_out; + if (unlikely(rc)) { + cifs_put_tlink(tlink); + free_xid(xid); + return ERR_PTR(rc); + } /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ full_path = build_path_from_dentry(direntry); if (full_path == NULL) { - rc = -ENOMEM; - goto lookup_out; + cifs_put_tlink(tlink); + free_xid(xid); + return ERR_PTR(-ENOMEM); } if (d_really_is_positive(direntry)) { @@ -813,29 +817,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, parent_dir_inode->i_sb, xid, NULL); } - if ((rc == 0) && (newInode != NULL)) { - d_add(direntry, newInode); + if (rc == 0) { /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); - } else if (rc == -ENOENT) { - rc = 0; cifs_set_time(direntry, jiffies); - d_add(direntry, NULL); - /* if it was once a directory (but how can we tell?) 
we could do - shrink_dcache_parent(direntry); */ - } else if (rc != -EACCES) { - cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); - /* We special case check for Access Denied - since that - is a common return code */ + newInode = NULL; + } else { + if (rc != -EACCES) { + cifs_dbg(FYI, "Unexpected lookup error %d\n", rc); + /* We special case check for Access Denied - since that + is a common return code */ + } + newInode = ERR_PTR(rc); } - -lookup_out: kfree(full_path); cifs_put_tlink(tlink); free_xid(xid); - return ERR_PTR(rc); + return d_splice_alias(newInode, direntry); } static int diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 23fd430fe74a..8d41ca7bfcf1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2880,13 +2880,13 @@ out: } static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) +cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) { struct cifs_readdata *rdata; - rdata = kzalloc(sizeof(*rdata) + (sizeof(struct page *) * nr_pages), - GFP_KERNEL); + rdata = kzalloc(sizeof(*rdata), GFP_KERNEL); if (rdata != NULL) { + rdata->pages = pages; kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); @@ -2896,6 +2896,22 @@ cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) return rdata; } +static struct cifs_readdata * +cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) +{ + struct page **pages = + kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + struct cifs_readdata *ret = NULL; + + if (pages) { + ret = cifs_readdata_direct_alloc(pages, complete); + if (!ret) + kfree(pages); + } + + return ret; +} + void cifs_readdata_release(struct kref *refcount) { @@ -2910,6 +2926,7 @@ cifs_readdata_release(struct kref *refcount) if (rdata->cfile) cifsFileInfo_put(rdata->cfile); + kvfree(rdata->pages); kfree(rdata); } @@ -3009,12 +3026,20 @@ uncached_fill_pages(struct TCP_Server_Info *server, int result = 0; unsigned int i; unsigned int 
nr_pages = rdata->nr_pages; + unsigned int page_offset = rdata->page_offset; rdata->got_bytes = 0; rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; size_t n; + unsigned int segment_size = rdata->pagesz; + + if (i == 0) + segment_size -= page_offset; + else + page_offset = 0; + if (len <= 0) { /* no need to hold page hostage */ @@ -3023,24 +3048,25 @@ uncached_fill_pages(struct TCP_Server_Info *server, put_page(page); continue; } + n = len; - if (len >= PAGE_SIZE) { + if (len >= segment_size) /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); + n = segment_size; + else rdata->tailsz = len; - len = 0; - } + len -= n; + if (iter) - result = copy_page_from_iter(page, 0, n, iter); + result = copy_page_from_iter( + page, page_offset, n, iter); #ifdef CONFIG_CIFS_SMB_DIRECT else if (rdata->mr) result = n; #endif else - result = cifs_read_page_from_socket(server, page, n); + result = cifs_read_page_from_socket( + server, page, page_offset, n); if (result < 0) break; @@ -3113,6 +3139,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->bytes = cur_len; rdata->pid = pid; rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; @@ -3557,6 +3584,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, u64 eof; pgoff_t eof_index; unsigned int nr_pages = rdata->nr_pages; + unsigned int page_offset = rdata->page_offset; /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; @@ -3567,13 +3595,21 @@ readpages_fill_pages(struct TCP_Server_Info *server, rdata->tailsz = PAGE_SIZE; for (i = 0; i < nr_pages; i++) { struct page *page = rdata->pages[i]; - size_t n = PAGE_SIZE; + unsigned int to_read = rdata->pagesz; + size_t n; + + if (i == 0) + to_read -= 
page_offset; + else + page_offset = 0; + + n = to_read; - if (len >= PAGE_SIZE) { - len -= PAGE_SIZE; + if (len >= to_read) { + len -= to_read; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ - zero_user(page, len, PAGE_SIZE - len); + zero_user(page, len + page_offset, to_read - len); n = rdata->tailsz = len; len = 0; } else if (page->index > eof_index) { @@ -3605,13 +3641,15 @@ readpages_fill_pages(struct TCP_Server_Info *server, } if (iter) - result = copy_page_from_iter(page, 0, n, iter); + result = copy_page_from_iter( + page, page_offset, n, iter); #ifdef CONFIG_CIFS_SMB_DIRECT else if (rdata->mr) result = n; #endif else - result = cifs_read_page_from_socket(server, page, n); + result = cifs_read_page_from_socket( + server, page, page_offset, n); if (result < 0) break; @@ -3790,6 +3828,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->bytes = bytes; rdata->pid = pid; rdata->pagesz = PAGE_SIZE; + rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 25d3f66b2d50..85145a763021 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -129,8 +129,8 @@ static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, memset(&auxdata, 0, sizeof(auxdata)); auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; - auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + auxdata.last_write_time = timespec64_to_timespec(cifsi->vfs_inode.i_mtime); + auxdata.last_change_time = timespec64_to_timespec(cifsi->vfs_inode.i_ctime); cifsi->fscache = fscache_acquire_cookie(tcon->fscache, @@ -166,8 +166,8 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) if (cifsi->fscache) { memset(&auxdata, 0, sizeof(auxdata)); auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; 
- auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + auxdata.last_write_time = timespec64_to_timespec(cifsi->vfs_inode.i_mtime); + auxdata.last_change_time = timespec64_to_timespec(cifsi->vfs_inode.i_ctime); cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3c371f7f5963..a2cfb33e85c1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -95,6 +95,7 @@ static void cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct timespec ts; cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); @@ -113,7 +114,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* revalidate if mtime or size have changed */ - if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && + ts = timespec64_to_timespec(inode->i_mtime); + if (timespec_equal(&ts, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); @@ -162,9 +164,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); - inode->i_atime = fattr->cf_atime; - inode->i_mtime = fattr->cf_mtime; - inode->i_ctime = fattr->cf_ctime; + inode->i_atime = timespec_to_timespec64(fattr->cf_atime); + inode->i_mtime = timespec_to_timespec64(fattr->cf_mtime); + inode->i_ctime = timespec_to_timespec64(fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; @@ -746,7 +748,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { - if (CIFS_CACHE_READ(CIFS_I(*inode))) { + if (CIFS_CACHE_READ(CIFS_I(*inode)) && + CIFS_I(*inode)->time != 0) { cifs_dbg(FYI, "No need to revalidate cached inode 
sizes\n"); goto cgii_exit; } @@ -1122,14 +1125,14 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, if (attrs->ia_valid & ATTR_ATIME) { set_time = true; info_buf.LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_atime))); } else info_buf.LastAccessTime = 0; if (attrs->ia_valid & ATTR_MTIME) { set_time = true; info_buf.LastWriteTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_mtime))); } else info_buf.LastWriteTime = 0; @@ -1142,7 +1145,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, if (set_time && (attrs->ia_valid & ATTR_CTIME)) { cifs_dbg(FYI, "CIFS - CTIME changed\n"); info_buf.ChangeTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_ctime))); } else info_buf.ChangeTime = 0; @@ -1572,6 +1575,17 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } + server = tcon->ses->server; + +#ifdef CONFIG_CIFS_SMB311 + if ((server->ops->posix_mkdir) && (tcon->posix_extensions)) { + rc = server->ops->posix_mkdir(xid, inode, mode, tcon, full_path, + cifs_sb); + d_drop(direntry); /* for time being always refresh inode info */ + goto mkdir_out; + } +#endif /* SMB311 */ + if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, @@ -1580,8 +1594,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } - server = tcon->ses->server; - if (!server->ops->mkdir) { rc = -ENOSYS; goto mkdir_out; @@ -1791,7 +1803,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, * with unix extensions enabled. 
*/ info_buf_source = - kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + kmalloc_array(2, sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); if (info_buf_source == NULL) { rc = -ENOMEM; @@ -1857,15 +1869,15 @@ cifs_inode_needs_reval(struct inode *inode) struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (cifs_i->time == 0) + return true; + if (CIFS_CACHE_READ(cifs_i)) return false; if (!lookupCacheEnabled) return true; - if (cifs_i->time == 0) - return true; - if (!cifs_sb->actimeo) return true; @@ -2059,8 +2071,8 @@ int cifs_getattr(const struct path *path, struct kstat *stat, /* old CIFS Unix Extensions doesn't return create time */ if (CIFS_I(inode)->createtime) { stat->result_mask |= STATX_BTIME; - stat->btime = - cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime)); + stat->btime = timespec_to_timespec64( + cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime))); } stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED); @@ -2104,10 +2116,14 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static void cifs_setsize(struct inode *inode, loff_t offset) { + struct cifsInodeInfo *cifs_i = CIFS_I(inode); + spin_lock(&inode->i_lock); i_size_write(inode, offset); spin_unlock(&inode->i_lock); + /* Cached inode must be refreshed on truncate */ + cifs_i->time = 0; truncate_pagecache(inode, offset); } @@ -2262,17 +2278,17 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->gid = INVALID_GID; /* no change */ if (attrs->ia_valid & ATTR_ATIME) - args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + args->atime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_atime)); else args->atime = NO_CHANGE_64; if (attrs->ia_valid & ATTR_MTIME) - args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + args->mtime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_mtime)); else args->mtime = NO_CHANGE_64; if (attrs->ia_valid & ATTR_CTIME) - args->ctime = 
cifs_UnixTimeToNT(attrs->ia_ctime); + args->ctime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_ctime)); else args->ctime = NO_CHANGE_64; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 889a840172eb..de41f96aba49 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -421,7 +421,8 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; } - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL, + NULL); if (rc) goto qmf_out_open_fail; @@ -478,7 +479,8 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, + NULL); if (rc) { kfree(utf16_path); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 460084a8eac5..53e8362cbc4a 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -82,6 +82,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); + spin_lock_init(&ret_buf->iface_lock); } return ret_buf; } @@ -102,6 +103,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kzfree(buf_to_free->auth_key.response); + kfree(buf_to_free->iface_list); kzfree(buf_to_free); } @@ -117,6 +119,9 @@ tconInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); spin_lock_init(&ret_buf->open_file_lock); + mutex_init(&ret_buf->crfid.fid_mutex); + ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), + GFP_KERNEL); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -134,6 +139,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); kzfree(buf_to_free->password); + kfree(buf_to_free->crfid.fid); kfree(buf_to_free); } @@ -145,7 
+151,7 @@ cifs_buf_get(void) * SMB2 header is bigger than CIFS one - no problems to clean some * more bytes for CIFS. */ - size_t buf_size = sizeof(struct smb2_hdr); + size_t buf_size = sizeof(struct smb2_sync_hdr); /* * We could use negotiated size instead of max_msgsize - @@ -339,7 +345,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb)) return -EIO; - clc_len = smbCalcSize(smb); + clc_len = smbCalcSize(smb, server); if (4 + rfclen != total_read) { cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", @@ -786,7 +792,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) GFP_KERNEL); if (!bv) { - bv = vmalloc(max_pages * sizeof(struct bio_vec)); + bv = vmalloc(array_size(max_pages, sizeof(struct bio_vec))); if (!bv) return -ENOMEM; } @@ -796,7 +802,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) GFP_KERNEL); if (!pages) { - pages = vmalloc(max_pages * sizeof(struct page *)); + pages = vmalloc(array_size(max_pages, sizeof(struct page *))); if (!pages) { kvfree(bv); return -ENOMEM; @@ -902,3 +908,20 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) crypto_free_shash(*shash); *shash = NULL; } + +/** + * rqst_page_get_length - obtain the length and offset for a page in smb_rqst + * Input: rqst - a smb_rqst, page - a page index for rqst + * Output: *len - the length for this page, *offset - the offset for this page + */ +void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) +{ + *len = rqst->rq_pagesz; + *offset = (page == 0) ? 
rqst->rq_offset : 0; + + if (rqst->rq_npages == 1 || page == rqst->rq_npages-1) + *len = rqst->rq_tailsz; + else if (page == 0) + *len = rqst->rq_pagesz - rqst->rq_offset; +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index cc88f4f0325e..d7ad0dfe4e68 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -903,7 +903,7 @@ map_smb_to_linux_error(char *buf, bool logErr) * portion, the number of word parameters and the data portion of the message */ unsigned int -smbCalcSize(void *buf) +smbCalcSize(void *buf, struct TCP_Server_Info *server) { struct smb_hdr *ptr = (struct smb_hdr *)buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a27fc8791551..eeab81c9452f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -650,7 +650,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, char *cur_ent; char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( - cfile->srch_inf.ntwrk_buf_start); + cfile->srch_inf.ntwrk_buf_start, + server); cur_ent = cfile->srch_inf.srch_entries_start; first_entry_in_buffer = cfile->srch_inf.index_of_last_entry @@ -831,7 +832,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); max_len = tcon->ses->server->ops->calc_smb_size( - cifsFile->srch_inf.ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start, + tcon->ses->server); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index aff8ce8ba34d..646dcd149de1 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -107,6 +107,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) if (compare_mid(mid->mid, buf) && mid->mid_state == MID_REQUEST_SUBMITTED && le16_to_cpu(mid->command) == buf->Command) { + kref_get(&mid->refcount); 
spin_unlock(&GlobalMid_Lock); return mid; } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 12af5dba742b..4ed10dd086e6 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -41,7 +41,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, int rc; __le16 *smb2_path; struct smb2_file_all_info *smb2_data = NULL; - __u8 smb2_oplock[17]; + __u8 smb2_oplock; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; @@ -59,12 +59,10 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, } oparms->desired_access |= FILE_READ_ATTRIBUTES; - *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; + smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) - memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); - - rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL); + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, + NULL); if (rc) goto out; @@ -100,7 +98,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, move_smb2_info_to_cifs(buf, smb2_data); } - *oplock = *smb2_oplock; + *oplock = smb2_oplock; out: kfree(smb2_data); kfree(smb2_path); diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 401a5d856636..0ffa18094335 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -61,9 +61,4 @@ /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 -static inline struct smb2_sync_hdr *get_sync_hdr(void *buf) -{ - return &(((struct smb2_hdr *)buf)->sync_hdr); -} - #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1238cd3552f9..d01ad706d7fc 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -44,26 +44,39 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, __u32 create_options, void *data, int command) { int rc, tmprc = 0; - __le16 *utf16_path; + __le16 *utf16_path = NULL; __u8 oplock = 
SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; + bool use_cached_root_handle = false; - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; + if ((strcmp(full_path, "") == 0) && (create_options == 0) && + (desired_access == FILE_READ_ATTRIBUTES) && + (create_disposition == FILE_OPEN) && + (tcon->nohandlecache == false)) { + rc = open_shroot(xid, tcon, &fid); + if (rc == 0) + use_cached_root_handle = true; + } + + if (use_cached_root_handle == false) { + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = create_disposition; - oparms.create_options = create_options; - oparms.fid = &fid; - oparms.reconnect = false; + oparms.tcon = tcon; + oparms.desired_access = desired_access; + oparms.disposition = create_disposition; + oparms.create_options = create_options; + oparms.fid = &fid; + oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); - if (rc) { - kfree(utf16_path); - return rc; + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, + NULL); + if (rc) { + kfree(utf16_path); + return rc; + } } switch (command) { @@ -107,7 +120,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, break; } - rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + if (use_cached_root_handle == false) + rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (tmprc) rc = tmprc; kfree(utf16_path); diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 3bfc9c990724..20a2d304c603 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -27,6 +27,7 @@ #include "smb2proto.h" #include "smb2status.h" #include "smb2glob.h" +#include "trace.h" struct status_to_posix_error { __le32 smb2_status; @@ -2450,13 +2451,16 @@ smb2_print_status(__le32 status) int 
map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; unsigned int i; int rc = -EIO; __le32 smb2err = shdr->Status; - if (smb2err == 0) + if (smb2err == 0) { + trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); return 0; + } /* mask facility */ if (log_err && (smb2err != STATUS_MORE_PROCESSING_REQUIRED) && @@ -2478,5 +2482,8 @@ map_smb2_to_linux_error(char *buf, bool log_err) cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n", __le32_to_cpu(smb2err), rc); + trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 68ea8491c160..3ff7cec2da81 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -94,8 +94,8 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; #ifdef CONFIG_CIFS_SMB311 -static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, - size_t hdr_preamble_size) +static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, + __u32 non_ctxlen) { __u16 neg_count; __u32 nc_offset, size_of_pad_before_neg_ctxts; @@ -109,12 +109,11 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, /* Make sure that negotiate contexts start after gss security blob */ nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset); - if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) { + if (nc_offset < non_ctxlen) { printk_once(KERN_WARNING "invalid negotiate context offset\n"); return 0; } - size_of_pad_before_neg_ctxts = nc_offset - - (non_ctxlen - hdr_preamble_size); + size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen; /* Verify that at least minimal negotiate contexts fit within frame */ if (len < nc_offset + (neg_count * sizeof(struct 
smb2_neg_context))) { @@ -131,25 +130,20 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, #endif /* CIFS_SMB311 */ int -smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) +smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; - struct smb2_hdr *hdr = &pdu->hdr; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr; __u64 mid; - __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; - - /* BB disable following printk later */ - cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n", - __func__, length, len); + int pdu_size = sizeof(struct smb2_sync_pdu); + int hdr_size = sizeof(struct smb2_sync_hdr); /* * Add function to do table lookup of StructureSize by command * ie Validate the wct via smb2_struct_sizes table above */ - if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; @@ -173,8 +167,8 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } mid = le64_to_cpu(shdr->MessageId); - if (length < sizeof(struct smb2_pdu)) { - if ((length >= sizeof(struct smb2_hdr)) + if (len < pdu_size) { + if ((len >= hdr_size) && (shdr->Status != 0)) { pdu->StructureSize2 = 0; /* @@ -187,8 +181,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } return 1; } - if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - - srvr->vals->header_preamble_size) { + if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE) { cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", mid); return 1; @@ -227,44 +220,38 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) } } - if (srvr->vals->header_preamble_size + len != length) { - cifs_dbg(VFS, "Total length %u 
RFC1002 length %zu mismatch mid %llu\n", - length, srvr->vals->header_preamble_size + len, mid); - return 1; - } - - clc_len = smb2_calc_size(hdr); + clc_len = smb2_calc_size(buf, srvr); #ifdef CONFIG_CIFS_SMB311 if (shdr->Command == SMB2_NEGOTIATE) - clc_len += get_neg_ctxt_len(hdr, len, clc_len, - srvr->vals->header_preamble_size); + clc_len += get_neg_ctxt_len(shdr, len, clc_len); #endif /* SMB311 */ - if (srvr->vals->header_preamble_size + len != clc_len) { - cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n", - clc_len, srvr->vals->header_preamble_size + len, mid); + if (len != clc_len) { + cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", + clc_len, len, mid); /* create failed on symlink */ if (command == SMB2_CREATE_HE && shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ - if (clc_len + 24 - srvr->vals->header_preamble_size == len && command == SMB2_OPLOCK_BREAK_HE) + if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; /* server can return one byte more due to implied bcc[0] */ - if (clc_len == srvr->vals->header_preamble_size + len + 1) + if (clc_len == len + 1) return 0; /* * MacOS server pads after SMB2.1 write response with 3 bytes * of junk. Other servers match RFC1001 len to actual * SMB2/SMB3 frame length (header + smb2 response specific data) + * Some windows servers do too when compounding is used. * Log the server error (once), but allow it and continue * since the frame is parseable. 
*/ - if (clc_len < srvr->vals->header_preamble_size /* RFC1001 header size */ + len) { + if (clc_len < len) { printk_once(KERN_WARNING - "SMB2 server sent bad RFC1001 len %d not %zu\n", - len, clc_len - srvr->vals->header_preamble_size); + "SMB2 server sent bad RFC1001 len %d not %d\n", + len, clc_len); return 0; } @@ -305,15 +292,14 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { * area and the offset to it (from the beginning of the smb are also returned. */ char * -smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) { - struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); *off = 0; *len = 0; /* error responses do not have data area */ if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED && - (((struct smb2_err_rsp *)hdr)->StructureSize) == + (((struct smb2_err_rsp *)shdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2) return NULL; @@ -325,42 +311,44 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) switch (shdr->Command) { case SMB2_NEGOTIATE: *off = le16_to_cpu( - ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset); + ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferOffset); *len = le16_to_cpu( - ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferLength); + ((struct smb2_negotiate_rsp *)shdr)->SecurityBufferLength); break; case SMB2_SESSION_SETUP: *off = le16_to_cpu( - ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferOffset); + ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferOffset); *len = le16_to_cpu( - ((struct smb2_sess_setup_rsp *)hdr)->SecurityBufferLength); + ((struct smb2_sess_setup_rsp *)shdr)->SecurityBufferLength); break; case SMB2_CREATE: *off = le32_to_cpu( - ((struct smb2_create_rsp *)hdr)->CreateContextsOffset); + ((struct smb2_create_rsp *)shdr)->CreateContextsOffset); *len = le32_to_cpu( - ((struct smb2_create_rsp *)hdr)->CreateContextsLength); + ((struct smb2_create_rsp 
*)shdr)->CreateContextsLength); break; case SMB2_QUERY_INFO: *off = le16_to_cpu( - ((struct smb2_query_info_rsp *)hdr)->OutputBufferOffset); + ((struct smb2_query_info_rsp *)shdr)->OutputBufferOffset); *len = le32_to_cpu( - ((struct smb2_query_info_rsp *)hdr)->OutputBufferLength); + ((struct smb2_query_info_rsp *)shdr)->OutputBufferLength); break; case SMB2_READ: - *off = ((struct smb2_read_rsp *)hdr)->DataOffset; - *len = le32_to_cpu(((struct smb2_read_rsp *)hdr)->DataLength); + /* TODO: is this a bug ? */ + *off = ((struct smb2_read_rsp *)shdr)->DataOffset; + *len = le32_to_cpu(((struct smb2_read_rsp *)shdr)->DataLength); break; case SMB2_QUERY_DIRECTORY: *off = le16_to_cpu( - ((struct smb2_query_directory_rsp *)hdr)->OutputBufferOffset); + ((struct smb2_query_directory_rsp *)shdr)->OutputBufferOffset); *len = le32_to_cpu( - ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); + ((struct smb2_query_directory_rsp *)shdr)->OutputBufferLength); break; case SMB2_IOCTL: *off = le32_to_cpu( - ((struct smb2_ioctl_rsp *)hdr)->OutputOffset); - *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount); + ((struct smb2_ioctl_rsp *)shdr)->OutputOffset); + *len = le32_to_cpu( + ((struct smb2_ioctl_rsp *)shdr)->OutputCount); break; case SMB2_CHANGE_NOTIFY: default: @@ -403,15 +391,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * portion, the number of word parameters and the data portion of the message. 
*/ unsigned int -smb2_calc_size(void *buf) +smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; - struct smb2_hdr *hdr = &pdu->hdr; - struct smb2_sync_hdr *shdr = get_sync_hdr(hdr); + struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf; + struct smb2_sync_hdr *shdr = &pdu->sync_hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ - int len = 4 + le16_to_cpu(shdr->StructureSize); + int len = le16_to_cpu(shdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already @@ -422,7 +409,7 @@ smb2_calc_size(void *buf) if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); + smb2_get_data_area_len(&offset, &data_length, shdr); cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset); if (data_length > 0) { @@ -430,15 +417,14 @@ smb2_calc_size(void *buf) * Check to make sure that data area begins after fixed area, * Note that last byte of the fixed area is part of data area * for some commands, typically those with odd StructureSize, - * so we must add one to the calculation (and 4 to account for - * the size of the RFC1001 hdr. + * so we must add one to the calculation. 
*/ - if (offset + 4 + 1 < len) { + if (offset + 1 < len) { cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n", - offset + 4 + 1, len); + offset + 1, len); data_length = 0; } else { - len = 4 + offset + data_length; + len = offset + data_length; } } calc_size_exit: @@ -465,8 +451,17 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) /* Windows doesn't allow paths beginning with \ */ if (from[0] == '\\') start_of_path = from + 1; +#ifdef CONFIG_CIFS_SMB311 + /* SMB311 POSIX extensions paths do not include leading slash */ + else if (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && + (from[0] == '/')) { + start_of_path = from + 1; + } +#endif /* 311 */ else start_of_path = from; + to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, cifs_sb->local_nls, map_type); return to; @@ -498,10 +493,11 @@ cifs_ses_oplock_break(struct work_struct *work) { struct smb2_lease_break_work *lw = container_of(work, struct smb2_lease_break_work, lease_break); - int rc; + int rc = 0; rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, lw->lease_state); + cifs_dbg(FYI, "Lease release rc %d\n", rc); cifs_put_tlink(lw->tlink); kfree(lw); @@ -567,6 +563,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, open->oplock = lease_state; } + return found; } @@ -609,6 +606,18 @@ smb2_is_valid_lease_break(char *buffer) return true; } spin_unlock(&tcon->open_file_lock); + + if (tcon->crfid.is_valid && + !memcmp(rsp->LeaseKey, + tcon->crfid.fid->lease_key, + SMB2_LEASE_KEY_SIZE)) { + INIT_WORK(&tcon->crfid.lease_break, + smb2_cached_lease_break); + queue_work(cifsiod_wq, + &tcon->crfid.lease_break); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } } } } @@ -621,7 +630,7 @@ smb2_is_valid_lease_break(char *buffer) bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { - struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer; + struct 
smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -630,7 +639,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != @@ -721,7 +730,7 @@ smb2_cancelled_close_fid(struct work_struct *work) int smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer); + struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer; struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; struct cifs_tcon *tcon; struct close_cancelled_open *cancelled; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9c6d95ffca97..ea92a38b2f08 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -123,7 +123,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) static unsigned int smb2_get_credits(struct mid_q_entry *mid) { - struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; return le16_to_cpu(shdr->CreditRequest); } @@ -190,7 +190,7 @@ static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { struct mid_q_entry *mid; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { @@ -203,6 +203,7 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) if ((mid->mid == wire_mid) && (mid->mid_state == MID_REQUEST_SUBMITTED) && (mid->command == shdr->Command)) { + kref_get(&mid->refcount); spin_unlock(&GlobalMid_Lock); return mid; } @@ -212,15 +213,16 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) } static void 
-smb2_dump_detail(void *buf) +smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, shdr->ProcessId); - cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf)); + cifs_dbg(VFS, "smb buf %p len %u\n", buf, + server->ops->calc_smb_size(buf, server)); #endif } @@ -293,34 +295,226 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) return rsize; } -#ifdef CONFIG_CIFS_STATS2 + +static int +parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, + size_t buf_len, + struct cifs_server_iface **iface_list, + size_t *iface_count) +{ + struct network_interface_info_ioctl_rsp *p; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct iface_info_ipv4 *p4; + struct iface_info_ipv6 *p6; + struct cifs_server_iface *info; + ssize_t bytes_left; + size_t next = 0; + int nb_iface = 0; + int rc = 0; + + *iface_list = NULL; + *iface_count = 0; + + /* + * Fist pass: count and sanity check + */ + + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + nb_iface++; + next = le32_to_cpu(p->Next); + if (!next) { + bytes_left -= sizeof(*p); + break; + } + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); + rc = -EINVAL; + goto out; + } + + if (bytes_left || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); + + + /* + * Second pass: extract info to internal structure + */ + + *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); + if (!*iface_list) { + rc = -ENOMEM; + goto out; + } + + info = *iface_list; + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + info->speed = 
le64_to_cpu(p->LinkSpeed); + info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE); + info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE); + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + switch (p->Family) { + /* + * The kernel and wire socket structures have the same + * layout and use network byte order but make the + * conversion explicit in case either one changes. + */ + case INTERNETWORK: + addr4 = (struct sockaddr_in *)&info->sockaddr; + p4 = (struct iface_info_ipv4 *)p->Buffer; + addr4->sin_family = AF_INET; + memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); + + /* [MS-SMB2] 2.2.32.5.1.1 Clients MUST ignore these */ + addr4->sin_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv4 %pI4\n", __func__, + &addr4->sin_addr); + break; + case INTERNETWORKV6: + addr6 = (struct sockaddr_in6 *)&info->sockaddr; + p6 = (struct iface_info_ipv6 *)p->Buffer; + addr6->sin6_family = AF_INET6; + memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); + + /* [MS-SMB2] 2.2.32.5.1.2 Clients MUST ignore these */ + addr6->sin6_flowinfo = 0; + addr6->sin6_scope_id = 0; + addr6->sin6_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv6 %pI6\n", __func__, + &addr6->sin6_addr); + break; + default: + cifs_dbg(VFS, + "%s: skipping unsupported socket family\n", + __func__); + goto next_iface; + } + + (*iface_count)++; + info++; +next_iface: + next = le32_to_cpu(p->Next); + if (!next) + break; + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!*iface_count) { + rc = -EINVAL; + goto out; + } + +out: + if (rc) { + kfree(*iface_list); + *iface_count = 0; + *iface_list = NULL; + } + return rc; +} + + static int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; - struct 
network_interface_info_ioctl_rsp *out_buf; + struct network_interface_info_ioctl_rsp *out_buf = NULL; + struct cifs_server_iface *iface_list; + size_t iface_count; + struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); - if (rc != 0) + if (rc != 0) { cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); - else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) { - cifs_dbg(VFS, "server returned bad net interface info buf\n"); - rc = -EINVAL; - } else { - /* Dump info on first interface */ - cifs_dbg(FYI, "Adapter Capability 0x%x\t", - le32_to_cpu(out_buf->Capability)); - cifs_dbg(FYI, "Link Speed %lld\n", - le64_to_cpu(out_buf->LinkSpeed)); + goto out; } + + rc = parse_server_interfaces(out_buf, ret_data_len, + &iface_list, &iface_count); + if (rc) + goto out; + + spin_lock(&ses->iface_lock); + kfree(ses->iface_list); + ses->iface_list = iface_list; + ses->iface_count = iface_count; + ses->iface_last_update = jiffies; + spin_unlock(&ses->iface_lock); + +out: kfree(out_buf); return rc; } -#endif /* STATS2 */ + +void +smb2_cached_lease_break(struct work_struct *work) +{ + struct cached_fid *cfid = container_of(work, + struct cached_fid, lease_break); + mutex_lock(&cfid->fid_mutex); + if (cfid->is_valid) { + cifs_dbg(FYI, "clear cached root file handle\n"); + SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, + cfid->fid->volatile_fid); + cfid->is_valid = false; + } + mutex_unlock(&cfid->fid_mutex); +} + +/* + * Open the directory at the root of a share + */ +int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) +{ + struct cifs_open_parms oparams; + int rc; + __le16 srch_path = 0; /* Null - since an open of top of share */ + u8 oplock = SMB2_OPLOCK_LEVEL_II; + + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.is_valid) { + cifs_dbg(FYI, 
"found a cached root file handle\n"); + memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); + return 0; + } + + oparams.tcon = tcon; + oparams.create_options = 0; + oparams.desired_access = FILE_READ_ATTRIBUTES; + oparams.disposition = FILE_OPEN; + oparams.fid = pfid; + oparams.reconnect = false; + + rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); + if (rc == 0) { + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; + } + mutex_unlock(&tcon->crfid.fid_mutex); + return rc; +} static void smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) @@ -330,6 +524,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; + bool no_cached_open = tcon->nohandlecache; oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -338,13 +533,16 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + if (no_cached_open) + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, + NULL); + else + rc = open_shroot(xid, tcon, &fid); + if (rc) return; -#ifdef CONFIG_CIFS_STATS2 SMB3_request_interfaces(xid, tcon); -#endif /* STATS2 */ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); @@ -352,7 +550,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) FS_DEVICE_INFORMATION); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + if (no_cached_open) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return; } @@ -372,7 +571,7 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) oparms.fid = &fid; oparms.reconnect = false; - rc = 
SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL); if (rc) return; @@ -394,6 +593,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; + if ((*full_path == 0) && tcon->crfid.is_valid) + return 0; + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -405,7 +607,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); if (rc) { kfree(utf16_path); return rc; @@ -554,7 +756,7 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); kfree(utf16_path); if (rc) { cifs_dbg(FYI, "open failed rc=%d\n", rc); @@ -633,7 +835,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); kfree(utf16_path); if (rc) { cifs_dbg(FYI, "open failed rc=%d\n", rc); @@ -654,6 +856,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea, len); + kfree(ea); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); return rc; @@ -704,9 +908,11 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " TRIM-support,"); seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); + seq_printf(m, "\n\ttid: 0x%x", tcon->tid); if (tcon->perf_sector_size) seq_printf(m, "\tOptimal sector size: 0x%x", tcon->perf_sector_size); + 
seq_printf(m, "\tMaximal Access: 0x%x", tcon->maximal_access); } static void @@ -1215,7 +1421,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); kfree(utf16_path); if (rc) { cifs_dbg(FYI, "open dir failed rc=%d\n", rc); @@ -1257,7 +1463,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; if (shdr->Status != STATUS_PENDING) return false; @@ -1275,12 +1481,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) static bool smb2_is_session_expired(char *buf) { - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED) + if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED && + shdr->Status != STATUS_USER_SESSION_DELETED) return false; - cifs_dbg(FYI, "Session expired\n"); + cifs_dbg(FYI, "Session expired or deleted\n"); return true; } @@ -1314,7 +1521,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL); if (rc) return rc; buf->f_type = SMB2_MAGIC_NUMBER; @@ -1468,14 +1675,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf; + struct smb2_err_rsp *err_buf = NULL; + int resp_buftype; struct smb2_symlink_err_rsp *symlink; unsigned int sub_len; unsigned int sub_offset; unsigned int print_len; unsigned int 
print_offset; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1490,18 +1696,18 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov); - + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, + &resp_buftype); if (!rc || !err_iov.iov_base) { - kfree(utf16_path); - return -ENOENT; + rc = -ENOENT; + goto querty_exit; } err_buf = err_iov.iov_base; if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { - kfree(utf16_path); - return -ENOENT; + err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { + rc = -ENOENT; + goto querty_exit; } /* open must fail on symlink - reset rc */ @@ -1512,27 +1718,29 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, print_len = le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); - if (err_iov.iov_len + server->vals->header_preamble_size < - SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - kfree(utf16_path); - return -ENOENT; + if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { + rc = -ENOENT; + goto querty_exit; } - if (err_iov.iov_len + server->vals->header_preamble_size < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - kfree(utf16_path); - return -ENOENT; + if (err_iov.iov_len < + SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { + rc = -ENOENT; + goto querty_exit; } *target_path = cifs_strndup_from_utf16( (char *)symlink->PathBuffer + sub_offset, sub_len, true, cifs_sb->local_nls); if (!(*target_path)) { - kfree(utf16_path); - return -ENOMEM; + rc = -ENOMEM; + goto querty_exit; } convert_delimiter(*target_path, '/'); cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + + querty_exit: + 
free_rsp_buf(resp_buftype, err_buf); kfree(utf16_path); return rc; } @@ -1593,8 +1801,11 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.create_options = 0; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return ERR_PTR(-ENOMEM); + if (!utf16_path) { + rc = -ENOMEM; + free_xid(xid); + return ERR_PTR(rc); + } oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; @@ -1602,7 +1813,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); kfree(utf16_path); if (!rc) { rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid, @@ -1652,8 +1863,11 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, access_flags = WRITE_DAC; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return -ENOMEM; + if (!utf16_path) { + rc = -ENOMEM; + free_xid(xid); + return rc; + } oparms.tcon = tcon; oparms.desired_access = access_flags; @@ -1662,7 +1876,7 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); kfree(utf16_path); if (!rc) { rc = SMB2_set_acl(xid, tlink_tcon(tlink), fid.persistent_fid, @@ -1713,15 +1927,21 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) - if (keep_size == false) - return -EOPNOTSUPP; + if (keep_size == false) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } /* * Must check if file sparse since fallocate -z (zero range) assumes * non-sparse allocation */ - if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) - return -EOPNOTSUPP; + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { + rc = 
-EOPNOTSUPP; + free_xid(xid); + return rc; + } /* * need to make sure we are not asked to extend the file since the SMB3 @@ -1730,8 +1950,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, * which for a non sparse file would zero the newly extended range */ if (keep_size == false) - if (i_size_read(inode) < offset + len) - return -EOPNOTSUPP; + if (i_size_read(inode) < offset + len) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } cifs_dbg(FYI, "offset %lld len %lld", offset, len); @@ -1764,8 +1987,11 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, /* Need to make file sparse, if not already, before freeing range. */ /* Consider adding equivalent for compressed since it could also work */ - if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) - return -EOPNOTSUPP; + if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } cifs_dbg(FYI, "offset %lld len %lld", offset, len); @@ -1796,8 +2022,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) - if (keep_size == false) - return -EOPNOTSUPP; + if (keep_size == false) { + free_xid(xid); + return rc; + } /* * Files are non-sparse by default so falloc may be a no-op @@ -1806,14 +2034,16 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { if (keep_size == true) - return 0; + rc = 0; /* check if extending file */ else if (i_size_read(inode) >= off + len) /* not extending file and already not sparse */ - return 0; + rc = 0; /* BB: in future add else clause to extend file */ else - return -EOPNOTSUPP; + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; } if ((keep_size == true) || (i_size_read(inode) >= off + len)) { @@ -1825,8 +2055,11 @@ static long smb3_simple_falloc(struct file *file, struct 
cifs_tcon *tcon, * ie potentially making a few extra pages at the beginning * or end of the file non-sparse via set_sparse is harmless. */ - if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) - return -EOPNOTSUPP; + if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = smb2_set_sparse(xid, tcon, cfile, inode, false); } @@ -1989,8 +2222,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) if (!buf) return NULL; - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); buf->ccontext.DataOffset = cpu_to_le16(offsetof @@ -2016,8 +2248,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) if (!buf) return NULL; - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); buf->ccontext.DataOffset = cpu_to_le16(offsetof @@ -2035,7 +2266,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) } static __u8 -smb2_parse_lease_buf(void *buf, unsigned int *epoch) +smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) { struct create_lease *lc = (struct create_lease *)buf; @@ -2046,13 +2277,15 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch) } static __u8 -smb3_parse_lease_buf(void *buf, unsigned int *epoch) +smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) { struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; *epoch = le16_to_cpu(lc->lcontext.Epoch); if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) return SMB2_OPLOCK_LEVEL_NOCHANGE; + if (lease_key) + memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); return 
le32_to_cpu(lc->lcontext.LeaseState); } @@ -2070,12 +2303,11 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile) } static void -fill_transform_hdr(struct TCP_Server_Info *server, - struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq) +fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, + struct smb_rqst *old_rq) { struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; - unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); + (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -2083,8 +2315,6 @@ fill_transform_hdr(struct TCP_Server_Info *server, tr_hdr->Flags = cpu_to_le16(0x01); get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); - inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - server->vals->header_preamble_size); - inc_rfc1001_len(tr_hdr, orig_len); } /* We can not use the normal sg_set_buf() as we will sometimes pass a @@ -2096,11 +2326,15 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } +/* Assumes: + * rqst->rq_iov[0] is transform header + * rqst->rq_iov[1+] data to be encrypted/decrypted + */ static struct scatterlist * init_sg(struct smb_rqst *rqst, u8 *sign) { unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1; - unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; struct scatterlist *sg; unsigned int i; unsigned int j; @@ -2110,14 +2344,15 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return NULL; sg_init_table(sg, sg_len); - smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len); + smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 20, assoc_data_len); for (i = 1; i < rqst->rq_nvec; i++) 
smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); for (j = 0; i < sg_len - 1; i++, j++) { - unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz - : rqst->rq_tailsz; - sg_set_page(&sg[i], rqst->rq_pages[j], len, 0); + unsigned int len, offset; + + rqst_page_get_length(rqst, j, &len, &offset); + sg_set_page(&sg[i], rqst->rq_pages[j], len, offset); } smb2_sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE); return sg; @@ -2144,9 +2379,9 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) return 1; } /* - * Encrypt or decrypt @rqst message. @rqst has the following format: - * iov[0] - transform header (associate data), - * iov[1-N] and pages - data to encrypt. + * Encrypt or decrypt @rqst message. @rqst[0] has the following format: + * iov[0] - transform header (associate data), + * iov[1-N] - SMB2 header and pages - data to encrypt. * On success return encrypted data in iov[1-N] and pages, leave iov[0] * untouched. */ @@ -2155,7 +2390,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) { struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; - unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20 - server->vals->header_preamble_size; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; @@ -2250,6 +2485,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct page **pages; struct smb2_transform_hdr *tr_hdr; unsigned int npages = old_rq->rq_npages; + unsigned int orig_len; int i; int rc = -ENOMEM; @@ -2258,6 +2494,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, return rc; new_rq->rq_pages = pages; + new_rq->rq_offset = old_rq->rq_offset; new_rq->rq_npages = old_rq->rq_npages; new_rq->rq_pagesz = old_rq->rq_pagesz; new_rq->rq_tailsz = old_rq->rq_tailsz; @@ 
-2268,31 +2505,39 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, goto err_free_pages; } - iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL); + iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec), + GFP_KERNEL); if (!iov) goto err_free_pages; - /* copy all iovs from the old except the 1st one (rfc1002 length) */ - memcpy(&iov[1], &old_rq->rq_iov[1], - sizeof(struct kvec) * (old_rq->rq_nvec - 1)); + /* copy all iovs from the old */ + memcpy(&iov[1], &old_rq->rq_iov[0], + sizeof(struct kvec) * old_rq->rq_nvec); + new_rq->rq_iov = iov; - new_rq->rq_nvec = old_rq->rq_nvec; + new_rq->rq_nvec = old_rq->rq_nvec + 1; tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL); if (!tr_hdr) goto err_free_iov; - /* fill the 1st iov with a transform header */ - fill_transform_hdr(server, tr_hdr, old_rq); + orig_len = smb_rqst_len(server, old_rq); + + /* fill the 2nd iov with a transform header */ + fill_transform_hdr(tr_hdr, orig_len, old_rq); new_rq->rq_iov[0].iov_base = tr_hdr; new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); /* copy pages form the old */ for (i = 0; i < npages; i++) { - char *dst = kmap(new_rq->rq_pages[i]); - char *src = kmap(old_rq->rq_pages[i]); - unsigned int len = (i < npages - 1) ? 
new_rq->rq_pagesz : - new_rq->rq_tailsz; + char *dst, *src; + unsigned int offset, len; + + rqst_page_get_length(new_rq, i, &len, &offset); + + dst = (char *) kmap(new_rq->rq_pages[i]) + offset; + src = (char *) kmap(old_rq->rq_pages[i]) + offset; + memcpy(dst, src, len); kunmap(new_rq->rq_pages[i]); kunmap(old_rq->rq_pages[i]); @@ -2344,7 +2589,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; - struct smb2_hdr *hdr; int rc; iov[0].iov_base = buf; @@ -2365,10 +2609,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, if (rc) return rc; - memmove(buf + server->vals->header_preamble_size, iov[1].iov_base, buf_data_size); - hdr = (struct smb2_hdr *)buf; - hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size); - server->total_read = buf_data_size + page_data_size + server->vals->header_preamble_size; + memmove(buf, iov[1].iov_base, buf_data_size); + + server->total_read = buf_data_size + page_data_size; return rc; } @@ -2393,7 +2636,7 @@ read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, zero_user(page, len, PAGE_SIZE - len); len = 0; } - length = cifs_read_page_from_socket(server, page, n); + length = cifs_read_page_from_socket(server, page, 0, n); if (length < 0) return length; server->total_read += length; @@ -2441,7 +2684,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int cur_page_idx; unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; - struct smb2_sync_hdr *shdr = get_sync_hdr(buf); + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; struct bio_vec *bvec = NULL; struct iov_iter iter; struct kvec iov; @@ -2472,7 +2715,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - data_offset = server->ops->read_data_offset(buf) + server->vals->header_preamble_size; + data_offset = server->ops->read_data_offset(buf); #ifdef CONFIG_CIFS_SMB_DIRECT 
use_rdma_mr = rdata->mr; #endif @@ -2568,12 +2811,11 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int npages; struct page **pages; unsigned int len; - unsigned int buflen = server->pdu_size + server->vals->header_preamble_size; + unsigned int buflen = server->pdu_size; int rc; int i = 0; - len = min_t(unsigned int, buflen, server->vals->read_rsp_size - - server->vals->header_preamble_size + + len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); @@ -2581,8 +2823,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) return rc; server->total_read += rc; - len = le32_to_cpu(tr_hdr->OriginalMessageSize) + - server->vals->header_preamble_size - + len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; npages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -2609,8 +2850,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) if (rc) goto free_pages; - rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - - server->vals->header_preamble_size, + rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, pages, npages, len); if (rc) goto free_pages; @@ -2647,7 +2887,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid_entry; /* switch to large buffer if too big for a small one */ - if (pdu_length + server->vals->header_preamble_size > MAX_CIFS_SMALL_BUFFER_SIZE) { + if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE) { server->large_buf = true; memcpy(server->bigbuf, buf, server->total_read); buf = server->bigbuf; @@ -2655,13 +2895,12 @@ receive_encrypted_standard(struct TCP_Server_Info *server, /* now read the rest */ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, - pdu_length - HEADER_SIZE(server) + 1 + - server->vals->header_preamble_size); + 
pdu_length - HEADER_SIZE(server) + 1); if (length < 0) return length; server->total_read += length; - buf_size = pdu_length + server->vals->header_preamble_size - sizeof(struct smb2_transform_hdr); + buf_size = pdu_length - sizeof(struct smb2_transform_hdr); length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); if (length) return length; @@ -2690,7 +2929,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - if (pdu_length + server->vals->header_preamble_size < sizeof(struct smb2_transform_hdr) + + if (pdu_length < sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_sync_hdr)) { cifs_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); @@ -2699,14 +2938,14 @@ smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) return -ECONNABORTED; } - if (pdu_length + server->vals->header_preamble_size < orig_len + sizeof(struct smb2_transform_hdr)) { + if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) { cifs_dbg(VFS, "Transform message is broken\n"); cifs_reconnect(server); wake_up(&server->response_q); return -ECONNABORTED; } - if (pdu_length + server->vals->header_preamble_size > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) return receive_encrypted_read(server, mid); return receive_encrypted_standard(server, mid); @@ -2717,11 +2956,23 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? 
server->bigbuf : server->smallbuf; - return handle_read_data(server, mid, buf, server->pdu_size + - server->vals->header_preamble_size, + return handle_read_data(server, mid, buf, server->pdu_size, NULL, 0, 0); } +static int +smb2_next_header(char *buf) +{ + struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf; + struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; + + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) + return sizeof(struct smb2_transform_hdr) + + le32_to_cpu(t_hdr->OriginalMessageSize); + + return le32_to_cpu(hdr->NextCommand); +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -2813,6 +3064,7 @@ struct smb_version_operations smb20_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; struct smb_version_operations smb21_operations = { @@ -2907,6 +3159,7 @@ struct smb_version_operations smb21_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; struct smb_version_operations smb30_operations = { @@ -3011,6 +3264,7 @@ struct smb_version_operations smb30_operations = { .get_acl_by_fid = get_smb2_acl_by_fid, .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ + .next_header = smb2_next_header, }; #ifdef CONFIG_CIFS_SMB311 @@ -3058,6 +3312,7 @@ struct smb_version_operations smb311_operations = { .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, + .posix_mkdir = smb311_posix_mkdir, .rmdir = smb2_rmdir, .unlink = smb2_unlink, .rename = smb2_rename_path, @@ -3111,6 +3366,7 @@ struct smb_version_operations smb311_operations = { .query_all_EAs = smb2_query_eas, .set_EA = smb2_set_ea, #endif /* CIFS_XATTR */ + .next_header = smb2_next_header, }; #endif /* CIFS_SMB311 */ @@ -3122,8 +3378,8 @@ struct smb_version_values smb20_values = { 
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3143,8 +3399,8 @@ struct smb_version_values smb21_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3164,8 +3420,8 @@ struct smb_version_values smb3any_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3185,8 +3441,8 @@ struct smb_version_values smbdefault_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3206,8 +3462,8 @@ struct smb_version_values smb30_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = 
SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3227,8 +3483,8 @@ struct smb_version_values smb302_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, @@ -3249,8 +3505,8 @@ struct smb_version_values smb311_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_hdr), - .header_preamble_size = 4, + .header_size = sizeof(struct smb2_sync_hdr), + .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, .lock_cmd = SMB2_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0f48741a0130..3c92678cb45b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -49,6 +49,7 @@ #include "cifspdu.h" #include "cifs_spnego.h" #include "smbdirect.h" +#include "trace.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -79,7 +80,7 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */ }; -static int encryption_required(const struct cifs_tcon *tcon) +static int smb3_encryption_required(const struct cifs_tcon *tcon) { if (!tcon) return 0; @@ -145,7 +146,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ if (tcon->ses && 
tcon->ses->server && tcon->ses->server->sign && - !encryption_required(tcon)) + !smb3_encryption_required(tcon)) shdr->Flags |= SMB2_FLAGS_SIGNED; out: return; @@ -154,7 +155,7 @@ out: static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) { - int rc = 0; + int rc; struct nls_table *nls_codepage; struct cifs_ses *ses; struct TCP_Server_Info *server; @@ -165,10 +166,10 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * for those three - in the calling routine. */ if (tcon == NULL) - return rc; + return 0; if (smb2_command == SMB2_TREE_CONNECT) - return rc; + return 0; if (tcon->tidStatus == CifsExiting) { /* @@ -211,8 +212,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) return -EAGAIN; } - wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), 10 * HZ); + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + 10 * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to a received" + " signal by the process\n", __func__); + return -ERESTARTSYS; + } /* are we still trying to reconnect? */ if (server->tcpStatus != CifsNeedReconnect) @@ -230,7 +237,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } if (!tcon->ses->need_reconnect && !tcon->need_reconnect) - return rc; + return 0; nls_codepage = load_nls_default(); @@ -339,7 +346,10 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; /* BB eventually switch this to SMB2 specific small buf size */ - *request_buf = cifs_small_buf_get(); + if (smb2_command == SMB2_SET_INFO) + *request_buf = cifs_buf_get(); + else + *request_buf = cifs_small_buf_get(); if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? 
*/ return -ENOMEM; @@ -367,6 +377,7 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, #define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) #define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) static void build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) @@ -390,21 +401,35 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) } static void +build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE; + pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); +} + +static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) { char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT; + unsigned int ctxt_len; + *total_len += 2; /* Add 2 due to round to 8 byte boundary for 1st ctxt */ build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - /* Add 2 to size to round to 8 byte boundary */ + ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; - pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); - req->NegotiateContextCount = cpu_to_le16(2); + ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + *total_len += ctxt_len; + pneg_ctxt += ctxt_len; - *total_len += 4 + sizeof(struct smb2_preauth_neg_context) - + sizeof(struct smb2_encryption_neg_context); + build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); + *total_len += sizeof(struct smb2_posix_neg_context); + + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(3); } static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) @@ -449,12 +474,12 @@ static int decode_encrypt_ctx(struct TCP_Server_Info 
*server, } static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, - struct TCP_Server_Info *server) + struct TCP_Server_Info *server, + unsigned int len_of_smb) { struct smb2_neg_context *pctx; unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset); unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount); - unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length); unsigned int len_of_ctxts, i; int rc = 0; @@ -475,8 +500,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (len_of_ctxts < sizeof(struct smb2_neg_context)) break; - pctx = (struct smb2_neg_context *)(offset + - server->vals->header_preamble_size + (char *)rsp); + pctx = (struct smb2_neg_context *)(offset + (char *)rsp); clen = le16_to_cpu(pctx->DataLength); if (clen > len_of_ctxts) break; @@ -487,6 +511,8 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) rc = decode_encrypt_ctx(server, (struct smb2_encryption_neg_context *)pctx); + else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE) + server->posix_ext_supported = true; else cifs_dbg(VFS, "unknown negcontext of type %d ignored\n", le16_to_cpu(pctx->ContextType)); @@ -501,6 +527,64 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, return rc; } +static struct create_posix * +create_posix_buf(umode_t mode) +{ + struct create_posix *buf; + + buf = kzalloc(sizeof(struct create_posix), + GFP_KERNEL); + if (!buf) + return NULL; + + buf->ccontext.DataOffset = + cpu_to_le16(offsetof(struct create_posix, Mode)); + buf->ccontext.DataLength = cpu_to_le32(4); + buf->ccontext.NameOffset = + cpu_to_le16(offsetof(struct create_posix, Name)); + buf->ccontext.NameLength = cpu_to_le16(16); + + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + buf->Name[0] = 0x93; + buf->Name[1] = 0xAD; + buf->Name[2] = 0x25; + buf->Name[3] = 0x50; + buf->Name[4] = 0x9C; + buf->Name[5] = 0xB4; + 
buf->Name[6] = 0x11; + buf->Name[7] = 0xE7; + buf->Name[8] = 0xB4; + buf->Name[9] = 0x23; + buf->Name[10] = 0x83; + buf->Name[11] = 0xDE; + buf->Name[12] = 0x96; + buf->Name[13] = 0x8B; + buf->Name[14] = 0xCD; + buf->Name[15] = 0x7C; + buf->Mode = cpu_to_le32(mode); + cifs_dbg(FYI, "mode on posix create 0%o", mode); + return buf; +} + +static int +add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + + iov[num].iov_base = create_posix_buf(mode); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = sizeof(struct create_posix); + if (!req->CreateContextsOffset) + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[num - 1].iov_len); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix)); + *num_iovec = num + 1; + return 0; +} + #else static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) @@ -527,6 +611,7 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_negotiate_req *req; struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; @@ -598,7 +683,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -691,7 +780,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, - &rsp->hdr); + (struct smb2_sync_hdr *)rsp); /* * See MS-SMB2 
section 2.2.4: if no blob, client picks default which * for us will be @@ -718,7 +807,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) #ifdef CONFIG_CIFS_SMB311 if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { if (rsp->NegotiateContextCount) - rc = smb311_decode_neg_context(rsp, server); + rc = smb311_decode_neg_context(rsp, server, + rsp_iov.iov_len); else cifs_dbg(VFS, "Missing expected negotiate contexts\n"); } @@ -914,8 +1004,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ - /* to enable echos and oplocks */ - req->sync_hdr.CreditRequest = cpu_to_le16(3); + + /* enough to enable echos and oplocks and one max size write */ + req->sync_hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -951,6 +1042,7 @@ static int SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) { int rc; + struct smb_rqst rqst; struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; struct kvec rsp_iov = { NULL, 0 }; @@ -959,10 +1051,13 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - /* BB add code to build os and lm fields */ + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = sess_data->iov; + rqst.rq_nvec = 2; - rc = smb2_send_recv(sess_data->xid, sess_data->ses, - sess_data->iov, 2, + /* BB add code to build os and lm fields */ + rc = cifs_send_recv(sess_data->xid, sess_data->ses, + &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); @@ -1054,7 +1149,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = 
rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); @@ -1130,13 +1225,13 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) goto out; - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size != + if (offsetof(struct smb2_sess_setup_rsp, Buffer) != le16_to_cpu(rsp->SecurityBufferOffset)) { cifs_dbg(VFS, "Invalid security buffer offset %d\n", le16_to_cpu(rsp->SecurityBufferOffset)); @@ -1151,7 +1246,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); out: @@ -1209,7 +1304,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->hdr.sync_hdr.SessionId; + ses->Suid = rsp->sync_hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); rc = SMB2_sess_establish_session(sess_data); @@ -1276,6 +1371,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, sess_data->ses = ses; sess_data->buf0_type = CIFS_NO_BUFFER; sess_data->nls_cp = (struct nls_table *) nls_cp; + sess_data->previous_session = ses->Suid; #ifdef CONFIG_CIFS_SMB311 /* @@ -1299,6 +1395,7 @@ out: int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_logoff_req *req; /* response is also trivial struct */ int rc = 0; struct TCP_Server_Info *server; @@ -1336,7 +1433,11 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, 
iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); /* * No tcon so can't do @@ -1366,6 +1467,7 @@ int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, const struct nls_table *cp) { + struct smb_rqst rqst; struct smb2_tree_connect_req *req; struct smb2_tree_connect_rsp *rsp = NULL; struct kvec iov[2]; @@ -1403,7 +1505,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, return rc; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; @@ -1419,10 +1521,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ if ((ses->server->dialect == SMB311_PROT_ID) && - !encryption_required(tcon)) + !smb3_encryption_required(tcon)) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1457,7 +1563,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->hdr.sync_hdr.TreeId; + tcon->tid = rsp->sync_hdr.TreeId; strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && @@ -1477,7 +1583,7 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp && rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { 
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); } goto tcon_exit; @@ -1486,6 +1592,7 @@ tcon_error_exit: int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) { + struct smb_rqst rqst; struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; @@ -1508,7 +1615,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; flags |= CIFS_NO_RESP; @@ -1516,7 +1623,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1575,7 +1686,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) static __u8 parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch) + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -1583,14 +1694,15 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, unsigned int remaining; char *name; - data_offset = (char *)rsp + server->vals->header_preamble_size + le32_to_cpu(rsp->CreateContextsOffset); + data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; if (le16_to_cpu(cc->NameLength) == 4 && strncmp(name, "RqLs", 4) == 0) - return server->ops->parse_lease_buf(cc, epoch); + return server->ops->parse_lease_buf(cc, epoch, + lease_key); next = le32_to_cpu(cc->Next); if 
(!next) @@ -1604,12 +1716,12 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, static int add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, - unsigned int *num_iovec, __u8 *oplock) + unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) { struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; @@ -1808,17 +1920,171 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, return 0; } +#ifdef CONFIG_CIFS_SMB311 +int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb) +{ + struct smb_rqst rqst; + struct smb2_create_req *req; + struct smb2_create_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[3]; /* make sure at least one for each open context */ + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype; + int uni_path_len; + __le16 *copy_path = NULL; + int copy_size; + int rc = 0; + unsigned int n_iov = 2; + __u32 file_attributes = 0; + char *pc_buf = NULL; + int flags = 0; + unsigned int total_len; + __le16 *path = cifs_convert_path_to_utf16(full_path, cifs_sb); + + if (!path) + return -ENOMEM; + + cifs_dbg(FYI, "mkdir\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + + if (rc) + return rc; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + + req->ImpersonationLevel = IL_IMPERSONATION; + req->DesiredAccess = cpu_to_le32(FILE_WRITE_ATTRIBUTES); + /* File attributes ignored on open (used in create though) */ + req->FileAttributes = 
cpu_to_le32(file_attributes); + req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(FILE_CREATE); + req->CreateOptions = cpu_to_le32(CREATE_NOT_FILE); + + iov[0].iov_base = (char *)req; + /* -1 since last byte is buf[0] which is sent below (path) */ + iov[0].iov_len = total_len - 1; + + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); + + /* [MS-SMB2] 2.2.13 NameOffset: + * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of + * the SMB2 header, the file name includes a prefix that will + * be processed during DFS name normalization as specified in + * section 3.3.5.9. Otherwise, the file name is relative to + * the share that is identified by the TreeId in the SMB2 + * header. + */ + if (tcon->share_flags & SHI1005_FLAGS_DFS) { + int name_len; + + req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + rc = alloc_path_with_tree_prefix(&copy_path, &copy_size, + &name_len, + tcon->treeName, path); + if (rc) { + cifs_small_buf_release(req); + return rc; + } + req->NameLength = cpu_to_le16(name_len * 2); + uni_path_len = copy_size; + path = copy_path; + } else { + uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; + /* MUST set path len (NameLength) to 0 opening root of share */ + req->NameLength = cpu_to_le16(uni_path_len - 2); + if (uni_path_len % 8 != 0) { + copy_size = roundup(uni_path_len, 8); + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) { + cifs_small_buf_release(req); + return -ENOMEM; + } + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } + } + + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE; + + if (tcon->posix_extensions) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + rc = add_posix_context(iov, &n_iov, mode); + if (rc) { +
cifs_small_buf_release(req); + kfree(copy_path); + return rc; + } + pc_buf = iov[n_iov-1].iov_base; + } + + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + &rsp_iov); + + cifs_small_buf_release(req); + rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + trace_smb3_posix_mkdir_err(xid, tcon->tid, ses->Suid, + CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES, rc); + goto smb311_mkdir_exit; + } else + trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); + + SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + + /* Eventually save off posix specific response info and timestaps */ + +smb311_mkdir_exit: + kfree(copy_path); + kfree(pc_buf); + free_rsp_buf(resp_buftype, rsp); + return rc; + +} +#endif /* SMB311 */ + int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct kvec *err_iov) + struct kvec *err_iov, int *buftype) { + struct smb_rqst rqst; struct smb2_create_req *req; struct smb2_create_rsp *rsp; struct TCP_Server_Info *server; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct kvec iov[4]; + struct kvec iov[5]; /* make sure at least one for each open context */ struct kvec rsp_iov = {NULL, 0}; int resp_buftype; int uni_path_len; @@ -1827,7 +2093,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, int rc = 0; unsigned int n_iov = 2; __u32 file_attributes = 0; - char *dhc_buf = NULL, *lc_buf = NULL; + char *dhc_buf = NULL, *lc_buf = NULL, *pc_buf = NULL; int flags = 0; unsigned int total_len; @@ -1843,7 +2109,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) return rc; - if (encryption_required(tcon)) + if 
(smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; if (oparms->create_options & CREATE_OPTION_READONLY) @@ -1915,7 +2181,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, *oplock == SMB2_OPLOCK_LEVEL_NONE) req->RequestedOplockLevel = *oplock; else { - rc = add_lease_context(server, iov, &n_iov, oplock); + rc = add_lease_context(server, iov, &n_iov, + oparms->fid->lease_key, oplock); if (rc) { cifs_small_buf_release(req); kfree(copy_path); @@ -1944,7 +2211,32 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, dhc_buf = iov[n_iov-1].iov_base; } - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, +#ifdef CONFIG_CIFS_SMB311 + if (tcon->posix_extensions) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + rc = add_posix_context(iov, &n_iov, oparms->mode); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + kfree(lc_buf); + kfree(dhc_buf); + return rc; + } + pc_buf = iov[n_iov-1].iov_base; + } +#endif /* SMB311 */ + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -1953,11 +2245,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); if (err_iov && rsp) { *err_iov = rsp_iov; + *buftype = resp_buftype; resp_buftype = CIFS_NO_BUFFER; rsp = NULL; } + trace_smb3_open_err(xid, tcon->tid, ses->Suid, + oparms->create_options, oparms->desired_access, rc); goto creat_exit; - } + } else + trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, + ses->Suid, oparms->create_options, + oparms->desired_access); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = 
rsp->VolatileFileId; @@ -1972,13 +2270,15 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch); + *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: kfree(copy_path); kfree(lc_buf); kfree(dhc_buf); + kfree(pc_buf); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -1992,9 +2292,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */) { + struct smb_rqst rqst; struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; - struct smb2_sync_hdr *shdr; struct cifs_ses *ses; struct kvec iov[2]; struct kvec rsp_iov; @@ -2025,7 +2325,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->CtlCode = cpu_to_le32(opcode); @@ -2083,11 +2383,19 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; + if (rc != 0) + trace_smb3_fsctl_err(xid, persistent_fid, tcon->tid, + ses->Suid, 0, opcode, rc); + if ((rc != 0) && (rc != -EINVAL)) { cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; @@ -2115,7 +2423,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) { + if 
(rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) { cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, le32_to_cpu(rsp->OutputOffset)); *plen = 0; @@ -2129,8 +2437,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - shdr = get_sync_hdr(rsp); - memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen); + memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -2162,9 +2469,10 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } int -SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid) +SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int flags) { + struct smb_rqst rqst; struct smb2_close_req *req; struct smb2_close_rsp *rsp; struct cifs_ses *ses = tcon->ses; @@ -2172,7 +2480,6 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int resp_buftype; int rc = 0; - int flags = 0; unsigned int total_len; cifs_dbg(FYI, "Close\n"); @@ -2184,7 +2491,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->PersistentFileId = persistent_fid; @@ -2193,12 +2500,18 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid, + rc); goto close_exit; } @@ 
-2209,14 +2522,20 @@ close_exit: return rc; } +int +SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); +} + static int -validate_iov(struct TCP_Server_Info *server, - unsigned int offset, unsigned int buffer_length, +validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size) { unsigned int smb_len = iov->iov_len; - char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base; - char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base; + char *end_of_smb = smb_len + (char *)iov->iov_base; + char *begin_of_buf = offset + (char *)iov->iov_base; char *end_of_buf = begin_of_buf + buffer_length; @@ -2246,18 +2565,17 @@ validate_iov(struct TCP_Server_Info *server, * Caller must free buffer. */ static int -validate_and_copy_iov(struct TCP_Server_Info *server, - unsigned int offset, unsigned int buffer_length, +validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int minbufsize, char *data) { - char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base); + char *begin_of_buf = offset + (char *)iov->iov_base; int rc; if (!data) return -EINVAL; - rc = validate_iov(server, offset, buffer_length, iov, minbufsize); + rc = validate_iov(offset, buffer_length, iov, minbufsize); if (rc) return rc; @@ -2272,6 +2590,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, u32 additional_info, size_t output_len, size_t min_len, void **data, u32 *dlen) { + struct smb_rqst rqst; struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov[2]; @@ -2292,7 +2611,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->InfoType = 
info_type; @@ -2312,12 +2631,18 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); + trace_smb3_query_info_err(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type, rc); goto qinf_exit; } @@ -2335,8 +2660,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_iov(ses->server, - le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, min_len, *data); @@ -2377,8 +2701,7 @@ SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, return query_info(xid, tcon, persistent_fid, volatile_fid, 0, SMB2_O_INFO_SECURITY, additional_info, - SMB2_MAX_BUFFER_SIZE, - sizeof(struct smb2_file_all_info), data, plen); + SMB2_MAX_BUFFER_SIZE, MIN_SEC_DESC_LEN, data, plen); } int @@ -2407,7 +2730,7 @@ smb2_echo_callback(struct mid_q_entry *mid) unsigned int credits_received = 1; if (mid->mid_state == MID_RESPONSE_RECEIVED) - credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); add_credits(server, credits_received, CIFS_ECHO_OP); @@ -2479,11 +2802,10 @@ SMB2_echo(struct TCP_Server_Info *server) { struct smb2_echo_req *req; int rc = 0; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; unsigned int total_len; - __be32 rfc1002_marker; cifs_dbg(FYI, "In echo request\n"); @@ -2499,11 +2821,8 @@ SMB2_echo(struct TCP_Server_Info *server) 
req->sync_hdr.CreditRequest = cpu_to_le16(1); - iov[0].iov_len = 4; - rfc1002_marker = cpu_to_be32(total_len); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len; + iov[0].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, server, CIFS_ECHO_OP); @@ -2518,6 +2837,7 @@ int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { + struct smb_rqst rqst; struct smb2_flush_req *req; struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; @@ -2536,7 +2856,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->PersistentFileId = persistent_fid; @@ -2545,11 +2865,18 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc != 0) + if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); + trace_smb3_flush_err(xid, persistent_fid, tcon->tid, ses->Suid, + rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc; @@ -2603,8 +2930,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, rdata->mr = smbd_register_mr( server->smbd_conn, rdata->pages, - rdata->nr_pages, rdata->tailsz, - true, need_invalidate); + rdata->nr_pages, rdata->page_offset, + rdata->tailsz, true, need_invalidate); if (!rdata->mr) return -ENOBUFS; @@ -2658,11 +2985,12 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct 
smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rdata->iov[1].iov_base; + (struct smb2_sync_hdr *)rdata->iov[0].iov_base; unsigned int credits_received = 1; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, + .rq_offset = rdata->page_offset, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; @@ -2729,10 +3057,9 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb2_sync_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; struct TCP_Server_Info *server; unsigned int total_len; - __be32 req_len; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -2760,15 +3087,11 @@ smb2_async_readv(struct cifs_readdata *rdata) return rc; } - if (encryption_required(io_parms.tcon)) + if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; - req_len = cpu_to_be32(total_len); - - rdata->iov[0].iov_base = &req_len; - rdata->iov[0].iov_len = sizeof(__be32); - rdata->iov[1].iov_base = buf; - rdata->iov[1].iov_len = total_len; + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = total_len; shdr = (struct smb2_sync_hdr *)buf; @@ -2791,7 +3114,13 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - } + trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length); + } else + trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length); cifs_small_buf_release(buf); return rc; @@ -2801,10 +3130,10 @@ int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type) { + struct smb_rqst rqst; int resp_buftype, rc = -EACCES; struct smb2_read_plain_req *req = NULL; 
struct smb2_read_rsp *rsp = NULL; - struct smb2_sync_hdr *shdr; struct kvec iov[1]; struct kvec rsp_iov; unsigned int total_len; @@ -2816,13 +3145,17 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) return rc; - if (encryption_required(io_parms->tcon)) + if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; @@ -2832,9 +3165,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); } + trace_smb3_read_err(rc, xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 
0 : rc; - } + } else + trace_smb3_read_done(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length); *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || @@ -2845,10 +3184,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; } - shdr = get_sync_hdr(rsp); - if (*buf) { - memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); + memcpy(*buf, (char *)rsp + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { *buf = rsp_iov.iov_base; @@ -2875,7 +3212,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); if (wdata->result != 0) break; @@ -2934,10 +3271,9 @@ smb2_async_writev(struct cifs_writedata *wdata, struct smb2_sync_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; - __be32 rfc1002_marker; rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); if (rc) { @@ -2952,7 +3288,7 @@ smb2_async_writev(struct cifs_writedata *wdata, goto async_writev_out; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; shdr = (struct smb2_sync_hdr *)req; @@ -2980,16 +3316,22 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->mr = smbd_register_mr( server->smbd_conn, wdata->pages, - wdata->nr_pages, wdata->tailsz, - false, need_invalidate); + wdata->nr_pages, wdata->page_offset, + wdata->tailsz, false, need_invalidate); if (!wdata->mr) { rc = -ENOBUFS; goto async_writev_out; } req->Length = 0; req->DataOffset = 0; - req->RemainingBytes = - 
cpu_to_le32((wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz); + if (wdata->nr_pages > 1) + req->RemainingBytes = + cpu_to_le32( + (wdata->nr_pages - 1) * wdata->pagesz - + wdata->page_offset + wdata->tailsz + ); + else + req->RemainingBytes = cpu_to_le32(wdata->tailsz); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -3003,22 +3345,19 @@ smb2_async_writev(struct cifs_writedata *wdata, v1->length = cpu_to_le32(wdata->mr->mr->length); } #endif - /* 4 for rfc1002 length field and 1 for Buffer */ - iov[0].iov_len = 4; - rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len - 1; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len - 1; + iov[0].iov_base = (char *)req; rqst.rq_iov = iov; - rqst.rq_nvec = 2; + rqst.rq_nvec = 1; rqst.rq_pages = wdata->pages; + rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) { - iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1); + iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); rqst.rq_npages = 0; } #endif @@ -3050,9 +3389,15 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata, flags); if (rc) { + trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } + } else + trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes); async_writev_out: cifs_small_buf_release(req); @@ -3069,6 +3414,7 @@ int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec) { + struct smb_rqst rqst; int rc = 0; struct smb2_write_req *req = NULL; struct smb2_write_rsp *rsp = NULL; @@ -3090,7 +3436,7 @@ 
SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (io_parms->tcon->ses->server == NULL) return -ECONNABORTED; - if (encryption_required(io_parms->tcon)) + if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); @@ -3110,16 +3456,29 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { + trace_smb3_write_err(xid, req->PersistentFileId, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length, rc); cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); cifs_dbg(VFS, "Send error in write = %d\n", rc); - } else + } else { *nbytes = le32_to_cpu(rsp->DataLength); + trace_smb3_write_done(xid, req->PersistentFileId, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, *nbytes); + } free_rsp_buf(resp_buftype, rsp); return rc; @@ -3173,6 +3532,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf) { + struct smb_rqst rqst; struct smb2_query_directory_req *req; struct smb2_query_directory_rsp *rsp = NULL; struct kvec iov[2]; @@ -3200,7 +3560,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; switch (srch_inf->info_level) { @@ -3245,13 +3605,17 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - rc = smb2_send_recv(xid, 
ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; if (rc) { if (rc == -ENODATA && - rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) { + rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; } @@ -3259,8 +3623,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, goto qdir_exit; } - rc = validate_iov(server, - le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); if (rc) @@ -3275,10 +3638,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(srch_inf->ntwrk_buf_start); } srch_inf->ntwrk_buf_start = (char *)rsp; - srch_inf->srch_entries_start = srch_inf->last_entry = 4 /* rfclen */ + - (char *)&rsp->hdr + le16_to_cpu(rsp->OutputBufferOffset); - /* 4 for rfc1002 length field */ - end_of_smb = get_rfc1002_length(rsp) + 4 + (char *)&rsp->hdr; + srch_inf->srch_entries_start = srch_inf->last_entry = + (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset); + end_of_smb = rsp_iov.iov_len + (char *)rsp; srch_inf->entries_in_buffer = num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); @@ -3306,6 +3668,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, u8 info_type, u32 additional_info, unsigned int num, void **data, unsigned int *size) { + struct smb_rqst rqst; struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp = NULL; struct kvec *iov; @@ -3323,7 +3686,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, if (!num) return -EINVAL; - iov = kmalloc(sizeof(struct kvec) * num, GFP_KERNEL); + iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL); if (!iov) return 
-ENOMEM; @@ -3333,7 +3696,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); @@ -3361,13 +3724,20 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, iov[i].iov_len = size[i]; } - rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = num; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); + cifs_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; - if (rc != 0) + if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); + trace_smb3_set_info_err(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type, rc); + } free_rsp_buf(resp_buftype, rsp); kfree(iov); @@ -3384,7 +3754,7 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, int rc; int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); if (!data) return -ENOMEM; @@ -3432,7 +3802,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, int rc; int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - data = kmalloc(sizeof(void *) * 2, GFP_KERNEL); + data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); if (!data) return -ENOMEM; @@ -3513,8 +3883,9 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, __u8 oplock_level) { + struct smb_rqst rqst; int rc; - struct smb2_oplock_break_req *req = NULL; + struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; unsigned int total_len; @@ -3528,7 +3899,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if 
(encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->VolatileFid = volatile_fid; @@ -3541,7 +3912,11 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3593,7 +3968,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, req->InputBufferOffset = cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - server->vals->header_preamble_size); + outbuf_len + sizeof(struct smb2_query_info_rsp) - 1); iov->iov_base = (char *)req; iov->iov_len = total_len; @@ -3604,13 +3979,13 @@ int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; @@ -3620,10 +3995,14 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3631,10 +4010,9 @@ SMB2_QFS_info(const unsigned int xid, struct 
cifs_tcon *tcon, } rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size + - le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); - rc = validate_iov(server, - le16_to_cpu(rsp->OutputBufferOffset), + info = (struct smb2_fs_full_size_info *)( + le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); + rc = validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, sizeof(struct smb2_fs_full_size_info)); if (!rc) @@ -3649,13 +4027,13 @@ int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; unsigned int rsp_len, offset; int flags = 0; @@ -3678,10 +4056,14 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3691,20 +4073,20 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rsp_len = le32_to_cpu(rsp->OutputBufferLength); offset = le16_to_cpu(rsp->OutputBufferOffset); - rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len); + rc = validate_iov(offset, rsp_len, &rsp_iov, min_len); if (rc) goto qfsattr_exit; if (level == FS_ATTRIBUTE_INFORMATION) - memcpy(&tcon->fsAttrInfo, server->vals->header_preamble_size + offset - + (char *)&rsp->hdr, min_t(unsigned int, + 
memcpy(&tcon->fsAttrInfo, offset + + (char *)rsp, min_t(unsigned int, rsp_len, max_len)); else if (level == FS_DEVICE_INFORMATION) - memcpy(&tcon->fsDevInfo, server->vals->header_preamble_size + offset - + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO)); + memcpy(&tcon->fsDevInfo, offset + + (char *)rsp, sizeof(FILE_SYSTEM_DEVICE_INFO)); else if (level == FS_SECTOR_SIZE_INFORMATION) { struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *) - (server->vals->header_preamble_size + offset + (char *)&rsp->hdr); + (offset + (char *)rsp); tcon->ss_flags = le32_to_cpu(ss_info->Flags); tcon->perf_sector_size = le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf); @@ -3720,6 +4102,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, const __u32 num_lock, struct smb2_lock_element *buf) { + struct smb_rqst rqst; int rc = 0; struct smb2_lock_req *req = NULL; struct kvec iov[2]; @@ -3735,7 +4118,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); @@ -3752,12 +4135,19 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, tcon->ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); + trace_smb3_lock_err(xid, persist_fid, tcon->tid, + tcon->ses->Suid, rc); } return rc; @@ -3784,6 +4174,7 @@ int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state) { + struct smb_rqst rqst; int rc; 
struct smb2_lease_ack *req = NULL; struct cifs_ses *ses = tcon->ses; @@ -3799,7 +4190,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (encryption_required(tcon)) + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.CreditRequest = cpu_to_le16(1); @@ -3814,7 +4205,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index d28f358022c5..a671adcc44a6 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -122,25 +122,10 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -struct smb2_hdr { - __be32 smb2_buf_length; /* big endian on wire */ - /* length is only two or three bytes - with */ - /* one or two byte type preceding it that MBZ */ - struct smb2_sync_hdr sync_hdr; -} __packed; - -struct smb2_pdu { - struct smb2_hdr hdr; - __le16 StructureSize2; /* size of wct area (varies, request specific) */ -} __packed; - #define SMB3_AES128CMM_NONCE 11 #define SMB3_AES128GCM_NONCE 12 struct smb2_transform_hdr { - __be32 smb2_buf_length; /* big endian on wire */ - /* length is only two or three bytes - with - one or two byte type preceding it that MBZ */ __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; __u8 Nonce[16]; @@ -171,7 +156,7 @@ struct smb2_transform_hdr { #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; __le16 Reserved; /* MBZ */ __le32 ByteCount; /* even if zero, at least one byte follows */ @@ -300,8 +285,16 @@ struct 
smb2_encryption_neg_context { __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; +#define POSIX_CTXT_DATA_LEN 8 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __le64 Reserved1; /* In case needed for future (eg version or caps) */ +} __packed; + struct smb2_negotiate_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; @@ -341,7 +334,7 @@ struct smb2_sess_setup_req { #define SMB2_SESSION_FLAG_IS_NULL 0x0002 #define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 struct smb2_sess_setup_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 SessionFlags; __le16 SecurityBufferOffset; @@ -356,7 +349,7 @@ struct smb2_logoff_req { } __packed; struct smb2_logoff_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -452,7 +445,7 @@ struct smb2_tree_connect_req_extension { } __packed; struct smb2_tree_connect_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 16 */ __u8 ShareType; /* see below */ __u8 Reserved; @@ -503,7 +496,7 @@ struct smb2_tree_disconnect_req { } __packed; struct smb2_tree_disconnect_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -615,7 +608,9 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" #define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 -#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 +#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 +#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C + struct smb2_create_req { struct smb2_sync_hdr sync_hdr; @@ -638,7 +633,7 @@ 
struct smb2_create_req { } __packed; struct smb2_create_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ __u8 OplockLevel; __u8 Reserved; @@ -683,16 +678,14 @@ struct create_context { #define SMB2_LEASE_KEY_SIZE 16 struct lease_context { - __le64 LeaseKeyLow; - __le64 LeaseKeyHigh; + u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; __le32 LeaseFlags; __le64 LeaseDuration; } __packed; struct lease_context_v2 { - __le64 LeaseKeyLow; - __le64 LeaseKeyHigh; + u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; __le32 LeaseFlags; __le64 LeaseDuration; @@ -727,6 +720,13 @@ struct create_durable { } Data; } __packed; +struct create_posix { + struct create_context ccontext; + __u8 Name[16]; + __le32 Mode; + __u32 Reserved; +} __packed; + /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ #define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002 @@ -849,8 +849,11 @@ struct validate_negotiate_info_rsp { __le16 Dialect; /* Dialect in use for the connection */ } __packed; -#define RSS_CAPABLE 0x00000001 -#define RDMA_CAPABLE 0x00000002 +#define RSS_CAPABLE cpu_to_le32(0x00000001) +#define RDMA_CAPABLE cpu_to_le32(0x00000002) + +#define INTERNETWORK cpu_to_le16(0x0002) +#define INTERNETWORKV6 cpu_to_le16(0x0017) struct network_interface_info_ioctl_rsp { __le32 Next; /* next interface. 
zero if this is last one */ @@ -858,7 +861,21 @@ struct network_interface_info_ioctl_rsp { __le32 Capability; /* RSS or RDMA Capable */ __le32 Reserved; __le64 LinkSpeed; - char SockAddr_Storage[128]; + __le16 Family; + __u8 Buffer[126]; +} __packed; + +struct iface_info_ipv4 { + __be16 Port; + __be32 IPv4Address; + __be64 Reserved; +} __packed; + +struct iface_info_ipv6 { + __be16 Port; + __be32 FlowInfo; + __u8 IPv6Address[16]; + __be32 ScopeId; } __packed; #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ @@ -894,7 +911,7 @@ struct smb2_ioctl_req { } __packed; struct smb2_ioctl_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -921,7 +938,7 @@ struct smb2_close_req { } __packed; struct smb2_close_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* 60 */ __le16 Flags; __le32 Reserved; @@ -944,7 +961,7 @@ struct smb2_flush_req { } __packed; struct smb2_flush_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; __le16 Reserved; } __packed; @@ -976,7 +993,7 @@ struct smb2_read_plain_req { } __packed; struct smb2_read_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1007,7 +1024,7 @@ struct smb2_write_req { } __packed; struct smb2_write_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1041,7 +1058,7 @@ struct smb2_lock_req { } __packed; struct smb2_lock_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -1053,7 +1070,7 @@ struct smb2_echo_req { } __packed; struct smb2_echo_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -1079,7 +1096,7 @@ struct 
smb2_query_directory_req { } __packed; struct smb2_query_directory_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1128,7 +1145,7 @@ struct smb2_query_info_req { } __packed; struct smb2_query_info_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1150,12 +1167,11 @@ struct smb2_set_info_req { } __packed; struct smb2_set_info_rsp { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 2 */ } __packed; -/* oplock break without an rfc1002 header */ -struct smb2_oplock_break_req { +struct smb2_oplock_break { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; @@ -1165,21 +1181,10 @@ struct smb2_oplock_break_req { __u64 VolatileFid; } __packed; -/* oplock break with an rfc1002 header */ -struct smb2_oplock_break_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __u64 PersistentFid; - __u64 VolatileFid; -} __packed; - #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) struct smb2_lease_break { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 44 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 8ba24a95db71..6e6a4f2ec890 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -36,8 +36,9 @@ struct smb_rqst; extern int map_smb2_to_linux_error(char *buf, bool log_err); extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); -extern unsigned int smb2_calc_size(void *buf); -extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); +extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); +extern char *smb2_get_data_area_len(int 
*off, int *len, + struct smb2_sync_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); @@ -65,6 +66,8 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); +extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_fid *pfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -76,6 +79,10 @@ extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, bool set_alloc); extern int smb2_set_file_info(struct inode *inode, const char *full_path, FILE_BASIC_INFO *buf, const unsigned int xid); +extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb); extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, @@ -106,6 +113,8 @@ extern int smb2_unlock_range(struct cifsFileInfo *cfile, extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); +extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, + struct smb_rqst *rqst); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -122,13 +131,15 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct kvec *err_iov); + struct kvec *err_iov, int *resp_buftype); extern int SMB2_ioctl(const unsigned int 
xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int flags); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 8806f3f76c1d..719d55e63d88 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -171,8 +171,10 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; + struct shash_desc *shash = &server->secmech.sdeschmacsha256->shash; + struct smb_rqst drqst; ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { @@ -190,21 +192,39 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } rc = crypto_shash_setkey(server->secmech.hmacsha256, - ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(shash); if (rc) { cifs_dbg(VFS, "%s: Could not init sha256", __func__); return rc; } - rc = __cifs_calc_signature(rqst, server, sigptr, - &server->secmech.sdeschmacsha256->shash); + /* + * For SMB2+, __cifs_calc_signature() expects to sign 
only the actual + * data, that is, iov[0] should not contain a rfc1002 length. + * + * Sign the rfc1002 length prior to passing the data (iov[1-N]) down to + * __cifs_calc_signature(). + */ + drqst = *rqst; + if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { + rc = crypto_shash_update(shash, iov[0].iov_base, + iov[0].iov_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + drqst.rq_iov++; + drqst.rq_nvec--; + } + rc = __cifs_calc_signature(&drqst, server, sigptr, shash); if (!rc) memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); @@ -408,12 +428,14 @@ generate_smb311signingkey(struct cifs_ses *ses) int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int rc = 0; + int rc; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; + struct shash_desc *shash = &server->secmech.sdesccmacaes->shash; + struct smb_rqst drqst; ses = smb2_find_smb_ses(server, shdr->SessionId); if (!ses) { @@ -425,8 +447,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - ses->smb3signingkey, SMB2_CMACAES_SIZE); - + ses->smb3signingkey, SMB2_CMACAES_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -437,15 +458,33 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) * so unlike smb2 case we do not have to check here if secmech are * initialized */ - rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); + rc = crypto_shash_init(shash); if (rc) { cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); return rc; } - rc = __cifs_calc_signature(rqst, server, sigptr, - 
&server->secmech.sdesccmacaes->shash); + /* + * For SMB2+, __cifs_calc_signature() expects to sign only the actual + * data, that is, iov[0] should not contain a rfc1002 length. + * + * Sign the rfc1002 length prior to passing the data (iov[1-N]) down to + * __cifs_calc_signature(). + */ + drqst = *rqst; + if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { + rc = crypto_shash_update(shash, iov[0].iov_base, + iov[0].iov_len); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); + return rc; + } + drqst.rq_iov++; + drqst.rq_nvec--; + } + rc = __cifs_calc_signature(&drqst, server, sigptr, shash); if (!rc) memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); @@ -458,7 +497,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; if (!(shdr->Flags & SMB2_FLAGS_SIGNED) || server->tcpStatus == CifsNeedNegotiate) @@ -480,7 +519,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned int rc; char server_response_sig[16]; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || @@ -548,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); + kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ @@ -605,14 +645,12 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { unsigned int len = mid->resp_buf_size; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; iov[0].iov_base = (char *)mid->resp_buf; - 
iov[0].iov_len = 4; - iov[1].iov_base = (char *)mid->resp_buf + 4; - iov[1].iov_len = len; + iov[0].iov_len = len; dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ @@ -633,7 +671,7 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(ses->server, shdr); @@ -654,7 +692,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index c62f7c95683c..c55ea4e6201b 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -17,6 +17,8 @@ #include <linux/highmem.h> #include "smbdirect.h" #include "cifs_debug.h" +#include "cifsproto.h" +#include "smb2proto.h" static struct smbd_response *get_empty_queue_buffer( struct smbd_connection *info); @@ -2003,10 +2005,12 @@ read_rfc1002_done: * return value: actual data read */ static int smbd_recv_page(struct smbd_connection *info, - struct page *page, unsigned int to_read) + struct page *page, unsigned int page_offset, + unsigned int to_read) { int ret; char *to_address; + void *page_address; /* make sure we have the page ready for read */ ret = wait_event_interruptible( @@ -2014,16 +2018,17 @@ static int smbd_recv_page(struct smbd_connection *info, info->reassembly_data_length >= to_read || info->transport_status != SMBD_CONNECTED); if (ret) - return 0; + return ret; /* now we can read from reassembly queue and not sleep */ - to_address = kmap_atomic(page); + page_address = kmap_atomic(page); + to_address = (char *) page_address + page_offset; log_read(INFO, "reading from page=%p address=%p 
to_read=%d\n", page, to_address, to_read); ret = smbd_recv_buf(info, to_address, to_read); - kunmap_atomic(to_address); + kunmap_atomic(page_address); return ret; } @@ -2037,7 +2042,7 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) { char *buf; struct page *page; - unsigned int to_read; + unsigned int to_read, page_offset; int rc; info->smbd_recv_pending++; @@ -2051,15 +2056,16 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) case READ | ITER_BVEC: page = msg->msg_iter.bvec->bv_page; + page_offset = msg->msg_iter.bvec->bv_offset; to_read = msg->msg_iter.bvec->bv_len; - rc = smbd_recv_page(info, page, to_read); + rc = smbd_recv_page(info, page, page_offset, to_read); break; default: /* It's a bug in upper layer to get there */ cifs_dbg(VFS, "CIFS: invalid msg type %d\n", msg->msg_iter.type); - rc = -EIO; + rc = -EINVAL; } info->smbd_recv_pending--; @@ -2077,12 +2083,13 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) * rqst: the data to write * return value: 0 if successfully write, otherwise error code */ -int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) +int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) { + struct smbd_connection *info = server->smbd_conn; struct kvec vec; int nvecs; int size; - int buflen = 0, remaining_data_length; + unsigned int buflen, remaining_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -2106,18 +2113,13 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); return -EINVAL; } - iov = &rqst->rq_iov[1]; - - /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec-1; i++) { - buflen += iov[i].iov_len; - } - /* add in the page array if there is one */ - if (rqst->rq_npages) { - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); - buflen += rqst->rq_tailsz; - } + /* + * Add in 
the page array if there is one. The caller needs to set + * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and + * ends at page boundary + */ + buflen = smb_rqst_len(server, rqst); if (buflen + sizeof(struct smbd_data_transfer) > info->max_fragmented_send_size) { @@ -2127,6 +2129,8 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + iov = &rqst->rq_iov[1]; + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); for (i = 0; i < rqst->rq_nvec-1; i++) dump_smb(iov[i].iov_base, iov[i].iov_len); @@ -2213,8 +2217,9 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) /* now sending pages if there are any */ for (i = 0; i < rqst->rq_npages; i++) { - buflen = (i == rqst->rq_npages-1) ? - rqst->rq_tailsz : rqst->rq_pagesz; + unsigned int offset; + + rqst_page_get_length(rqst, i, &buflen, &offset); nvecs = (buflen + max_iov_size - 1) / max_iov_size; log_write(INFO, "sending pages buflen=%d nvecs=%d\n", buflen, nvecs); @@ -2225,9 +2230,11 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) remaining_data_length -= size; log_write(INFO, "sending pages i=%d offset=%d size=%d" " remaining_data_length=%d\n", - i, j*max_iov_size, size, remaining_data_length); + i, j*max_iov_size+offset, size, + remaining_data_length); rc = smbd_post_send_page( - info, rqst->rq_pages[i], j*max_iov_size, + info, rqst->rq_pages[i], + j*max_iov_size + offset, size, remaining_data_length); if (rc) goto done; @@ -2284,37 +2291,37 @@ static void smbd_mr_recovery_work(struct work_struct *work) if (smbdirect_mr->state == MR_INVALIDATED || smbdirect_mr->state == MR_ERROR) { - if (smbdirect_mr->state == MR_INVALIDATED) { + /* recover this MR entry */ + rc = ib_dereg_mr(smbdirect_mr->mr); + if (rc) { + log_rdma_mr(ERR, + "ib_dereg_mr failed rc=%x\n", + rc); + smbd_disconnect_rdma_connection(info); + continue; + } + + smbdirect_mr->mr = ib_alloc_mr( + info->pd, info->mr_type, + info->max_frmr_depth); + if 
(IS_ERR(smbdirect_mr->mr)) { + log_rdma_mr(ERR, + "ib_alloc_mr failed mr_type=%x " + "max_frmr_depth=%x\n", + info->mr_type, + info->max_frmr_depth); + smbd_disconnect_rdma_connection(info); + continue; + } + + if (smbdirect_mr->state == MR_INVALIDATED) ib_dma_unmap_sg( info->id->device, smbdirect_mr->sgl, smbdirect_mr->sgl_count, smbdirect_mr->dir); - smbdirect_mr->state = MR_READY; - } else if (smbdirect_mr->state == MR_ERROR) { - - /* recover this MR entry */ - rc = ib_dereg_mr(smbdirect_mr->mr); - if (rc) { - log_rdma_mr(ERR, - "ib_dereg_mr failed rc=%x\n", - rc); - smbd_disconnect_rdma_connection(info); - } - smbdirect_mr->mr = ib_alloc_mr( - info->pd, info->mr_type, - info->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, - "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); - smbd_disconnect_rdma_connection(info); - } + smbdirect_mr->state = MR_READY; - smbdirect_mr->state = MR_READY; - } /* smbdirect_mr->state is updated by this function * and is read and updated by I/O issuing CPUs trying * to get a MR, the call to atomic_inc_return @@ -2460,7 +2467,7 @@ again: */ struct smbd_mr *smbd_register_mr( struct smbd_connection *info, struct page *pages[], int num_pages, - int tailsz, bool writing, bool need_invalidate) + int offset, int tailsz, bool writing, bool need_invalidate) { struct smbd_mr *smbdirect_mr; int rc, i; @@ -2483,17 +2490,31 @@ struct smbd_mr *smbd_register_mr( smbdirect_mr->sgl_count = num_pages; sg_init_table(smbdirect_mr->sgl, num_pages); - for (i = 0; i < num_pages - 1; i++) - sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); + log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n", + num_pages, offset, tailsz); + if (num_pages == 1) { + sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset); + goto skip_multiple_pages; + } + + /* We have at least two pages to register */ + sg_set_page( + &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset); + 
i = 1; + while (i < num_pages - 1) { + sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); + i++; + } sg_set_page(&smbdirect_mr->sgl[i], pages[i], tailsz ? tailsz : PAGE_SIZE, 0); +skip_multiple_pages: dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; smbdirect_mr->dir = dir; rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); if (!rc) { - log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", + log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", num_pages, dir, rc); goto dma_map_error; } @@ -2501,8 +2522,8 @@ struct smbd_mr *smbd_register_mr( rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, NULL, PAGE_SIZE); if (rc != num_pages) { - log_rdma_mr(INFO, - "ib_map_mr_sg failed rc = %x num_pages = %x\n", + log_rdma_mr(ERR, + "ib_map_mr_sg failed rc = %d num_pages = %x\n", rc, num_pages); goto map_mr_error; } diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index f9038daea194..a11096254f29 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -292,7 +292,7 @@ void smbd_destroy(struct smbd_connection *info); /* Interface for carrying upper layer I/O through send/recv */ int smbd_recv(struct smbd_connection *info, struct msghdr *msg); -int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst); +int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst); enum mr_state { MR_READY, @@ -321,7 +321,7 @@ struct smbd_mr { /* Interfaces to register and deregister MR for RDMA read/write */ struct smbd_mr *smbd_register_mr( struct smbd_connection *info, struct page *pages[], int num_pages, - int tailsz, bool writing, bool need_invalidate); + int offset, int tailsz, bool writing, bool need_invalidate); int smbd_deregister_mr(struct smbd_mr *mr); #else @@ -332,7 +332,7 @@ static inline void *smbd_get_connection( static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; } static inline void smbd_destroy(struct smbd_connection *info) {} static inline int smbd_recv(struct 
smbd_connection *info, struct msghdr *msg) {return -1; } -static inline int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) {return -1; } +static inline int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) {return -1; } #endif #endif diff --git a/fs/cifs/trace.c b/fs/cifs/trace.c new file mode 100644 index 000000000000..bd4a546feec1 --- /dev/null +++ b/fs/cifs/trace.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h new file mode 100644 index 000000000000..67e413f6ee4d --- /dev/null +++ b/fs/cifs/trace.h @@ -0,0 +1,430 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cifs + +#if !defined(_CIFS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _CIFS_TRACE_H + +#include <linux/tracepoint.h> + +/* For logging errors in read or write */ +DECLARE_EVENT_CLASS(smb3_rw_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset, + __u32 len, + int rc), + TP_ARGS(xid, fid, tid, sesid, offset, len, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + __field(__u32, len) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + __entry->len = len; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->offset, __entry->len, __entry->rc) +) + +#define DEFINE_SMB3_RW_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset, \ + __u32 len, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, offset, len, rc)) + +DEFINE_SMB3_RW_ERR_EVENT(write_err); +DEFINE_SMB3_RW_ERR_EVENT(read_err); + + +/* For logging successful read or write */ +DECLARE_EVENT_CLASS(smb3_rw_done_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u64 offset, + __u32 len), + TP_ARGS(xid, fid, tid, sesid, offset, len), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, offset) + __field(__u32, len) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x", + __entry->xid, 
__entry->sesid, __entry->tid, __entry->fid, + __entry->offset, __entry->len) +) + +#define DEFINE_SMB3_RW_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 offset, \ + __u32 len), \ + TP_ARGS(xid, fid, tid, sesid, offset, len)) + +DEFINE_SMB3_RW_DONE_EVENT(write_done); +DEFINE_SMB3_RW_DONE_EVENT(read_done); + +/* + * For handle based calls other than read and write, and get/set info + */ +DECLARE_EVENT_CLASS(smb3_fd_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, fid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->rc) +) + +#define DEFINE_SMB3_FD_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_fd_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, rc)) + +DEFINE_SMB3_FD_ERR_EVENT(flush_err); +DEFINE_SMB3_FD_ERR_EVENT(lock_err); +DEFINE_SMB3_FD_ERR_EVENT(close_err); + +/* + * For handle based query/set info calls + */ +DECLARE_EVENT_CLASS(smb3_inf_err_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u8 infclass, + __u32 type, + int rc), + TP_ARGS(xid, fid, tid, sesid, infclass, type, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u8, infclass) + __field(__u32, type) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->infclass = infclass; + 
__entry->type = type; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->infclass, __entry->type, __entry->rc) +) + +#define DEFINE_SMB3_INF_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u8 infclass, \ + __u32 type, \ + int rc), \ + TP_ARGS(xid, fid, tid, sesid, infclass, type, rc)) + +DEFINE_SMB3_INF_ERR_EVENT(query_info_err); +DEFINE_SMB3_INF_ERR_EVENT(set_info_err); +DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); + +/* + * For logging SMB3 Status code and Command for responses which return errors + */ +DECLARE_EVENT_CLASS(smb3_cmd_err_class, + TP_PROTO(__u32 tid, + __u64 sesid, + __u16 cmd, + __u64 mid, + __u32 status, + int rc), + TP_ARGS(tid, sesid, cmd, mid, status, rc), + TP_STRUCT__entry( + __field(__u32, tid) + __field(__u64, sesid) + __field(__u16, cmd) + __field(__u64, mid) + __field(__u32, status) + __field(int, rc) + ), + TP_fast_assign( + __entry->tid = tid; + __entry->sesid = sesid; + __entry->cmd = cmd; + __entry->mid = mid; + __entry->status = status; + __entry->rc = rc; + ), + TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", + __entry->sesid, __entry->tid, __entry->cmd, __entry->mid, + __entry->status, __entry->rc) +) + +#define DEFINE_SMB3_CMD_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_cmd_err_class, smb3_##name, \ + TP_PROTO(__u32 tid, \ + __u64 sesid, \ + __u16 cmd, \ + __u64 mid, \ + __u32 status, \ + int rc), \ + TP_ARGS(tid, sesid, cmd, mid, status, rc)) + +DEFINE_SMB3_CMD_ERR_EVENT(cmd_err); + +DECLARE_EVENT_CLASS(smb3_cmd_done_class, + TP_PROTO(__u32 tid, + __u64 sesid, + __u16 cmd, + __u64 mid), + TP_ARGS(tid, sesid, cmd, mid), + TP_STRUCT__entry( + __field(__u32, tid) + __field(__u64, sesid) + __field(__u16, cmd) + __field(__u64, mid) + ), + TP_fast_assign( + __entry->tid = tid; + __entry->sesid = sesid; + 
__entry->cmd = cmd; + __entry->mid = mid; + ), + TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu", + __entry->sesid, __entry->tid, + __entry->cmd, __entry->mid) +) + +#define DEFINE_SMB3_CMD_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \ + TP_PROTO(__u32 tid, \ + __u64 sesid, \ + __u16 cmd, \ + __u64 mid), \ + TP_ARGS(tid, sesid, cmd, mid)) + +DEFINE_SMB3_CMD_DONE_EVENT(cmd_done); + +DECLARE_EVENT_CLASS(smb3_exit_err_class, + TP_PROTO(unsigned int xid, + const char *func_name, + int rc), + TP_ARGS(xid, func_name, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(const char *, func_name) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->func_name = func_name; + __entry->rc = rc; + ), + TP_printk("\t%s: xid=%u rc=%d", + __entry->func_name, __entry->xid, __entry->rc) +) + +#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_exit_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + const char *func_name, \ + int rc), \ + TP_ARGS(xid, func_name, rc)) + +DEFINE_SMB3_EXIT_ERR_EVENT(exit_err); + +DECLARE_EVENT_CLASS(smb3_enter_exit_class, + TP_PROTO(unsigned int xid, + const char *func_name), + TP_ARGS(xid, func_name), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(const char *, func_name) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->func_name = func_name; + ), + TP_printk("\t%s: xid=%u", + __entry->func_name, __entry->xid) +) + +#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ +DEFINE_EVENT(smb3_enter_exit_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + const char *func_name), \ + TP_ARGS(xid, func_name)) + +DEFINE_SMB3_ENTER_EXIT_EVENT(enter); +DEFINE_SMB3_ENTER_EXIT_EVENT(exit_done); + +/* + * For smb2/smb3 open call + */ +DECLARE_EVENT_CLASS(smb3_open_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access, + int rc), + TP_ARGS(xid, tid, sesid, create_options, desired_access, rc), + TP_STRUCT__entry( + 
__field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->create_options, __entry->desired_access, __entry->rc) +) + +#define DEFINE_SMB3_OPEN_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_open_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access, \ + int rc), \ + TP_ARGS(xid, tid, sesid, create_options, desired_access, rc)) + +DEFINE_SMB3_OPEN_ERR_EVENT(open_err); +DEFINE_SMB3_OPEN_ERR_EVENT(posix_mkdir_err); + +DECLARE_EVENT_CLASS(smb3_open_done_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access), + TP_ARGS(xid, fid, tid, sesid, create_options, desired_access), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx cr_opts=0x%x des_access=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->create_options, __entry->desired_access) +) + +#define DEFINE_SMB3_OPEN_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access), \ + TP_ARGS(xid, fid, tid, sesid, 
create_options, desired_access)) + +DEFINE_SMB3_OPEN_DONE_EVENT(open_done); +DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done); + +#endif /* _CIFS_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 927226a2122f..a341ec839c83 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -61,6 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); + kref_init(&temp->refcount); temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); @@ -82,6 +83,21 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; } +static void _cifs_mid_q_entry_release(struct kref *refcount) +{ + struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry, + refcount); + + mempool_free(mid, cifs_mid_poolp); +} + +void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) +{ + spin_lock(&GlobalMid_Lock); + kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); + spin_unlock(&GlobalMid_Lock); +} + void DeleteMidQEntry(struct mid_q_entry *midEntry) { @@ -110,7 +126,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) } } #endif - mempool_free(midEntry, cifs_mid_poolp); + cifs_mid_q_entry_release(midEntry); } void @@ -201,93 +217,133 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, return 0; } -static unsigned long -rqst_len(struct smb_rqst *rqst) +unsigned long +smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) { unsigned int i; - struct kvec *iov = rqst->rq_iov; + struct kvec *iov; + int nvec; unsigned long buflen = 0; + if (server->vals->header_preamble_size == 0 && + rqst->rq_nvec >= 2 && rqst->rq_iov[0].iov_len == 4) { + iov = &rqst->rq_iov[1]; + nvec = rqst->rq_nvec - 1; + } else { 
+ iov = rqst->rq_iov; + nvec = rqst->rq_nvec; + } + /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec; i++) + for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; - /* add in the page array if there is one */ + /* + * Add in the page array if there is one. The caller needs to make + * sure rq_offset and rq_tailsz are set correctly. If a buffer of + * multiple pages ends at page boundary, rq_tailsz needs to be set to + * PAGE_SIZE. + */ if (rqst->rq_npages) { - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); - buflen += rqst->rq_tailsz; + if (rqst->rq_npages == 1) + buflen += rqst->rq_tailsz; + else { + /* + * If there is more than one page, calculate the + * buffer length based on rq_offset and rq_tailsz + */ + buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - + rqst->rq_offset; + buflen += rqst->rq_tailsz; + } } return buflen; } static int -__smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +__smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + struct smb_rqst *rqst) { - int rc; - struct kvec *iov = rqst->rq_iov; - int n_vec = rqst->rq_nvec; - unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); - unsigned long send_length; - unsigned int i; + int rc = 0; + struct kvec *iov; + int n_vec; + unsigned int send_length = 0; + unsigned int i, j; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; int val = 1; + __be32 rfc1002_marker; + if (cifs_rdma_enabled(server) && server->smbd_conn) { - rc = smbd_send(server->smbd_conn, rqst); + rc = smbd_send(server, rqst); goto smbd_done; } if (ssocket == NULL) return -ENOTSOCK; - /* sanity check send length */ - send_length = rqst_len(rqst); - if (send_length != smb_buf_length + 4) { - WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n", - send_length, smb_buf_length); - return -EIO; - } - - if (n_vec < 2) - return -EIO; - - cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); - 
dump_smb(iov[0].iov_base, iov[0].iov_len); - dump_smb(iov[1].iov_base, iov[1].iov_len); - /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - size = 0; - for (i = 0; i < n_vec; i++) - size += iov[i].iov_len; + for (j = 0; j < num_rqst; j++) + send_length += smb_rqst_len(server, &rqst[j]); + rfc1002_marker = cpu_to_be32(send_length); - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size); - - rc = smb_send_kvec(server, &smb_msg, &sent); - if (rc < 0) - goto uncork; - - total_len += sent; - - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst->rq_npages; i++) { - size_t len = i == rqst->rq_npages - 1 - ? rqst->rq_tailsz - : rqst->rq_pagesz; - struct bio_vec bvec = { - .bv_page = rqst->rq_pages[i], - .bv_len = len + /* Generate a rfc1002 marker for SMB2+ */ + if (server->vals->header_preamble_size == 0) { + struct kvec hiov = { + .iov_base = &rfc1002_marker, + .iov_len = 4 }; - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, - &bvec, 1, len); + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, + 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - break; + goto uncork; total_len += sent; + send_length += 4; + } + + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); + + for (j = 0; j < num_rqst; j++) { + iov = rqst[j].rq_iov; + n_vec = rqst[j].rq_nvec; + + size = 0; + for (i = 0; i < n_vec; i++) { + dump_smb(iov[i].iov_base, iov[i].iov_len); + size += iov[i].iov_len; + } + + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, + iov, n_vec, size); + + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + goto uncork; + + total_len += sent; + + /* now walk the page array and send each page in it */ + for (i = 0; i < rqst[j].rq_npages; i++) { + struct bio_vec bvec; + + bvec.bv_page = rqst[j].rq_pages[i]; + rqst_page_get_length(&rqst[j], i, &bvec.bv_len, + &bvec.bv_offset); + + iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + &bvec, 1, 
bvec.bv_len); + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + break; + + total_len += sent; + } } uncork: @@ -296,9 +352,9 @@ uncork: kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", - smb_buf_length + 4, total_len); + send_length, total_len); /* * If we have only sent part of an SMB then the next SMB could * be taken as the remainder of this one. We need to kill the @@ -323,7 +379,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) int rc; if (!(flags & CIFS_TRANSFORM_REQ)) - return __smb_send_rqst(server, rqst); + return __smb_send_rqst(server, 1, rqst); if (!server->ops->init_transform_rq || !server->ops->free_transform_rq) { @@ -335,7 +391,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) if (rc) return rc; - rc = __smb_send_rqst(server, &cur_rqst); + rc = __smb_send_rqst(server, 1, &cur_rqst); server->ops->free_transform_rq(&cur_rqst); return rc; } @@ -353,7 +409,7 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, iov[1].iov_base = (char *)smb_buffer + 4; iov[1].iov_len = smb_buf_length; - return __smb_send_rqst(server, &rqst); + return __smb_send_rqst(server, 1, &rqst); } static int @@ -718,7 +774,6 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, * to the same server. We may make this configurable later or * use ses->maxReq. 
*/ - rc = wait_for_free_request(ses->server, timeout, optype); if (rc) return rc; @@ -754,8 +809,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) - smb311_update_preauth_hash(ses, rqst->rq_iov+1, - rqst->rq_nvec-1); + smb311_update_preauth_hash(ses, rqst->rq_iov, + rqst->rq_nvec); #endif if (timeout == CIFS_ASYNC_OP) @@ -800,8 +855,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { - .iov_base = buf + 4, - .iov_len = get_rfc1002_length(buf) + .iov_base = resp_iov->iov_base, + .iov_len = resp_iov->iov_len }; smb311_update_preauth_hash(ses, &iov, 1); } @@ -832,8 +887,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int rc; if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { - new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), - GFP_KERNEL); + new_iov = kmalloc_array(n_vec + 1, sizeof(struct kvec), + GFP_KERNEL); if (!new_iov) { /* otherwise cifs_send_recv below sets resp_buf_type */ *resp_buf_type = CIFS_NO_BUFFER; @@ -860,49 +915,6 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, return rc; } -/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */ -int -smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, - struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, - const int flags, struct kvec *resp_iov) -{ - struct smb_rqst rqst; - struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; - int rc; - int i; - __u32 count; - __be32 rfc1002_marker; - - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { - new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), - GFP_KERNEL); - if (!new_iov) - return -ENOMEM; - } else - new_iov = s_iov; - - /* 1st iov is an RFC1002 Session Message length */ - memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); - - count = 0; - for (i = 1; i < n_vec + 1; i++) - count += new_iov[i].iov_len; - - 
rfc1002_marker = cpu_to_be32(count); - - new_iov[0].iov_base = &rfc1002_marker; - new_iov[0].iov_len = 4; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = new_iov; - rqst.rq_nvec = n_vec + 1; - - rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) - kfree(new_iov); - return rc; -} - int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index ca599df0dcb1..f3d543dd9a98 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -105,11 +105,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = attr->va_atime; + inode->i_atime = timespec_to_timespec64(attr->va_atime); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = attr->va_mtime; + inode->i_mtime = timespec_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = attr->va_ctime; + inode->i_ctime = timespec_to_timespec64(attr->va_ctime); } @@ -175,13 +175,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_size = iattr->ia_size; } if ( valid & ATTR_ATIME ) { - vattr->va_atime = iattr->ia_atime; + vattr->va_atime = timespec64_to_timespec(iattr->ia_atime); } if ( valid & ATTR_MTIME ) { - vattr->va_mtime = iattr->ia_mtime; + vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime); } if ( valid & ATTR_CTIME ) { - vattr->va_ctime = iattr->ia_ctime; + vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime); } } diff --git a/fs/compat.c b/fs/compat.c index 190b38b39d9e..4a0aaaf53217 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -16,79 +16,12 @@ */ #include <linux/compat.h> -#include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/uaccess.h> #include 
"internal.h" -struct compat_ncp_mount_data { - compat_int_t version; - compat_uint_t ncp_fd; - __compat_uid_t mounted_uid; - compat_pid_t wdog_pid; - unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - compat_uint_t time_out; - compat_uint_t retry_count; - compat_uint_t flags; - __compat_uid_t uid; - __compat_gid_t gid; - compat_mode_t file_mode; - compat_mode_t dir_mode; -}; - -struct compat_ncp_mount_data_v4 { - compat_int_t version; - compat_ulong_t flags; - compat_ulong_t mounted_uid; - compat_long_t wdog_pid; - compat_uint_t ncp_fd; - compat_uint_t time_out; - compat_uint_t retry_count; - compat_ulong_t uid; - compat_ulong_t gid; - compat_ulong_t file_mode; - compat_ulong_t dir_mode; -}; - -static void *do_ncp_super_data_conv(void *raw_data) -{ - int version = *(unsigned int *)raw_data; - - if (version == 3) { - struct compat_ncp_mount_data *c_n = raw_data; - struct ncp_mount_data *n = raw_data; - - n->dir_mode = c_n->dir_mode; - n->file_mode = c_n->file_mode; - n->gid = c_n->gid; - n->uid = c_n->uid; - memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int))); - n->wdog_pid = c_n->wdog_pid; - n->mounted_uid = c_n->mounted_uid; - } else if (version == 4) { - struct compat_ncp_mount_data_v4 *c_n = raw_data; - struct ncp_mount_data_v4 *n = raw_data; - - n->dir_mode = c_n->dir_mode; - n->file_mode = c_n->file_mode; - n->gid = c_n->gid; - n->uid = c_n->uid; - n->retry_count = c_n->retry_count; - n->time_out = c_n->time_out; - n->ncp_fd = c_n->ncp_fd; - n->wdog_pid = c_n->wdog_pid; - n->mounted_uid = c_n->mounted_uid; - n->flags = c_n->flags; - } else if (version != 5) { - return NULL; - } - - return raw_data; -} - - struct compat_nfs_string { compat_uint_t len; compat_uptr_t data; @@ -154,7 +87,6 @@ static int do_nfs4_super_data_conv(void *raw_data) return 0; } -#define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, @@ -183,9 +115,7 @@ COMPAT_SYSCALL_DEFINE5(mount, 
const char __user *, dev_name, goto out2; if (kernel_type && options) { - if (!strcmp(kernel_type, NCPFS_NAME)) { - do_ncp_super_data_conv(options); - } else if (!strcmp(kernel_type, NFS4_NAME)) { + if (!strcmp(kernel_type, NFS4_NAME)) { retval = -EINVAL; if (do_nfs4_super_data_conv(options)) goto out3; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ef80085ed564..9907475b4226 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -38,8 +38,6 @@ #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> #include <linux/mtio.h> -#include <linux/auto_fs.h> -#include <linux/auto_fs4.h> #include <linux/tty.h> #include <linux/vt_kern.h> #include <linux/fb.h> diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index ad718e5e37bb..28ef9e528853 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -90,14 +90,14 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); + sd_iattr->ia_atime = timespec64_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); + sd_iattr->ia_mtime = timespec64_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); + sd_iattr->ia_ctime = timespec64_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 017b0ab19bc4..f408994fc632 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -90,7 +90,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; - static struct timespec zerotime; + static struct timespec64 zerotime; inode = 
iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) @@ -492,7 +492,7 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); kill_mtd_super(sb); @@ -808,10 +808,7 @@ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, un } out: mutex_unlock(&read_mutex); - if (IS_ERR(inode)) - return ERR_CAST(inode); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int cramfs_readpage(struct file *file, struct page *page) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0d5e6a569d58..0959044c5cee 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -26,15 +26,8 @@ #include <linux/namei.h> #include "fscrypt_private.h" -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
- */ -static void completion_pages(struct work_struct *work) +static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -46,22 +39,38 @@ static void completion_pages(struct work_struct *work) if (ret) { WARN_ON_ONCE(1); SetPageError(page); - } else { + } else if (done) { SetPageUptodate(page); } - unlock_page(page); + if (done) + unlock_page(page); } +} + +void fscrypt_decrypt_bio(struct bio *bio) +{ + __fscrypt_decrypt_bio(bio, false); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio); + +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + + __fscrypt_decrypt_bio(bio, true); fscrypt_release_ctx(ctx); bio_put(bio); } -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio) { INIT_WORK(&ctx->r.work, completion_pages); ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); + fscrypt_enqueue_decrypt_work(&ctx->r.work); } -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); +EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio); void fscrypt_pullback_bio_page(struct page **page, bool restore) { diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ce654526c0fb..0f46cf550907 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -45,12 +45,18 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -struct workqueue_struct *fscrypt_read_workqueue; +static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; struct kmem_cache *fscrypt_info_cachep; +void fscrypt_enqueue_decrypt_work(struct work_struct *work) +{ + queue_work(fscrypt_read_workqueue, work); +} 
+EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); + /** * fscrypt_release_ctx() - Releases an encryption context * @ctx: The encryption context to release. @@ -156,12 +162,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, } req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, @@ -178,9 +180,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); + fscrypt_err(inode->i_sb, + "%scryption failed for inode %lu, block %llu: %d", + (rw == FS_DECRYPT ? "de" : "en"), + inode->i_ino, lblk_num, res); return res; } return 0; @@ -326,7 +329,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); @@ -353,7 +355,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, }; -EXPORT_SYMBOL(fscrypt_d_ops); void fscrypt_restore_control_page(struct page *page) { @@ -422,13 +423,43 @@ fail: return res; } +void fscrypt_msg(struct super_block *sb, const char *level, + const char *fmt, ...) 
+{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sfscrypt: %pV\n", level, &vaf); + va_end(args); +} + /** * fscrypt_init() - Set up for fs encryption. */ static int __init fscrypt_init(void) { + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", - WQ_HIGHPRI, 0); + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); if (!fscrypt_read_workqueue) goto fail; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index e33f3d3c5ade..d7a0f682ca12 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -59,11 +59,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: skcipher_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -74,8 +71,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename encryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -96,23 +94,14 @@ static int fname_decrypt(struct inode *inode, struct 
skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - unsigned lim; - - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) - return -EIO; /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -127,8 +116,9 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename decryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -341,12 +331,12 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } ret = fscrypt_get_encryption_info(dir); - if (ret && ret != -EOPNOTSUPP) + if (ret) return ret; if (dir->i_crypt_info) { if (!fscrypt_fname_encrypted_size(dir, iname->len, - dir->i_sb->s_cop->max_namelen(dir), + dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ad6722bae8b7..39c20ef26db4 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -18,15 +18,7 @@ /* Encryption parameters */ #define FS_IV_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_128_CBC_KEY_SIZE 16 -#define FS_AES_128_CTS_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define 
FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_KEY_DERIVATION_NONCE_SIZE 16 /** * Encryption context for inode @@ -91,13 +83,16 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) return true; + if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS) + return true; + return false; } /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); -extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, @@ -106,6 +101,15 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +extern const struct dentry_operations fscrypt_d_ops; + +extern void __printf(3, 4) __cold +fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); + +#define fscrypt_warn(sb, fmt, ...) \ + fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(sb, fmt, ...) 
\ + fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bec06490fb13..926e5df20ec3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode->i_sb, + "inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 05f5ee1f0705..e997ca51192f 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -19,17 +19,16 @@ static struct crypto_shash *essiv_hash_tfm; -/** - * derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_raw_key: Derived raw key. +/* + * Key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the inode's nonce as the AES key. * - * Return: Zero on success; non-zero otherwise. + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. 
*/ -static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - const struct fscrypt_key *source_key, - u8 derived_raw_key[FS_MAX_KEY_SIZE]) +static int derive_key_aes(const u8 *master_key, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) { int res = 0; struct skcipher_request *req = NULL; @@ -51,14 +50,13 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, deriving_key, - FS_AES_128_ECB_KEY_SIZE); + res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); if (res < 0) goto out; - sg_init_one(&src_sg, source_key->raw, source_key->size); - sg_init_one(&dst_sg, derived_raw_key, source_key->size); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: @@ -67,101 +65,147 @@ out: return res; } -static int validate_user_key(struct fscrypt_info *crypt_info, - struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix, int min_keysize) +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. 
+ */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) { char *description; - struct key *keyring_key; - struct fscrypt_key *master_key; + struct key *key; const struct user_key_payload *ukp; - int res; + const struct fscrypt_key *payload; description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); + FS_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - keyring_key = request_key(&key_type_logon, description, NULL); + key = request_key(&key_type_logon, description, NULL); kfree(description); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - down_read(&keyring_key->sem); - - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - ukp = user_key_payload_locked(keyring_key); - if (!ukp) { - /* key was revoked before we acquired its semaphore */ - res = -EKEYREVOKED; - goto out; + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? 
*/ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; } - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - goto out; + + if (payload->size < min_keysize) { + fscrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - - if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE - || master_key->size % AES_BLOCK_SIZE != 0) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - goto out; + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Find the master key, then derive the inode's actual encryption key */ +static int find_and_derive_key(const struct inode *inode, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, + ctx->master_key_descriptor, + derived_keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, + ctx->master_key_descriptor, + derived_keysize, &payload); } - res = derive_key_aes(ctx->nonce, master_key, raw_key); -out: - up_read(&keyring_key->sem); - key_put(keyring_key); - return res; + if (IS_ERR(key)) + return PTR_ERR(key); + err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize); + up_read(&key->sem); + key_put(key); + return err; } -static const struct { +static 
struct fscrypt_mode { + const char *friendly_name; const char *cipher_str; int keysize; + bool logged_impl_name; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", - FS_AES_256_XTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", - FS_AES_256_CTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", - FS_AES_128_CBC_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", - FS_AES_128_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { + .friendly_name = "Speck128/256-XTS", + .cipher_str = "xts(speck128)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { + .friendly_name = "Speck128/256-CTS-CBC", + .cipher_str = "cts(cbc(speck128))", + .keysize = 32, + }, }; -static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, - const char **cipher_str_ret, int *keysize_ret) +static struct fscrypt_mode * +select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) { - u32 mode; - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", - inode->i_ino, - ci->ci_data_mode, ci->ci_filename_mode); - return -EINVAL; + fscrypt_warn(inode->i_sb, + "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", + inode->i_ino, ci->ci_data_mode, + ci->ci_filename_mode); + return ERR_PTR(-EINVAL); } - if 
(S_ISREG(inode->i_mode)) { - mode = ci->ci_data_mode; - } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - mode = ci->ci_filename_mode; - } else { - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return -EINVAL; - } + if (S_ISREG(inode->i_mode)) + return &available_modes[ci->ci_data_mode]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[ci->ci_filename_mode]; - *cipher_str_ret = available_modes[mode].cipher_str; - *keysize_ret = available_modes[mode].keysize; - return 0; + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); } static void put_crypt_info(struct fscrypt_info *ci) @@ -184,8 +228,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { - pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", - PTR_ERR(tfm)); + fscrypt_warn(NULL, + "error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); return PTR_ERR(tfm); } prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); @@ -245,8 +290,7 @@ int fscrypt_get_encryption_info(struct inode *inode) struct fscrypt_info *crypt_info; struct fscrypt_context ctx; struct crypto_skcipher *ctfm; - const char *cipher_str; - int keysize; + struct fscrypt_mode *mode; u8 *raw_key = NULL; int res; @@ -290,57 +334,59 @@ int fscrypt_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); - if (res) + mode = select_encryption_mode(crypt_info, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); goto out; + } /* * This cannot be a stack buffer because it is passed to the scatterlist * 
crypto API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + raw_key = kmalloc(mode->keysize, GFP_NOFS); if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, - keysize); - if (res && inode->i_sb->s_cop->key_prefix) { - int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix, - keysize); - if (res2) { - if (res2 == -ENOKEY) - res = -ENOKEY; - goto out; - } - } else if (res) { + res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize); + if (res) goto out; - } - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", - __func__, res, inode->i_ino); + + ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(ctfm)) { + res = PTR_ERR(ctfm); + fscrypt_warn(inode->i_sb, + "error allocating '%s' transform for inode %lu: %d", + mode->cipher_str, inode->i_ino, res); goto out; } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
+ */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(ctfm)->base.cra_driver_name); + } crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - /* - * if the provided key is longer than keysize, we use the first - * keysize bytes of the derived key only - */ - res = crypto_skcipher_setkey(ctfm, raw_key, keysize); + res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize); if (res) goto out; if (S_ISREG(inode->i_mode) && crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { - res = init_essiv_generator(crypt_info, raw_key, keysize); + res = init_essiv_generator(crypt_info, raw_key, mode->keysize); if (res) { - pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error initializing ESSIV generator for inode %lu: %d", + inode->i_ino, res); goto out; } } @@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, } } +static struct page *dax_busy_page(void *entry) +{ + unsigned long pfn; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + if (page_ref_count(page) > 1) + return page; + } + return NULL; +} + /* * Find radix tree entry at given index. If it points to an exceptional entry, * return it with the radix tree entry locked. If the radix tree doesn't @@ -492,6 +505,90 @@ restart: return entry; } +/** + * dax_layout_busy_page - find first pinned page in @mapping + * @mapping: address space to scan for a page with ref count > 1 + * + * DAX requires ZONE_DEVICE mapped pages. These pages are never + * 'onlined' to the page allocator so they are considered idle when + * page->count == 1. A filesystem uses this interface to determine if + * any page in the mapping is busy, i.e. for DMA, or other + * get_user_pages() usages. 
+ * + * It is expected that the filesystem is holding locks to block the + * establishment of new mappings in this address_space. I.e. it expects + * to be able to run unmap_mapping_range() and subsequently not race + * mapping_mapped() becoming true. + */ +struct page *dax_layout_busy_page(struct address_space *mapping) +{ + pgoff_t indices[PAGEVEC_SIZE]; + struct page *page = NULL; + struct pagevec pvec; + pgoff_t index, end; + unsigned i; + + /* + * In the 'limited' case get_user_pages() for dax is disabled. + */ + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return NULL; + + if (!dax_mapping(mapping) || !mapping_mapped(mapping)) + return NULL; + + pagevec_init(&pvec); + index = 0; + end = -1; + + /* + * If we race get_user_pages_fast() here either we'll see the + * elevated page count in the pagevec_lookup and wait, or + * get_user_pages_fast() will see that the page it took a reference + * against is no longer mapped in the page tables and bail to the + * get_user_pages() slow path. The slow path is protected by + * pte_lock() and pmd_lock(). New references are not taken without + * holding those locks, and unmap_mapping_range() will not zero the + * pte or pmd without holding the respective lock, so we are + * guaranteed to either see new references or prevent new + * references from being established. 
+ */ + unmap_mapping_range(mapping, 0, 0, 1); + + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *pvec_ent = pvec.pages[i]; + void *entry; + + index = indices[i]; + if (index >= end) + break; + + if (!radix_tree_exceptional_entry(pvec_ent)) + continue; + + xa_lock_irq(&mapping->i_pages); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (entry) + page = dax_busy_page(entry); + put_unlocked_mapping_entry(mapping, index, entry); + xa_unlock_irq(&mapping->i_pages); + if (page) + break; + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + index++; + + if (page) + break; + } + return page; +} +EXPORT_SYMBOL_GPL(dax_layout_busy_page); + static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -677,7 +774,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, * downgrading page table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD @@ -905,14 +1002,13 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
*/ -static int dax_load_hole(struct address_space *mapping, void *entry, +static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - int ret = VM_FAULT_NOPAGE; + vm_fault_t ret = VM_FAULT_NOPAGE; struct page *zero_page; - void *entry2; pfn_t pfn; zero_page = ZERO_PAGE(0); @@ -922,14 +1018,9 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } pfn = page_to_pfn_t(zero_page); - entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(entry2)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - - vm_insert_mixed(vmf->vma, vaddr, pfn); + dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, + false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + size_t xfer; int id; if (iov_iter_rw(iter) == READ) { @@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * vfs_write(), depending on which operation we are doing. */ if (iov_iter_rw(iter) == WRITE) - map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else - map_len = copy_to_iter(kaddr, map_len, iter); - if (map_len <= 0) { - ret = map_len ? 
map_len : -EFAULT; - break; - } + xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, + map_len, iter); + + pos += xfer; + length -= xfer; + done += xfer; - pos += map_len; - length -= map_len; - done += map_len; + if (xfer == 0) + ret = -EFAULT; + if (xfer < map_len) + break; } dax_read_unlock(id); @@ -1112,7 +1206,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(dax_iomap_rw); -static int dax_fault_return(int error) +static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; @@ -1132,7 +1226,7 @@ static bool dax_fault_is_synchronous(unsigned long flags, && (iomap->flags & IOMAP_F_DIRTY); } -static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1145,18 +1239,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; - int vmf_ret = 0; + vm_fault_t ret = 0; void *entry; pfn_t pfn; - trace_dax_pte_fault(inode, vmf, vmf_ret); + trace_dax_pte_fault(inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (pos >= i_size_read(inode)) { - vmf_ret = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } @@ -1165,7 +1259,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); + ret = dax_fault_return(PTR_ERR(entry)); goto out; } @@ -1176,7 +1270,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * retried. 
*/ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { - vmf_ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -1189,7 +1283,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap_errp) *iomap_errp = error; if (error) { - vmf_ret = dax_fault_return(error); + ret = dax_fault_return(error); goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { @@ -1219,9 +1313,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); - vmf_ret = finish_fault(vmf); - if (!vmf_ret) - vmf_ret = VM_FAULT_DONE_COW; + ret = finish_fault(vmf); + if (!ret) + ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1240,10 +1334,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 0, write && !sync); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto error_finish_iomap; - } /* * If we are doing synchronous page fault and inode needs fsync, @@ -1257,23 +1347,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; } *pfnp = pfn; - vmf_ret = VM_FAULT_NEEDDSYNC | major; + ret = VM_FAULT_NEEDDSYNC | major; goto finish_iomap; } trace_dax_insert_mapping(inode, vmf, entry); if (write) - error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); else - error = vm_insert_mixed(vma, vaddr, pfn); + ret = vmf_insert_mixed(vma, vaddr, pfn); - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error == -EBUSY) - error = 0; - break; + goto finish_iomap; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - vmf_ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1284,12 +1371,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } error_finish_iomap: - vmf_ret = dax_fault_return(error) | 
major; + ret = dax_fault_return(error); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; - if (vmf_ret & VM_FAULT_ERROR) + if (ret & VM_FAULT_ERROR) copied = 0; /* * The fault is done by now and there's no way back (other @@ -1302,12 +1389,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff); out: - trace_dax_pte_fault_done(inode, vmf, vmf_ret); - return vmf_ret; + trace_dax_pte_fault_done(inode, vmf, ret); + return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, +static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pfn = page_to_pfn_t(zero_page); ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(ret)) - goto fallback; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1348,7 +1433,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1358,7 +1443,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, bool sync; unsigned int iomap_flags = (write ? 
IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; - int result = VM_FAULT_FALLBACK; + vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; void *entry; @@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD, write && !sync); - if (IS_ERR(entry)) - goto finish_iomap; /* * If we are doing synchronous page fault and inode needs fsync, @@ -1509,7 +1592,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1529,7 +1612,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { switch (pe_size) { @@ -1553,14 +1636,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * DAX file. It takes care of marking corresponding radix tree entry as dirty * as well. 
*/ -static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; - int vmf_ret, error; + vm_fault_t ret; xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); @@ -1579,21 +1662,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: - error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - vmf_ret = dax_fault_return(error); + ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); break; #endif default: - vmf_ret = VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; } put_locked_mapping_entry(mapping, index); - trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); - return vmf_ret; + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); + return ret; } /** @@ -1606,8 +1688,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, * stored persistently on the media and handles inserting of appropriate page * table entry. 
*/ -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; diff --git a/fs/dcache.c b/fs/dcache.c index 86d2de63461e..0e8e5de3c48a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -580,6 +580,7 @@ static void __dentry_kill(struct dentry *dentry) spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); + cond_resched(); } static struct dentry *__lock_parent(struct dentry *dentry) @@ -827,30 +828,24 @@ static inline bool fast_dput(struct dentry *dentry) */ void dput(struct dentry *dentry) { - if (unlikely(!dentry)) - return; + while (dentry) { + might_sleep(); -repeat: - might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } - rcu_read_lock(); - if (likely(fast_dput(dentry))) { + /* Slow case: now with the dentry lock held */ rcu_read_unlock(); - return; - } - - /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - if (likely(retain_dentry(dentry))) { - spin_unlock(&dentry->d_lock); - return; - } + if (likely(retain_dentry(dentry))) { + spin_unlock(&dentry->d_lock); + return; + } - dentry = dentry_kill(dentry); - if (dentry) { - cond_resched(); - goto repeat; + dentry = dentry_kill(dentry); } } EXPORT_SYMBOL(dput); @@ -907,6 +902,35 @@ repeat: } EXPORT_SYMBOL(dget_parent); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + __dget(alias); + return alias; +} + +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. 
+ */ +struct dentry *d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} +EXPORT_SYMBOL(d_find_any_alias); + /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question @@ -923,34 +947,19 @@ EXPORT_SYMBOL(dget_parent); */ static struct dentry *__d_find_alias(struct inode *inode) { - struct dentry *alias, *discon_alias; + struct dentry *alias; + + if (S_ISDIR(inode->i_mode)) + return __d_find_any_alias(inode); -again: - discon_alias = NULL; hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (IS_ROOT(alias) && - (alias->d_flags & DCACHE_DISCONNECTED)) { - discon_alias = alias; - } else { - __dget_dlock(alias); - spin_unlock(&alias->d_lock); - return alias; - } - } - spin_unlock(&alias->d_lock); - } - if (discon_alias) { - alias = discon_alias; - spin_lock(&alias->d_lock); - if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { + if (!d_unhashed(alias)) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); - goto again; } return NULL; } @@ -1052,8 +1061,6 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; - cond_resched(); - dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1230,13 +1237,11 @@ enum d_walk_ret { * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry - * @finish: callback when successfully finished the walk * - * The @enter() and @finish() callbacks are called with d_lock held. + * The @enter() callbacks are called with d_lock held. 
*/ static void d_walk(struct dentry *parent, void *data, - enum d_walk_ret (*enter)(void *, struct dentry *), - void (*finish)(void *)) + enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent; struct list_head *next; @@ -1325,8 +1330,6 @@ ascend: if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); - if (finish) - finish(data); out_unlock: spin_unlock(&this_parent->d_lock); @@ -1375,7 +1378,7 @@ int path_has_submounts(const struct path *parent) struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); - d_walk(parent->dentry, &data, path_check_mount, NULL); + d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; @@ -1483,11 +1486,16 @@ void shrink_dcache_parent(struct dentry *parent) data.start = parent; data.found = 0; - d_walk(parent, &data, select_collect, NULL); + d_walk(parent, &data, select_collect); + + if (!list_empty(&data.dispose)) { + shrink_dentry_list(&data.dispose); + continue; + } + + cond_resched(); if (!data.found) break; - - shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1518,7 +1526,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); - d_walk(dentry, dentry, umount_check, NULL); + d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } @@ -1542,78 +1550,48 @@ void shrink_dcache_for_umount(struct super_block *sb) } } -struct detach_data { - struct select_data select; - struct dentry *mountpoint; -}; -static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { - struct detach_data *data = _data; - + struct dentry **victim = _data; if (d_mountpoint(dentry)) { __dget_dlock(dentry); - data->mountpoint = dentry; + *victim = dentry; return D_WALK_QUIT; } - - return 
select_collect(&data->select, dentry); -} - -static void check_and_drop(void *_data) -{ - struct detach_data *data = _data; - - if (!data->mountpoint && list_empty(&data->select.dispose)) - __d_drop(data->select.start); + return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) - * - * no dcache lock. - * - * The final d_drop is done as an atomic operation relative to - * rename_lock ensuring there are no races with d_set_mounted. This - * ensures there are no unhashed dentries on the path to a mountpoint. */ void d_invalidate(struct dentry *dentry) { - /* - * If it's already been dropped, return OK. - */ + bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } + __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ - if (!dentry->d_inode) { - d_drop(dentry); + if (!dentry->d_inode) return; - } + shrink_dcache_parent(dentry); for (;;) { - struct detach_data data; - - data.mountpoint = NULL; - INIT_LIST_HEAD(&data.select.dispose); - data.select.start = dentry; - data.select.found = 0; - - d_walk(dentry, &data, detach_and_collect, check_and_drop); - - if (!list_empty(&data.select.dispose)) - shrink_dentry_list(&data.select.dispose); - else if (!data.mountpoint) + struct dentry *victim = NULL; + d_walk(dentry, &victim, find_submount); + if (!victim) { + if (had_submounts) + shrink_dcache_parent(dentry); return; - - if (data.mountpoint) { - detach_mounts(data.mountpoint); - dput(data.mountpoint); } + had_submounts = true; + detach_mounts(victim); + dput(victim); } } EXPORT_SYMBOL(d_invalidate); @@ -1899,6 +1877,28 @@ void d_instantiate(struct dentry *entry, struct inode * inode) } EXPORT_SYMBOL(d_instantiate); +/* + * This should be equivalent to d_instantiate() + unlock_new_inode(), + * with lockdep-related part of unlock_new_inode() done before + * 
anything else. Use that instead of open-coding d_instantiate()/ + * unlock_new_inode() combinations. + */ +void d_instantiate_new(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!inode); + lockdep_annotate_inode_mutex_key(inode); + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_instantiate_new); + /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete @@ -1941,35 +1941,6 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry * __d_find_any_alias(struct inode *inode) -{ - struct dentry *alias; - - if (hlist_empty(&inode->i_dentry)) - return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - __dget(alias); - return alias; -} - -/** - * d_find_any_alias - find any alias for a given inode - * @inode: inode to find an alias for - * - * If any aliases exist for the given inode, take and return a - * reference for one of them. If no aliases exist, return %NULL. 
- */ -struct dentry *d_find_any_alias(struct inode *inode) -{ - struct dentry *de; - - spin_lock(&inode->i_lock); - de = __d_find_any_alias(inode); - spin_unlock(&inode->i_lock); - return de; -} -EXPORT_SYMBOL(d_find_any_alias); - static struct dentry *__d_instantiate_anon(struct dentry *dentry, struct inode *inode, bool disconnected) @@ -3112,7 +3083,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) void d_genocide(struct dentry *parent) { - d_walk(parent, parent, d_genocide_kill, NULL); + d_walk(parent, parent, d_genocide_kill); } EXPORT_SYMBOL(d_genocide); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 1f99678ff5d3..4fce1da7db23 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -796,19 +796,13 @@ EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - char buf[32]; - size_t buf_size; bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &bv) == 0) { + r = kstrtobool_from_user(user_buf, count, &bv); + if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; diff --git a/fs/direct-io.c b/fs/direct-io.c index 874607bb6e02..093fb54cd316 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct bio *bio; /* - * bio_alloc() is guaranteed to return a bio when called with - * __GFP_RECLAIM and we request a valid number of vectors. + * bio_alloc() is guaranteed to return a bio when allowed to sleep and + * we request a valid number of vectors. 
*/ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 78a7c855b06b..5ba94be006ee 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -517,7 +517,7 @@ static int new_lockspace(const char *name, const char *cluster, size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; - ls->ls_rsbtbl = vmalloc(sizeof(struct dlm_rsbtable) * size); + ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable))); if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5243989a60cc..a5e4a221435c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; + struct timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1080,11 +1081,22 @@ static void sctp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); + /* + * Make sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. 
+ */ + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, - O_NONBLOCK); + 0); + memset(&tv, 0, sizeof(tv)); + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); + if (result == -EINPROGRESS) result = 0; if (result == 0) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 97d17eaeba07..49121e5a8de2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -283,8 +283,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, iget_failed(ecryptfs_inode); goto out; } - unlock_new_inode(ecryptfs_inode); - d_instantiate(ecryptfs_dentry, ecryptfs_inode); + d_instantiate_new(ecryptfs_dentry, ecryptfs_inode); out: return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 602ca4285b2e..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, pt->_key = epi->event.events; if (!is_file_epoll(epi->ffd.file)) - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & - epi->event.events; + return vfs_poll(epi->ffd.file, pt) & epi->event.events; ep = epi->ffd.file->private_data; poll_wait(epi->ffd.file, &ep->poll_wait, pt); @@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* The target file descriptor must support poll */ error = -EPERM; - if (!tf.file->f_op->poll) + if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ diff --git a/fs/exec.c b/fs/exec.c index 183059c427b9..72e961a62adb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; @@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm 
*bprm) err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) @@ -326,7 +324,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } @@ -1706,14 +1704,13 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execveat_common(int fd, struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp, - int flags) +static int __do_execve_file(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags, struct file *file) { char *pathbuf = NULL; struct linux_binprm *bprm; - struct file *file; struct files_struct *displaced; int retval; @@ -1752,7 +1749,8 @@ static int do_execveat_common(int fd, struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_execat(fd, filename, flags); + if (!file) + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1760,7 +1758,9 @@ static int do_execveat_common(int fd, struct filename *filename, sched_exec(); bprm->file = file; - if (fd == AT_FDCWD || filename->name[0] == '/') { + if (!filename) { + bprm->filename = "none"; + } else if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') @@ -1822,11 +1822,13 @@ static int do_execveat_common(int fd, struct filename *filename, current->fs->in_exec = 0; current->in_execve = 0; membarrier_execve(current); + rseq_execve(current); acct_update_integrals(current); 
task_numa_free(current); free_bprm(bprm); kfree(pathbuf); - putname(filename); + if (filename) + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1849,10 +1851,27 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: - putname(filename); + if (filename) + putname(filename); return retval; } +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + return __do_execve_file(fd, filename, argv, envp, flags, NULL); +} + +int do_execve_file(struct file *file, void *__argv, void *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file); +} + int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 0ac62811b341..5f81fcd383a4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -110,8 +110,8 @@ static int pcol_try_alloc(struct page_collect *pcol) pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); for (; pages; pages >>= 1) { - pcol->pages = kmalloc(pages * sizeof(struct page *), - GFP_KERNEL); + pcol->pages = kmalloc_array(pages, sizeof(struct page *), + GFP_KERNEL); if (likely(pcol->pages)) { pcol->alloc_pages = pages; return 0; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c156b7a..1b8b44637e70 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -146,68 +146,82 @@ int _ore_get_io_state(struct ore_layout *layout, struct ore_io_state **pios) { struct ore_io_state *ios; - struct page **pages; - struct osd_sg_entry *sgilist; + size_t size_ios, size_extra, size_total; + void *ios_extra; + + /* + * The desired layout looks like this, with the extra_allocation + * items pointed at from fields within ios or per_dev: + struct __alloc_all_io_state { struct ore_io_state ios; 
struct ore_per_dev_state per_dev[numdevs]; union { struct osd_sg_entry sglist[sgs_per_dev * numdevs]; struct page *pages[num_par_pages]; - }; - } *_aios; - - if (likely(sizeof(*_aios) <= PAGE_SIZE)) { - _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); - if (unlikely(!_aios)) { - ORE_DBGMSG("Failed kzalloc bytes=%zd\n", - sizeof(*_aios)); + } extra_allocation; + } whole_allocation; + + */ + + /* This should never happen, so abort early if it ever does. */ + if (sgs_per_dev && num_par_pages) { + ORE_DBGMSG("Tried to use both pages and sglist\n"); + *pios = NULL; + return -EINVAL; + } + + if (numdevs > (INT_MAX - sizeof(*ios)) / + sizeof(struct ore_per_dev_state)) + return -ENOMEM; + size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs; + + if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry)) + return -ENOMEM; + if (num_par_pages > INT_MAX / sizeof(struct page *)) + return -ENOMEM; + size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs), + sizeof(struct page *) * num_par_pages); + + size_total = size_ios + size_extra; + + if (likely(size_total <= PAGE_SIZE)) { + ios = kzalloc(size_total, GFP_KERNEL); + if (unlikely(!ios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? _aios->pages : NULL; - sgilist = sgs_per_dev ? 
_aios->sglist : NULL; - ios = &_aios->ios; + ios_extra = (char *)ios + size_ios; } else { - struct __alloc_small_io_state { - struct ore_io_state ios; - struct ore_per_dev_state per_dev[numdevs]; - } *_aio_small; - union __extra_part { - struct osd_sg_entry sglist[sgs_per_dev * numdevs]; - struct page *pages[num_par_pages]; - } *extra_part; - - _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); - if (unlikely(!_aio_small)) { + ios = kzalloc(size_ios, GFP_KERNEL); + if (unlikely(!ios)) { ORE_DBGMSG("Failed alloc first part bytes=%zd\n", - sizeof(*_aio_small)); + size_ios); *pios = NULL; return -ENOMEM; } - extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); - if (unlikely(!extra_part)) { + ios_extra = kzalloc(size_extra, GFP_KERNEL); + if (unlikely(!ios_extra)) { ORE_DBGMSG("Failed alloc second part bytes=%zd\n", - sizeof(*extra_part)); - kfree(_aio_small); + size_extra); + kfree(ios); *pios = NULL; return -ENOMEM; } - pages = num_par_pages ? extra_part->pages : NULL; - sgilist = sgs_per_dev ? 
extra_part->sglist : NULL; /* In this case the per_dev[0].sgilist holds the pointer to * be freed */ - ios = &_aio_small->ios; ios->extra_part_alloc = true; } - if (pages) { - ios->parity_pages = pages; + if (num_par_pages) { + ios->parity_pages = ios_extra; ios->max_par_pages = num_par_pages; } - if (sgilist) { + if (sgs_per_dev) { + struct osd_sg_entry *sgilist = ios_extra; unsigned d; for (d = 0; d < numdevs; ++d) { @@ -790,7 +804,7 @@ int ore_create(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -815,7 +829,7 @@ int ore_remove(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -847,7 +861,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -966,7 +980,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) return 0; /* Just an empty slot */ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, first_dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -1060,7 +1074,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = 
osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, cur_comp)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 27cbdb697649..199590f36203 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -71,6 +71,11 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, { struct __stripe_pages_2d *sp2d; unsigned data_devs = group_width - parity; + + /* + * Desired allocation layout is, though when larger than PAGE_SIZE, + * each struct __alloc_1p_arrays is separately allocated: + struct _alloc_all_bytes { struct __alloc_stripe_pages_2d { struct __stripe_pages_2d sp2d; @@ -82,55 +87,85 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, char page_is_read[data_devs]; } __a1pa[pages_in_unit]; } *_aab; + struct __alloc_1p_arrays *__a1pa; struct __alloc_1p_arrays *__a1pa_end; - const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + + */ + + char *__a1pa; + char *__a1pa_end; + + const size_t sizeof_stripe_pages_2d = + sizeof(struct __stripe_pages_2d) + + sizeof(struct __1_page_stripe) * pages_in_unit; + const size_t sizeof__a1pa = + ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, + sizeof(void *)); + const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; + const size_t alloc_total = sizeof_stripe_pages_2d + + sizeof__a1pa_arrays; + unsigned num_a1pa, alloc_size, i; /* FIXME: check these numbers in ore_verify_layout */ - BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); BUG_ON(sizeof__a1pa > PAGE_SIZE); - if (sizeof(*_aab) > PAGE_SIZE) { - num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; - alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + /* + * If alloc_total would be larger than PAGE_SIZE, only allocate + * as many a1pa items as would fill the rest of the page, instead + * of the full pages_in_unit 
count. + */ + if (alloc_total > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; + alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; } else { num_a1pa = pages_in_unit; - alloc_size = sizeof(*_aab); + alloc_size = alloc_total; } - _aab = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!_aab)) { + *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!sp2d)) { ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); return -ENOMEM; } + /* From here Just call _sp2d_free */ - sp2d = &_aab->__asp2d.sp2d; - *psp2d = sp2d; /* From here Just call _sp2d_free */ - - __a1pa = _aab->__a1pa; - __a1pa_end = __a1pa + num_a1pa; + /* Find start of a1pa area. */ + __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; + /* Find end of the _allocated_ a1pa area. */ + __a1pa_end = __a1pa + alloc_size; + /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */ for (i = 0; i < pages_in_unit; ++i) { + struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i]; + if (unlikely(__a1pa >= __a1pa_end)) { num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); + alloc_size = sizeof__a1pa * num_a1pa; - __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); + __a1pa = kzalloc(alloc_size, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", num_a1pa); return -ENOMEM; } - __a1pa_end = __a1pa + num_a1pa; + __a1pa_end = __a1pa + alloc_size; /* First *pages is marked for kfree of the buffer */ - sp2d->_1p_stripes[i].alloc = true; + stripe->alloc = true; } - sp2d->_1p_stripes[i].pages = __a1pa->pages; - sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; - sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; - ++__a1pa; + /* + * Attach all _lp_stripes pointers to the allocation for + * it which was either part of the original PAGE_SIZE + * allocation or the subsequent allocation in this loop. 
+ */ + stripe->pages = (void *)__a1pa; + stripe->scribble = stripe->pages + group_width; + stripe->page_is_read = (char *)stripe->scribble + group_width; + __a1pa += sizeof__a1pa; } sp2d->parity = parity; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c2f52a..41cf2fbee50d 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length) { - struct osd_request *or = osd_start_request(od, GFP_KERNEL); + struct osd_request *or = osd_start_request(od); /* struct osd_sense_info osi = {.key = 0};*/ int ret; @@ -549,27 +549,26 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { - struct __alloc_ore_devs_and_exofs_devs { - /* Twice bigger table: See exofs_init_comps() and comment at - * exofs_read_lookup_dev_table() - */ - struct ore_dev *oreds[numdevs * 2 - 1]; - struct exofs_dev eds[numdevs]; - } *aoded; + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + const size_t numores = numdevs * 2 - 1; struct exofs_dev *eds; unsigned i; - aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); - if (unlikely(!aoded)) { + sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) + + numdevs * sizeof(struct exofs_dev), GFP_KERNEL); + if (unlikely(!sbi->oc.ods)) { EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } - sbi->oc.ods = aoded->oreds; - *peds = eds = aoded->eds; + /* Start of allocated struct exofs_dev entries */ + *peds = eds = (void *)sbi->oc.ods[numores]; + /* Initialize pointers into struct exofs_dev */ for (i = 0; i < numdevs; ++i) - aoded->oreds[i] = &eds[i].ored; + sbi->oc.ods[i] = &eds[i].ored; return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 
cc40802ddfa8..00e759f05161 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -748,7 +748,6 @@ extern void ext2_free_blocks (struct inode *, unsigned long, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); extern unsigned long ext2_count_dirs (struct super_block *); -extern void ext2_check_blocks_bitmap (struct super_block *); extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh); @@ -771,7 +770,6 @@ extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); -extern void ext2_check_inodes_bitmap (struct super_block *); extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1e01fabef130..71635909df3b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1264,21 +1264,11 @@ do_indirects: static void ext2_truncate_blocks(struct inode *inode, loff_t offset) { - /* - * XXX: it seems like a bug here that we don't allow - * IS_APPEND inode to have blocks-past-i_size trimmed off. - * review and fix this. - * - * Also would be nice to be able to handle IO errors and such, - * but that's probably too much to ask. 
- */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return; if (ext2_inode_is_fast_symlink(inode)) return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; dax_sem_down_write(EXT2_I(inode)); __ext2_truncate_blocks(inode, offset); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 55f7caadb093..152453a91877 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,8 +41,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ext2_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -255,8 +254,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out: return err; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index de1694512f1f..8ff53f8da3bc 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -557,6 +557,9 @@ static int parse_options(char *options, struct super_block *sb, set_opt (opts->s_mount_opt, NO_UID32); break; case Opt_nocheck: + ext2_msg(sb, KERN_WARNING, + "Option nocheck/check=none is deprecated and" + " will be removed in June 2020."); clear_opt (opts->s_mount_opt, CHECK); break; case Opt_debug: @@ -961,8 +964,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. 
Turning off DAX."); sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; @@ -1083,7 +1085,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); + sbi->s_group_desc = kmalloc_array (db_count, + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount; @@ -1334,9 +1338,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) new_opts.s_resgid = sbi->s_resgid; spin_unlock(&sbi->s_lock); - /* - * Allow the "check" option to be passed as a remount option. - */ if (!parse_options(data, sb, &new_opts)) return -EINVAL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 508b905d744d..e68cefe08261 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,26 +184,15 @@ static int ext4_init_block_bitmap(struct super_block *sb, unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; - int flex_bg = 0; - struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -217,22 +206,19 @@ static int ext4_init_block_bitmap(struct super_block *sb, start = ext4_group_first_block_no(sb, block_group); - if (ext4_has_feature_flex_bg(sb)) - flex_bg = 1; - /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } @@ -375,7 +361,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -387,10 +372,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, 
"bg %u: bad block bitmap checksum", block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -398,10 +381,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); @@ -436,6 +417,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -455,7 +438,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) goto verify; } ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Block bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); @@ -514,6 +506,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, 
ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a42e71203e53..7c7123f265c2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -817,12 +817,14 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ -do { \ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + struct timespec ts = timespec64_to_timespec((inode)->xtime); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&ts); \ + } \ } while (0) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -834,16 +836,20 @@ do { \ ext4_encode_extra_time(&(einode)->xtime); \ } while (0) -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ -do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (inode)->xtime.tv_nsec = 0; \ +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + struct timespec ts = timespec64_to_timespec((inode)->xtime); \ + ext4_decode_extra_time(&ts, \ + raw_inode->xtime ## _extra); \ + 
(inode)->xtime = timespec_to_timespec64(ts); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) + #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ @@ -1108,6 +1114,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1501,11 +1508,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || - ino == EXT4_USR_QUOTA_INO || - ino == EXT4_GRP_QUOTA_INO || - ino == EXT4_BOOT_LOADER_INO || - ino == EXT4_JOURNAL_INO || - ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } @@ -2390,7 +2392,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ -extern const struct file_operations ext4_seq_mb_groups_fops; +extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_mb_init(struct super_block *); @@ -2530,6 +2532,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); extern __printf(4, 5) void __ext4_error(struct super_block 
*, const char *, unsigned int, @@ -2857,6 +2862,10 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) @@ -3005,9 +3014,6 @@ extern int ext4_inline_data_fiemap(struct inode *inode, struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98fb0c119c68..adf6668b596f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -91,6 +91,7 @@ struct ext4_extent_header { }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c969275ce3ee..8ce6fd5b10dd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -577,7 +577,7 @@ int ext4_ext_precache(struct inode *inode) down_read(&ei->i_data_sem); depth = ext_depth(inode); - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); @@ -869,6 +869,12 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); + if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { + EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", + depth); + ret = -EFSCORRUPTED; + goto err; + } if (path) { 
ext4_ext_drop_refs(path); @@ -879,7 +885,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, } if (!path) { /* account possible depth increase */ - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), GFP_NOFS); if (unlikely(!path)) return ERR_PTR(-ENOMEM); @@ -1063,7 +1069,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * We need this to handle errors and free blocks * upon them. */ - ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS); if (!ablocks) return -ENOMEM; @@ -2921,7 +2927,7 @@ again: path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 763ef185dd17..c4e6fb15101b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -162,8 +162,7 @@ int __init ext4_init_es(void) void ext4_exit_es(void) { - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); + kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fb6f023622fe..7f8023340eb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -277,10 +277,11 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_huge_fault(struct vm_fault *vmf, +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int result, error = 0; + int error = 0; + vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -335,7 +336,7 @@ retry: return result; } -static int ext4_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } @@ -380,50 
+381,64 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) { - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct vfsmount *mnt = filp->f_path.mnt; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + return 0; + + if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + return 0; + + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + err = 0; + if (IS_ERR(cp)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); + if (IS_ERR(handle)) + goto out; + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_journal; + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); +out_journal: + ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); + return err; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ int ret; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !sb_rdonly(sb))) { - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; - /* - * Sample where the filesystem has been mounted and - * store it in the superblock for sysadmin convenience - * when trying to sort through large numbers of block - * devices or 
filesystem images. - */ - memset(buf, 0, sizeof(buf)); - path.mnt = mnt; - path.dentry = mnt->mnt_root; - cp = d_path(&path, buf, sizeof(buf)); - if (!IS_ERR(cp)) { - handle_t *handle; - int err; - - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - ext4_journal_stop(handle); - return err; - } - strlcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); - ext4_handle_dirty_super(handle, sb); - ext4_journal_stop(handle); - } - } + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; ret = fscrypt_file_open(inode, filp); if (ret) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index e871c4bf18e9..4b99e2db95b8 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) } /* Find all the fixed metadata in the filesystem. 
*/ -int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, - struct list_head *meta_list) +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df92e3ec9913..fb83750c1a14 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -83,7 +83,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); @@ -136,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -143,7 +138,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; @@ -155,7 +150,16 @@ ext4_read_inode_bitmap(struct 
super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + if (block_group == 0) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + ext4_error(sb, "Inode bitmap for bg 0 marked " + "uninitialized"); + err = -EFSCORRUPTED; + goto out; + } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); @@ -190,6 +194,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } @@ -337,13 +343,8 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: @@ -914,6 +915,8 @@ repeat_in_this_group: if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } @@ -1000,7 +1003,8 @@ got: /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, 
gdp)); @@ -1078,8 +1082,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + ei->i_crtime = timespec64_to_timespec(inode->i_mtime); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1105,6 +1109,8 @@ got: err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = prandom_u32(); @@ -1206,11 +1212,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - ext4_error(sb, "inode bitmap error %ld for orphan %lu", - ino, PTR_ERR(bitmap_bh)); + if (IS_ERR(bitmap_bh)) return (struct inode *) bitmap_bh; - } /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c32802c956d5..bf7fa1507e81 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; - /* Count number blocks in a subtree under 'partial' */ - count = 1; - for (i = 0; partial + i != chain + depth - 1; i++) - count *= epb; + /* + * Count number blocks in a subtree under 'partial'. At each + * level we count number of complete empty subtrees beyond + * current offset and then descend into the subtree only + * partially beyond current offset. 
+ */ + count = 0; + for (i = partial - chain + 1; i < depth; i++) + count = count * epb + (epb - offsets[i] - 1); + count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cf4c7b268a..e55a8bc870bd 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode) goto out; if (!is.s.not_found) { + if (is.s.here->e_value_inum) { + EXT4_ERROR_INODE(inode, "inline data xattr refers " + "to an external xattr inode"); + error = -EFSCORRUPTED; + goto out; + } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + @@ -431,6 +437,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); + memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || @@ -880,11 +887,11 @@ retry_journal: flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { + ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); - ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; @@ -1835,8 +1842,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); - iomap->type = 0; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; + iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); @@ -1884,42 +1891,6 @@ out: return (error < 0 ? error : 0); } -/* - * Called during xattr set, and if we can sparse space 'needed', - * just create the extent tree evict the data to the outer block. 
- * - * We use jbd2 instead of page cache to move data to the 1st block - * so that the whole transaction can be committed as a whole and - * the data isn't lost because of the delayed page cache write. - */ -int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed) -{ - int error; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - raw_inode = ext4_raw_inode(&iloc); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - if (EXT4_XATTR_LEN(entry->e_name_len) + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { - error = -ENOSPC; - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); -out: - brelse(iloc.bh); - return error; -} - int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e50c5efae67..7d6c10017bdf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -402,9 +402,9 @@ static int __check_block_validity(struct inode *inode, const char *func, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, - "lblock %lu mapped to illegal pblock " + "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, - map->m_len); + map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; @@ -4298,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out_stop; + /* If there are blocks to remove, do it */ + if (stop_block > first_block) { - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + down_write(&EXT4_I(inode)->i_data_sem); + 
ext4_discard_preallocations(inode); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); - else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_ind_remove_space(handle, inode, first_block, + stop_block); - up_write(&EXT4_I(inode)->i_data_sem); + up_write(&EXT4_I(inode)->i_data_sem); + } if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4506,7 +4506,8 @@ static int __ext4_get_inode_loc(struct inode *inode, int inodes_per_block, inode_offset; iloc->bh = NULL; - if (!ext4_valid_inum(sb, inode->i_ino)) + if (inode->i_ino < EXT4_ROOT_INO || + inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -4701,19 +4702,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, +static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= EXT4_INODE_SIZE(inode->i_sb) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); + return ext4_find_inline_data_nolock(inode); } else EXT4_I(inode)->i_inline_off = 0; + return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) @@ 
-4724,6 +4727,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid) return 0; } +/* + * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of + * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag + * set. + */ +static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else + inode_set_iversion_queried(inode, val); +} +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4893,7 +4916,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + ret = ext4_iget_extra_inode(inode, raw_inode, ei); + if (ret) + goto bad_inode; } } @@ -4910,7 +4935,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - inode_set_iversion_queried(inode, ivers); + ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; @@ -4945,6 +4970,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { + /* VFS does not allow setting these so must be corruption */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + EXT4_ERROR_INODE(inode, + "immutable or append flags not allowed on symlinks"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if (ext4_encrypted_inode(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); @@ -5196,7 +5228,7 @@ static int 
ext4_do_update_inode(handle_t *handle, } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = inode_peek_iversion(inode); + u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 769a62708b1c..f7ab34088162 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, "freeing block already freed " "(bit %u)", first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -747,10 +749,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); @@ -1454,12 +1454,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - e4b->bd_info->bb_free); - /* Mark the block group as corrupt. */ - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, - &e4b->bd_info->bb_state); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); mb_regenerate_buddy(e4b); goto done; } @@ -1956,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. 
But bitmap says 0", free); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -1966,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2254,7 +2254,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2265,7 +2265,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2277,7 +2277,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2330,34 +2330,13 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static const struct seq_operations ext4_mb_seq_groups_ops = { +const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; -static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE_DATA(inode); - int rc; - - rc = seq_open(file, &ext4_mb_seq_groups_ops); - if (rc == 0) { - struct seq_file *m 
= file->private_data; - m->private = sb; - } - return rc; - -} - -const struct file_operations ext4_seq_mb_groups_fops = { - .open = ext4_mb_seq_groups_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2444,7 +2423,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, * initialize bb_free to be able to skip * empty groups without initialization */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { @@ -2537,8 +2517,7 @@ static void ext4_groupinfo_destroy_slabs(void) int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { - if (ext4_groupinfo_caches[i]) - kmem_cache_destroy(ext4_groupinfo_caches[i]); + kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } @@ -3011,7 +2990,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, #endif ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b1f21e3a0763..2a4c25c4681d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2411,8 +2411,7 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } drop_nlink(inode); @@ -2651,8 +2650,7 @@ out_clear_inode: err = 
ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); @@ -3675,7 +3673,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec ctime; + struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 9ffa6fad18db..19b87a8de6ff 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -77,7 +77,7 @@ static void mpage_end_io(struct bio *bio) if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { - fscrypt_decrypt_bio_pages(bio->bi_private, bio); + fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); return; } } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6bec270a8e4..e5fb38451a73 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -204,12 +204,14 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) goto out2; flex_gd->count = flexbg_size; - flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * - flexbg_size, GFP_NOFS); + flex_gd->groups = kmalloc_array(flexbg_size, + sizeof(struct ext4_new_group_data), + GFP_NOFS); if (flex_gd->groups == NULL) goto out2; - flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16), + GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; @@ -969,7 +971,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, int res, i; int err; - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + primary = kmalloc_array(reserved_gdb, sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; @@ -1933,7 +1935,7 @@ retry: return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); - if (n_group > (0xFFFFFFFFUL / 
EXT4_INODES_PER_GROUP(sb))) { + if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb104e8476f0..ba2396a7bd04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -405,6 +405,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) static void ext4_handle_error(struct super_block *sb) { + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + if (sb_rdonly(sb)) return; @@ -740,6 +743,9 @@ __acquires(bitlock) va_end(args); } + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + if (test_opt(sb, ERRORS_CONT)) { ext4_commit_super(sb, 0); return; @@ -763,6 +769,36 @@ __acquires(bitlock) return; } +void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t group, + unsigned int flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && + !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + } + + if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && + !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp) { + int count; + + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + } +} + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -1237,19 +1273,13 @@ static bool ext4_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } -static unsigned ext4_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? 
inode->i_sb->s_blocksize : - EXT4_NAME_LEN; -} - static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, .empty_dir = ext4_empty_dir, - .max_namelen = ext4_max_namelen, + .max_namelen = EXT4_NAME_LEN, }; #endif @@ -1347,7 +1377,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, + Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1414,6 +1445,8 @@ static const match_table_t tokens = { {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, + {Opt_warn_on_error, "warn_on_error"}, + {Opt_nowarn_on_error, "nowarn_on_error"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, @@ -1578,6 +1611,8 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, + {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, @@ -2116,12 +2151,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int res = 0; + int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = SB_RDONLY; + err = 
-EROFS; } if (read_only) goto done; @@ -2154,7 +2189,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); - ext4_commit_super(sb, 1); + err = ext4_commit_super(sb, 1); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " @@ -2166,7 +2201,7 @@ done: sbi->s_mount_opt, sbi->s_mount_opt2); cleancache_init_fs(sb); - return res; + return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) @@ -2307,6 +2342,7 @@ static int ext4_check_descriptors(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; + ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0) + 1; ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; @@ -2339,6 +2375,14 @@ static int ext4_check_descriptors(struct super_block *sb, if (!sb_rdonly(sb)) return 0; } + if (block_bitmap >= sb_block + 1 && + block_bitmap <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2353,6 +2397,14 @@ static int ext4_check_descriptors(struct super_block *sb, if (!sb_rdonly(sb)) return 0; } + if (inode_bitmap >= sb_block + 1 && + inode_bitmap <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2367,6 +2419,14 @@ static int ext4_check_descriptors(struct super_block *sb, if 
(!sb_rdonly(sb)) return 0; } + if (inode_table >= sb_block + 1 && + inode_table <= last_bg_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "block group descriptors", i); + if (!sb_rdonly(sb)) + return 0; + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -3073,13 +3133,22 @@ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; + if (!ext4_has_group_desc_csum(sb)) + return ngroups; + for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + continue; + if (group != 0) break; + ext4_error(sb, "Inode table for bg 0 marked as " + "needing zeroing"); + if (sb_rdonly(sb)) + return ngroups; } return group; @@ -3718,6 +3787,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) le32_to_cpu(es->s_log_block_size)); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { ext4_msg(sb, KERN_ERR, @@ -3732,8 +3808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " that may contain inline data"); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device. 
Turning off DAX."); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; @@ -3783,6 +3858,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { @@ -3859,13 +3939,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "block size (%d)", clustersize, blocksize); goto failed_mount; } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; - } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = @@ -3886,10 +3959,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else { if (clustersize != blocksize) { - ext4_warning(sb, "fragment/cluster size (%d) != " - "block size (%d)", clustersize, - blocksize); - clustersize = blocksize; + ext4_msg(sb, KERN_ERR, + "fragment/cluster size (%d) != " + "block size (%d)", clustersize, blocksize); + goto failed_mount; } if (sbi->s_blocks_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, @@ -3943,6 +4016,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_blocks_count(es)); goto failed_mount; } + if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && + (sbi->s_cluster_ratio == 1)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block is 0 with a 1k block and cluster size"); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + 
EXT4_BLOCKS_PER_GROUP(sb) - 1); @@ -3970,14 +4050,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = kvmalloc(db_count * - sizeof(struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = kvmalloc_array(db_count, + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); ret = -ENOMEM; goto failed_mount; } + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + ret = -EINVAL; + goto failed_mount; + } bgl_lock_init(sbi->s_blockgroup_lock); @@ -4224,8 +4312,12 @@ no_journal: goto failed_mount4; } - if (ext4_setup_super(sb, es, sb_rdonly(sb))) + ret = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (ret == -EROFS) { sb->s_flags |= SB_RDONLY; + ret = 0; + } else if (ret) + goto failed_mount4a; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4709,6 +4801,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* + * The superblock bh should be mapped, but it might not be if the + * device was hot-removed. Not much we can do but fail the I/O. + */ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock @@ -4760,11 +4860,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, REQ_SYNC | (test_opt(sb, BARRIER) ? 
REQ_FUA : 0)); - if (error) - return error; - - error = buffer_write_io_error(sbh); - if (error) { + if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); @@ -5165,8 +5261,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~SB_RDONLY; + + err = ext4_setup_super(sb, es, 0); + if (err) + goto restore_opts; + + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5190,8 +5290,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) - ext4_commit_super(sb, 1); + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb, 1); + if (err) + goto restore_opts; + } #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -5252,7 +5355,8 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bsoftlimit : dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? 
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9ebd26c957c2..f34da0bb8f17 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -346,39 +346,9 @@ static struct kobject *ext4_root; static struct kobject *ext4_feat; -#define PROC_FILE_SHOW_DEFN(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ -} \ -\ -static const struct file_operations ext4_seq_##name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - -#define PROC_FILE_LIST(name) \ - { __stringify(name), &ext4_seq_##name##_fops } - -PROC_FILE_SHOW_DEFN(es_shrinker_info); -PROC_FILE_SHOW_DEFN(options); - -static const struct ext4_proc_files { - const char *name; - const struct file_operations *fops; -} proc_files[] = { - PROC_FILE_LIST(options), - PROC_FILE_LIST(es_shrinker_info), - PROC_FILE_LIST(mb_groups), - { NULL, NULL }, -}; - int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; int err; init_completion(&sbi->s_kobj_unregister); @@ -392,11 +362,14 @@ int ext4_register_sysfs(struct super_block *sb) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - proc_create_data(p->name, S_IRUGO, sbi->s_proc, - p->fops, sb); + proc_create_single_data("options", S_IRUGO, sbi->s_proc, + ext4_seq_options_show, sb); + proc_create_single_data("es_shrinker_info", S_IRUGO, + sbi->s_proc, ext4_seq_es_shrinker_info_show, + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); } return 0; } @@ -404,13 +377,9 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - remove_proc_entry(p->name, 
sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } + if (sbi->s_proc) + remove_proc_subtree(sb->s_id, ext4_proc_root); kobject_del(&sbi->s_kobj); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 499cb4b1fbd2..723df14f4084 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -230,12 +230,12 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, { int error = -EFSCORRUPTED; - if (buffer_verified(bh)) - return 0; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) goto errout; + if (buffer_verified(bh)) + return 0; + error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) goto errout; @@ -1560,7 +1560,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, handle_t *handle, struct inode *inode, bool is_block) { - struct ext4_xattr_entry *last; + struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; @@ -1595,7 +1595,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* Compute min_offs and last. */ last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + for (; !IS_LAST_ENTRY(last); last = next) { + next = EXT4_XATTR_NEXT(last); + if ((void *)next >= s->end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + ret = -EFSCORRUPTED; + goto out; + } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) @@ -1688,7 +1694,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_offs) { + if (!s->not_found && here->e_value_size && here->e_value_offs) { /* Remove the old value. 
*/ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); @@ -2206,23 +2212,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) { - if (error == -ENOSPC && - ext4_has_inline_data(inode)) { - error = ext4_try_to_evict_inline_data(handle, inode, - EXT4_XATTR_LEN(strlen(i->name) + - EXT4_XATTR_SIZE(i->value_len))); - if (error) - return error; - error = ext4_xattr_ibody_find(inode, i, is); - if (error) - return error; - error = ext4_xattr_set_entry(i, s, handle, inode, - false /* is_block */); - } - if (error) - return error; - } + if (error) + return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2651,6 +2642,11 @@ static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + /* never move system.data out of the inode */ + if ((last->e_name_len == 4) && + (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && + !memcmp(last->e_name, "data", 4)) + continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 629001b28632..197a9d8a15ef 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, 0); + xattr->value_len, XATTR_CREATE); if (err < 0) break; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bf779461df13..9f1c96caebda 100644 --- a/fs/f2fs/checkpoint.c +++ 
b/fs/f2fs/checkpoint.c @@ -24,7 +24,7 @@ #include <trace/events/f2fs.h> static struct kmem_cache *ino_entry_slab; -struct kmem_cache *inode_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { @@ -36,7 +36,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) /* * We guarantee no failure on the returned page. */ -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; @@ -100,24 +100,27 @@ repeat: * readonly and make sure do not write checkpoint with non-uptodate * meta page. */ - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + memset(page_address(page), 0, PAGE_SIZE); f2fs_stop_checkpoint(sbi, false); + } out: return page; } -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, true); } /* for POR only */ -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -151,7 +154,7 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { struct page *page; @@ -173,7 +176,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if 
(!is_valid_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) goto out; switch (type) { @@ -217,7 +220,7 @@ out: return blkno - start; } -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; bool readahead = false; @@ -228,7 +231,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -249,7 +252,7 @@ static int __f2fs_write_meta_page(struct page *page, if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - write_meta_page(sbi, page, io_type); + f2fs_do_write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -294,7 +297,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -305,7 +308,7 @@ skip_write: return 0; } -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); @@ -382,7 +385,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -455,20 +458,20 @@ 
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, 0, type); } -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); } /* mode should be APPEND_INO or UPDATE_INO */ -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; struct ino_entry *e; @@ -479,7 +482,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi, bool all) +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; @@ -498,13 +501,13 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { __add_ino_entry(sbi, ino, devidx, type); } -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; @@ -519,7 +522,7 @@ bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, return is_dirty; } -int acquire_orphan_inode(struct f2fs_sb_info *sbi) +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; int err = 0; @@ -542,7 +545,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) return err; } -void release_orphan_inode(struct f2fs_sb_info *sbi) +void 
f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -552,14 +555,14 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct inode *inode) +void f2fs_add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); - update_inode_page(inode); + f2fs_update_inode_page(inode); } -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* remove orphan entry from orphan list */ __remove_ino_entry(sbi, ino, ORPHAN_INO); @@ -569,7 +572,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) goto err_out; @@ -587,16 +590,17 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) } err = dquot_initialize(inode); - if (err) + if (err) { + iput(inode); goto err_out; + } - dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ iput(inode); - get_node_info(sbi, ino, &ni); + f2fs_get_node_info(sbi, ino, &ni); /* ENOMEM was fully retried in f2fs_evict_inode. 
*/ if (ni.blk_addr != NULL_ADDR) { @@ -614,7 +618,7 @@ err_out: return err; } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; @@ -642,10 +646,10 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = get_meta_page(sbi, start_blk + i); + struct page *page = f2fs_get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; orphan_blk = (struct f2fs_orphan_block *)page_address(page); @@ -695,7 +699,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = grab_meta_page(sbi, start_blk++); + page = f2fs_grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -737,7 +741,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, size_t crc_offset = 0; __u32 crc = 0; - *cp_page = get_meta_page(sbi, cp_addr); + *cp_page = f2fs_get_meta_page(sbi, cp_addr); *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); @@ -790,7 +794,7 @@ invalid_cp1: return NULL; } -int get_valid_checkpoint(struct f2fs_sb_info *sbi) +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; @@ -802,7 +806,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * 
blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -834,7 +839,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) memcpy(sbi->ckpt, cp_block, blk_size); /* Sanity checking of checkpoint */ - if (sanity_check_ckpt(sbi)) + if (f2fs_sanity_check_ckpt(sbi)) goto free_fail_no_cp; if (cur_page == cp1) @@ -853,7 +858,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) void *sit_bitmap_ptr; unsigned char *ckpt = (unsigned char *)sbi->ckpt; - cur_page = get_meta_page(sbi, cp_blk_no + i); + cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); @@ -898,7 +903,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type) stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void update_dirty_page(struct inode *inode, struct page *page) +void f2fs_update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; @@ -917,7 +922,7 @@ void update_dirty_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); } -void remove_dirty_inode(struct inode *inode) +void f2fs_remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; @@ -934,7 +939,7 @@ void remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; struct inode *inode; @@ -1017,7 +1022,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) /* it's on eviction */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode_page(inode); + f2fs_update_inode_page(inode); iput(inode); } } @@ -1057,7 +1062,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; cond_resched(); @@ -1085,7 +1090,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_inc(&sbi->wb_sync_req[NODE]); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1179,10 +1186,10 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* * pagevec_lookup_tag and lock_page again will take - * some extra time. Therefore, update_meta_pages and - * sync_meta_pages are combined in this function. + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. 
*/ - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; memcpy(page_address(page), src, PAGE_SIZE); @@ -1220,7 +1227,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1229,7 +1236,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * modify checkpoint * version number is already updated */ - ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1249,7 +1256,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* 2 cp + n data seg summary + orphan inode blocks */ - data_sum_blocks = npages_for_summary_flush(sbi, false); + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); @@ -1294,22 +1301,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) - update_meta_page(sbi, nm_i->nat_bits + + f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } } /* write out checkpoint buffer at block 0 */ - update_meta_page(sbi, ckpt, start_blk++); + f2fs_update_meta_page(sbi, 
ckpt, start_blk++); for (i = 1; i < 1 + cp_payload_blks; i++) - update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, start_blk++); if (orphan_num) { @@ -1317,7 +1325,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += orphan_blocks; } - write_data_summaries(sbi, start_blk); + f2fs_write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; /* Record write statistics in the hot node summary */ @@ -1328,7 +1336,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { - write_node_summaries(sbi, start_blk); + f2fs_write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -1337,7 +1345,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1354,7 +1362,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); - release_ino_entry(sbi, false); + f2fs_release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; @@ -1379,7 +1387,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* * We guarantee that this checkpoint procedure will not fail. 
*/ -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; @@ -1412,7 +1420,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { - if (!exist_trim_candidates(sbi, cpc)) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; } @@ -1420,8 +1428,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); goto out; } @@ -1436,15 +1444,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi, cpc); - flush_sit_entries(sbi, cpc); + f2fs_flush_nat_entries(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); if (err) - release_discard_addrs(sbi); + f2fs_release_discard_addrs(sbi); else - clear_prefree_segments(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1461,7 +1469,7 @@ out: return err; } -void init_ino_entry_info(struct f2fs_sb_info *sbi) +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) { int i; @@ -1479,23 +1487,23 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) F2FS_ORPHANS_PER_BLOCK; } -int __init create_checkpoint_caches(void) +int __init f2fs_create_checkpoint_caches(void) { ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", sizeof(struct ino_entry)); if 
(!ino_entry_slab) return -ENOMEM; - inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", sizeof(struct inode_entry)); - if (!inode_entry_slab) { + if (!f2fs_inode_entry_slab) { kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; } -void destroy_checkpoint_caches(void) +void f2fs_destroy_checkpoint_caches(void) { kmem_cache_destroy(ino_entry_slab); - kmem_cache_destroy(inode_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 02237d4d91f5..8f931d699287 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,8 +19,6 @@ #include <linux/bio.h> #include <linux/prefetch.h> #include <linux/uio.h> -#include <linux/mm.h> -#include <linux/memcontrol.h> #include <linux/cleancache.h> #include <linux/sched/signal.h> @@ -30,6 +28,11 @@ #include "trace.h" #include <trace/events/f2fs.h> +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -45,16 +48,84 @@ static bool __is_cp_guaranteed(struct page *page) if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || + (S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_ATOMIC_FILE)) || is_cold_data(page)) return true; return false; } -static void f2fs_read_end_io(struct bio *bio) +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { - struct bio_vec *bvec; + struct page *page; + struct bio_vec *bv; int i; + bio_for_each_segment_all(bv, bio, i) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed 
*/ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool f2fs_bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; +} + +static void f2fs_read_end_io(struct bio *bio) +{ #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); @@ -62,28 +133,15 @@ static void f2fs_read_end_io(struct bio *bio) } #endif - if (f2fs_bio_encrypted(bio)) { - if (bio->bi_status) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_decrypt_bio_pages(bio->bi_private, bio); - return; - } - } - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; + if (f2fs_bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; - if (!bio->bi_status) { - if (!PageUptodate(page)) - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_put(bio); + + __read_end_io(bio); } static void f2fs_write_end_io(struct bio *bio) @@ -189,7 +247,7 @@ static struct 
bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp); } if (wbc) wbc_init_bio(wbc, bio); @@ -404,13 +462,12 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_write(struct f2fs_io_info *fio) +void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; - int err = 0; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -420,7 +477,7 @@ next: spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); - goto out_fail; + goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); @@ -428,7 +485,7 @@ next: spin_unlock(&io->io_lock); } - if (fio->old_blkaddr != NEW_ADDR) + if (is_valid_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -447,9 +504,9 @@ alloc_new: if (io->bio == NULL) { if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { - err = -EAGAIN; dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - goto out_fail; + fio->retry = true; + goto skip; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, BIO_MAX_PAGES, false, @@ -469,41 +526,44 @@ alloc_new: f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); - +skip: if (fio->in_list) goto next; -out_fail: +out: up_write(&io->io_rwsem); - return err; } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; struct bio *bio; - - if (f2fs_encrypted_file(inode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* 
wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); - } + struct bio_post_read_ctx *ctx; + unsigned int post_read_steps = 0; bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + if (!bio) return ERR_PTR(-ENOMEM); - } f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (f2fs_encrypted_file(inode)) + post_read_steps |= 1 << STEP_DECRYPT; + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) { + bio_put(bio); + return ERR_PTR(-ENOMEM); + } + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + return bio; } @@ -544,7 +604,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) * ->node_page * update block addresses in the node page */ -void set_data_blkaddr(struct dnode_of_data *dn) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true); __set_data_blkaddr(dn); @@ -555,12 +615,12 @@ void set_data_blkaddr(struct dnode_of_data *dn) void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); f2fs_update_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; @@ -594,12 +654,12 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) } /* Should keep dn->ofs_in_node unchanged */ -int reserve_new_block(struct dnode_of_data *dn) +int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = 
dn->ofs_in_node; int ret; - ret = reserve_new_blocks(dn, 1); + ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } @@ -609,12 +669,12 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) bool need_put = dn->inode_page ? false : true; int err; - err = get_dnode_of_data(dn, index, ALLOC_NODE); + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) - err = reserve_new_block(dn); + err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; @@ -633,7 +693,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; @@ -652,7 +712,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) goto put_err; f2fs_put_dnode(&dn); @@ -671,7 +731,8 @@ got_it: * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. - * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + * see, f2fs_add_link -> f2fs_get_new_data_page -> + * f2fs_init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); @@ -691,7 +752,7 @@ put_err: return ERR_PTR(err); } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -701,7 +762,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -721,13 +782,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -753,7 +814,7 @@ repeat: * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. 
*/ -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; @@ -792,7 +853,7 @@ struct page *get_new_data_page(struct inode *inode, /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } @@ -824,15 +885,15 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) f2fs_i_size_write(dn->inode, @@ -870,7 +931,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); + map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); flag = f2fs_force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -960,7 +1021,7 @@ next_dnode: /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; @@ -968,10 +1029,10 @@ next_dnode: err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); if (map->m_next_extent) *map->m_next_extent = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); } goto unlock_out; } @@ -984,7 +1045,7 @@ next_dnode: next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (!is_valid_blkaddr(blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1057,7 +1118,7 @@ skip: (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; - err = reserve_new_blocks(&dn, prealloc); + err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; @@ -1176,7 +1237,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, { return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, - rw_hint_to_seg_type( + f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } @@ -1221,7 +1282,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1248,7 +1309,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, xnid, &ni); + f2fs_get_node_info(sbi, xnid, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1525,7 +1586,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) if 
(!f2fs_encrypted_file(inode)) return 0; - /* wait for GCed encrypted page writeback */ + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: @@ -1552,12 +1613,12 @@ static inline bool check_inplace_update_policy(struct inode *inode, if (policy & (0x1 << F2FS_IPU_FORCE)) return true; - if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) return true; if (policy & (0x1 << F2FS_IPU_UTIL) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; @@ -1578,7 +1639,7 @@ static inline bool check_inplace_update_policy(struct inode *inode, return false; } -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { if (f2fs_is_pinned_file(inode)) return true; @@ -1590,7 +1651,7 @@ bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return check_inplace_update_policy(inode, fio); } -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1613,22 +1674,13 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (should_update_outplace(inode, fio)) + if (f2fs_should_update_outplace(inode, fio)) return false; - return should_update_inplace(inode, fio); + return f2fs_should_update_inplace(inode, fio); } -static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) -{ - if (fio->old_blkaddr == NEW_ADDR) - return false; - if (fio->old_blkaddr == NULL_ADDR) - return false; - return true; -} - -int do_write_data_page(struct f2fs_io_info 
*fio) +int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; @@ -1642,7 +1694,7 @@ int do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (valid_ipu_blkaddr(fio)) { + if (is_valid_blkaddr(fio->old_blkaddr)) { ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; @@ -1653,7 +1705,7 @@ int do_write_data_page(struct f2fs_io_info *fio) if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) goto out; @@ -1669,16 +1721,18 @@ got_it: * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); + ClearPageError(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); - err = rewrite_data_page(fio); + err = f2fs_inplace_write_data(fio); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -1697,9 +1751,10 @@ got_it: goto out_writepage; set_page_writeback(page); + ClearPageError(page); /* LFS mode write path */ - write_data_page(&dn, fio); + f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -1745,6 +1800,12 @@ static int __write_data_page(struct page *page, bool *submitted, /* we should bypass data pages to proceed the kworkder jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. 
+ */ + if (S_ISDIR(inode->i_mode)) + goto redirty_out; goto out; } @@ -1769,13 +1830,13 @@ write: /* we should not write 0'th page having journal header */ if (f2fs_is_volatile_file(inode) && (!page->index || (!wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)))) + f2fs_available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); goto done; } @@ -1794,10 +1855,10 @@ write: } if (err == -EAGAIN) { - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { fio.need_lock = LOCK_REQ; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); } } @@ -1822,7 +1883,7 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); submitted = NULL; } @@ -1842,7 +1903,13 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); - if (!err) + /* + * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. 
+ */ + if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; @@ -1866,6 +1933,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; struct pagevec pvec; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -1919,6 +1987,13 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[DATA]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + done_index = page->index; retry_write: lock_page(page); @@ -1973,9 +2048,7 @@ continue_unlock: last_idx = page->index; } - /* give a priority to WB_SYNC threads */ - if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || - --wbc->nr_to_write <= 0) && + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; @@ -2001,7 +2074,7 @@ continue_unlock: return ret; } -int __f2fs_write_data_pages(struct address_space *mapping, +static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { @@ -2024,7 +2097,7 @@ int __f2fs_write_data_pages(struct address_space *mapping, if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && - available_free_memory(sbi, DIRTY_DENTS)) + f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing during file defragment */ @@ -2035,8 +2108,8 @@ int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) - atomic_inc(&sbi->wb_sync_req); - else if (atomic_read(&sbi->wb_sync_req)) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; blk_start_plug(&plug); @@ -2044,13 +2117,13 @@ int __f2fs_write_data_pages(struct address_space *mapping, 
blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) - atomic_dec(&sbi->wb_sync_req); + atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); return ret; skip_write: @@ -2077,7 +2150,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > i_size) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -2109,7 +2182,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, } restart: /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -2119,7 +2192,7 @@ restart: if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { - read_inline_data(page, ipage); + f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_inline_node(ipage); @@ -2137,7 +2210,7 @@ restart: dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, @@ -2174,7 +2247,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); if (f2fs_is_atomic_file(inode) && - !available_free_memory(sbi, INMEM_PAGES)) { + !f2fs_available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; drop_atomic = true; goto fail; @@ -2222,8 +2295,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page 
writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) @@ -2258,7 +2331,7 @@ fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); if (drop_atomic) - drop_inmem_pages_all(sbi); + f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -2333,17 +2406,17 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { if (iocb->ki_flags & IOCB_NOWAIT) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); } err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - up_read(&F2FS_I(inode)->dio_rwsem[rw]); + up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) @@ -2380,13 +2453,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } } /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return drop_inmem_page(inode, page); + return f2fs_drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2407,35 +2480,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 1; } -/* - * This was copied from __set_page_dirty_buffers which gives higher performance - * in very high speed storages. 
(e.g., pmem) - */ -void f2fs_set_page_dirty_nobuffers(struct page *page) -{ - struct address_space *mapping = page->mapping; - unsigned long flags; - - if (unlikely(!mapping)) - return; - - spin_lock(&mapping->private_lock); - lock_page_memcg(page); - SetPageDirty(page); - spin_unlock(&mapping->private_lock); - - xa_lock_irqsave(&mapping->i_pages, flags); - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); - unlock_page_memcg(page); - - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return; -} - static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -2448,7 +2492,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { - register_inmem_page(inode, page); + f2fs_register_inmem_page(inode, page); return 1; } /* @@ -2459,8 +2503,8 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); - update_dirty_page(inode, page); + __set_page_dirty_nobuffers(page); + f2fs_update_dirty_page(inode, page); return 1; } return 0; @@ -2555,3 +2599,38 @@ const struct address_space_operations f2fs_dblock_aops = { .migratepage = f2fs_migrate_page, #endif }; + +void f2fs_clear_radix_tree_dirty_tag(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + 
mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void __exit f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a66107b5cfff..2d65e77ae5cf 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -104,6 +104,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; + si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; + si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -342,6 +344,10 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); + seq_printf(s, "Skipped : atomic write %llu (%llu)\n", + si->skipped_atomic_files[BG_GC] + + si->skipped_atomic_files[FG_GC], + si->skipped_atomic_files[BG_GC]); seq_puts(s, "\nExtent Cache:\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8c9c2f31b253..7f955c4e86a4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,12 +60,12 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) { de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -unsigned char get_de_type(struct f2fs_dir_entry *de) +unsigned char f2fs_get_de_type(struct 
f2fs_dir_entry *de) { if (de->file_type < F2FS_FT_MAX) return f2fs_filetype_table[de->file_type]; @@ -97,14 +97,14 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - de = find_target_dentry(fname, namehash, max_slots, &d); + de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; return de; } -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { @@ -171,7 +171,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; @@ -210,7 +210,7 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, if (f2fs_has_inline_dentry(dir)) { *res_page = NULL; - de = find_in_inline_dir(dir, fname, res_page); + de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } @@ -319,7 +319,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { struct qstr dot = QSTR_INIT(".", 1); @@ -340,23 +340,23 @@ static int make_empty_dir(struct inode *inode, struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) - return make_empty_inline_dir(inode, parent, page); + return f2fs_make_empty_inline_dir(inode, parent, page); - dentry_page = get_new_data_page(inode, page, 0, true); + dentry_page = f2fs_get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return 
PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; } -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage) { @@ -365,7 +365,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { - page = new_inode_page(inode); + page = f2fs_new_inode_page(inode); if (IS_ERR(page)) return page; @@ -395,7 +395,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } @@ -418,19 +418,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * we should remove this inode from orphan list. 
*/ if (inode->i_nlink == 0) - remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } return page; put_error: clear_nlink(inode); - update_inode(inode, page); + f2fs_update_inode(inode, page); f2fs_put_page(page, 1); return ERR_PTR(err); } -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -448,7 +448,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_INC_LINK); } -int room_for_filename(const void *bitmap, int slots, int max_slots) +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; @@ -537,12 +537,12 @@ start: (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = get_new_data_page(dir, NULL, block, true); + dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; @@ -558,7 +558,7 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -576,7 +576,7 @@ add_dentry: f2fs_put_page(page, 1); } - update_parent_metadata(dir, inode, current_depth); + f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -586,7 +586,7 @@ fail: return err; } -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int 
f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode) { struct qstr new_name; @@ -610,7 +610,7 @@ int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; @@ -639,7 +639,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } fscrypt_free_filename(&fname); return err; @@ -651,7 +651,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL, NULL); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -683,9 +683,9 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } /* @@ -698,14 +698,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - struct address_space *mapping = page_mapping(page); - unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return 
f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -731,17 +729,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && - !truncate_hole(dir, page->index, page->index + 1)) { - xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); - + !f2fs_truncate_hole(dir, page->index, page->index + 1)) { + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); - remove_dirty_inode(dir); + f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -758,7 +752,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx, false); + dentry_page = f2fs_get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -806,7 +800,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - d_type = get_de_type(de); + d_type = f2fs_get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -830,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, return 1; if (sbi->readdir_ra == 1) - ra_node_page(sbi, le32_to_cpu(de->ino)); + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; @@ -880,7 +874,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = get_lock_data_page(inode, n, false); + dentry_page = f2fs_get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { 
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index d5a861bf2b42..231b77ef5a53 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -49,7 +49,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -61,7 +61,7 @@ struct rb_entry *__lookup_rb_tree(struct rb_root *root, return re; } -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs) { @@ -92,7 +92,7 @@ struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, @@ -159,7 +159,7 @@ lookup_neighbors: return re; } -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root) { #ifdef CONFIG_F2FS_CHECK_FS @@ -390,7 +390,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = (struct extent_node *)__lookup_rb_tree(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, (struct rb_entry *)et->cached_en, pgofs); if (!en) goto out; @@ -470,7 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -520,7 +520,7 @@ static void 
f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, @@ -773,7 +773,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) else blkaddr = dn->data_blkaddr; - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } @@ -788,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } -void init_extent_cache_info(struct f2fs_sb_info *sbi) +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); mutex_init(&sbi->extent_tree_lock); @@ -800,7 +800,7 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->total_ext_node, 0); } -int __init create_extent_cache(void) +int __init f2fs_create_extent_cache(void) { extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", sizeof(struct extent_tree)); @@ -815,7 +815,7 @@ int __init create_extent_cache(void) return 0; } -void destroy_extent_cache(void) +void f2fs_destroy_extent_cache(void) { kmem_cache_destroy(extent_node_slab); kmem_cache_destroy(extent_tree_slab); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1df7f10476d6..4d8b1de83143 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -176,15 +176,13 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2048 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << 
(sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -285,6 +283,7 @@ enum { struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ @@ -620,15 +619,20 @@ enum { #define DEF_DIR_LEVEL 0 +enum { + GC_FAILURE_PIN, + GC_FAILURE_ATOMIC, + MAX_GC_FAILURE +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - union { - unsigned int i_current_depth; /* only for directory depth */ - unsigned short i_gc_failures; /* only for regular file */ - }; + unsigned int i_current_depth; /* only for directory depth */ + /* for gc failure statistic */ + unsigned int i_gc_failures[MAX_GC_FAILURE]; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -656,7 +660,9 @@ struct f2fs_inode_info { struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ - struct rw_semaphore dio_rwsem[2];/* avoid 
racing between dio and gc */ + + /* avoid racing between foreground op and gc */ + struct rw_semaphore i_gc_rwsem[2]; struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ @@ -694,7 +700,8 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front) { - return back->lstart + back->len == front->lstart; + return (back->lstart + back->len == front->lstart) && + (back->len + front->len < DEF_MAX_DISCARD_LEN); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, @@ -1005,6 +1012,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_meta; /* indicate borrow meta inode mapping or not */ + bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; @@ -1067,6 +1075,13 @@ enum { }; enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_URGENT, +}; + +enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ WHINT_MODE_FS, /* pass down hints with F2FS policy */ @@ -1080,6 +1095,7 @@ enum { enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -1113,6 +1129,8 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ + /* keep migration IO order for LFS mode */ + struct rw_semaphore io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1183,7 +1201,7 @@ struct f2fs_sb_info { struct percpu_counter 
alloc_valid_block_count; /* writeback control */ - atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -1194,9 +1212,9 @@ struct f2fs_sb_info { struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ - - /* threshold for converting bg victims for fg */ - u64 fggc_threshold; + unsigned int gc_mode; /* current GC state */ + /* for skip statistic */ + unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; @@ -1587,18 +1605,6 @@ static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) } /* - * Check whether the given nid is within node id range. - */ -static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) -{ - if (unlikely(nid < F2FS_ROOT_INO(sbi))) - return -EINVAL; - if (unlikely(nid >= NM_I(sbi)->max_nid)) - return -EINVAL; - return 0; -} - -/* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) @@ -1614,7 +1620,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) } static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool cap) { if (!inode) return true; @@ -1627,7 +1633,7 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; - if (capable(CAP_SYS_RESOURCE)) + if (cap && capable(CAP_SYS_RESOURCE)) return true; return false; } @@ -1662,7 +1668,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, 
true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1869,7 +1875,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { @@ -2156,9 +2162,60 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) -#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) +/* + * Inode flags + */ +#define F2FS_SECRM_FL 0x00000001 /* Secure deletion */ +#define F2FS_UNRM_FL 0x00000002 /* Undelete */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define F2FS_DIRTY_FL 0x00000100 +#define F2FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define F2FS_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define F2FS_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_IMAGIC_FL 0x00002000 /* AFS directory */ +#define F2FS_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define F2FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define F2FS_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (F2FS_SYNC_FL | \ + F2FS_IMMUTABLE_FL | \ + F2FS_APPEND_FL | \ + F2FS_NODUMP_FL | \ + F2FS_NOATIME_FL | \ + F2FS_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\ + F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\ + F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\ + F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\ + F2FS_PROJINHERIT_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { @@ -2201,6 +2258,7 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2299,7 +2357,7 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { - F2FS_I(inode)->i_gc_failures = count; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; f2fs_mark_inode_dirty_sync(inode, true); } @@ -2460,6 +2518,7 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + struct timespec ts; bool ret; if (dsync) { @@ -2475,11 +2534,14 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + ts = timespec64_to_timespec(inode->i_atime); + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &ts)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + ts = timespec64_to_timespec(inode->i_ctime); + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + ts = timespec64_to_timespec(inode->i_mtime); + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) return false; if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) @@ -2568,7 +2630,7 @@ static inline int get_inline_xattr_addrs(struct inode *inode) 
return F2FS_I(inode)->i_inline_xattr_size; } -#define get_inode_mode(i) \ +#define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2607,18 +2669,25 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +static inline bool is_valid_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return false; + return true; +} + /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void truncate_data_blocks(struct dnode_of_data *dn); -int truncate_blocks(struct inode *inode, u64 from, bool lock); +void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2632,38 +2701,37 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -void update_inode(struct inode *inode, struct page *node_page); -void update_inode_page(struct inode *inode); +int 
f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); -void handle_failed_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode); /* * namei.c */ -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *de, umode_t mode); -unsigned char get_de_type(struct f2fs_dir_entry *de); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage); -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); -int room_for_filename(const void *bitmap, int slots, int max_slots); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct 
inode *dir, struct fscrypt_name *fname, struct page **res_page); @@ -2680,9 +2748,9 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode); @@ -2691,7 +2759,7 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } @@ -2706,7 +2774,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); -int sanity_check_ckpt(struct f2fs_sb_info *sbi); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c @@ -2720,179 +2788,183 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *sbi, int type); -int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t 
pgofs); -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); -int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode); -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); -int remove_inode_page(struct inode *inode); -struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); -struct page *get_node_page_ra(struct page *parent, int start); -void move_node_page(struct page *node_page, int gc_type); -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_remove_inode_page(struct inode *inode); +struct page *f2fs_new_inode_page(struct inode *inode); +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *f2fs_get_node_page_ra(struct page *parent, int start); +void f2fs_move_node_page(struct page *node_page, int gc_type); +int 
f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page); -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct page *page); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_node_manager(struct f2fs_sb_info *sbi); -void destroy_node_manager(struct f2fs_sb_info *sbi); -int __init create_node_manager_caches(void); -void destroy_node_manager_caches(void); +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct 
f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); /* * segment.c */ -bool need_SSR(struct f2fs_sb_info *sbi); -void register_inmem_page(struct inode *inode, struct page *page); -void drop_inmem_pages_all(struct f2fs_sb_info *sbi); -void drop_inmem_pages(struct inode *inode); -void drop_inmem_page(struct inode *inode, struct page *page); -int commit_inmem_pages(struct inode *inode); +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +void f2fs_register_inmem_page(struct inode *inode, struct page *page); +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); +void f2fs_drop_inmem_pages(struct inode *inode); +void f2fs_drop_inmem_page(struct inode *inode, struct page *page); +int f2fs_commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); -int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); -bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void init_discard_policy(struct discard_policy *dpolicy, int discard_type, - unsigned int granularity); -void drop_discard_cmd(struct f2fs_sb_info *sbi); -void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); -void clear_prefree_segments(struct 
f2fs_sb_info *sbi, struct cp_control *cpc); -void release_discard_addrs(struct f2fs_sb_info *sbi); -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void allocate_new_segments(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type); -void write_node_page(unsigned int nid, struct f2fs_io_info *fio); -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -int rewrite_data_page(struct f2fs_io_info *fio); -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, 
unsigned char version, bool recover_curseg, bool recover_newaddr); -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_segment_manager(struct f2fs_sb_info *sbi); -void destroy_segment_manager(struct f2fs_sb_info *sbi); -int __init create_segment_manager_caches(void); -void destroy_segment_manager_caches(void); -int rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, - enum temp_type temp); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); /* * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); -struct page *grab_meta_page(struct 
f2fs_sb_info *sbi, pgoff_t index); -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void release_ino_entry(struct f2fs_sb_info *sbi, bool all); -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); -int 
acquire_orphan_inode(struct f2fs_sb_info *sbi); -void release_orphan_inode(struct f2fs_sb_info *sbi); -void add_orphan_inode(struct inode *inode); -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); -int recover_orphan_inodes(struct f2fs_sb_info *sbi); -int get_valid_checkpoint(struct f2fs_sb_info *sbi); -void update_dirty_page(struct inode *inode, struct page *page); -void remove_dirty_inode(struct inode *inode); -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); -void init_ino_entry_info(struct f2fs_sb_info *sbi); -int __init create_checkpoint_caches(void); -void destroy_checkpoint_caches(void); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); /* * data.c */ +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_write(struct f2fs_io_info *fio); +void 
f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); -void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); -int reserve_new_block(struct dnode_of_data *dn); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); -struct page *find_data_page(struct inode *inode, pgoff_t index); -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); -int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); -void f2fs_set_page_dirty_nobuffers(struct page *page); -int 
__f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc, - enum iostat_type io_type); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -2901,22 +2973,23 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void f2fs_clear_radix_tree_dirty_tag(struct page *page); /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *sbi); -void stop_gc_thread(struct f2fs_sb_info *sbi); -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); -void build_gc_manager(struct f2fs_sb_info *sbi); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); -bool space_for_roll_forward(struct f2fs_sb_info *sbi); +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2954,6 +3027,7 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; + unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -3120,29 +3194,31 @@ extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct 
inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; -extern struct kmem_cache *inode_entry_slab; +extern struct kmem_cache *f2fs_inode_entry_slab; /* * inline.c */ bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); -void read_inline_data(struct page *page, struct page *ipage); -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_write_inline_data(struct inode *inode, struct page *page); -bool recover_inline_data(struct inode *inode, struct page *npage); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page); -int make_empty_inline_dir(struct inode *inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage); int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *dir, struct inode *inode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct page *page, struct inode *dir, + struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); @@ -3163,17 +3239,17 @@ void f2fs_leave_shrinker(struct 
f2fs_sb_info *sbi); /* * extent_cache.c */ -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs); -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force); -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); @@ -3185,9 +3261,9 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned int len); -void init_extent_cache_info(struct f2fs_sb_info *sbi); -int __init create_extent_cache(void); -void destroy_extent_cache(void); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); /* * sysfs.c @@ -3218,9 +3294,13 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) #endif } -static inline bool f2fs_bio_encrypted(struct bio *bio) +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. 
+ */ +static inline bool f2fs_post_read_required(struct inode *inode) { - return bio->bi_private != NULL; + return f2fs_encrypted_file(inode); } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3288,7 +3368,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) { - return (f2fs_encrypted_file(inode) || + return (f2fs_post_read_required(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6b94f19b3fa8..6880c6f78d58 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,19 +33,19 @@ #include "trace.h" #include <trace/events/f2fs.h> -static int f2fs_filemap_fault(struct vm_fault *vmf) +static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&F2FS_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&F2FS_I(inode)->i_mmap_sem); - return err; + return ret; } -static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -95,7 +95,8 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) /* page is wholly or partially inside EOF */ if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { - unsigned offset; + loff_t offset; + offset = i_size_read(inode) & ~PAGE_MASK; zero_user_segment(page, offset, PAGE_SIZE); } @@ -110,8 +111,8 @@ mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: @@ -157,17 +158,18 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode 
*inode) cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) cp_reason = CP_WRONG_PINO; - else if (!space_for_roll_forward(sbi)) + else if (!f2fs_space_for_roll_forward(sbi)) cp_reason = CP_NO_SPC_ROLL; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && - need_dentry_mark(sbi, inode->i_ino) && - exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; return cp_reason; @@ -178,7 +180,7 @@ static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) struct page *i = find_get_page(NODE_MAPPING(sbi), ino); bool ret = false; /* But we need to avoid that there are some inode updates */ - if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) + if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) ret = true; f2fs_put_page(i, 0); return ret; @@ -238,14 +240,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * if there is no written data, don't waste time to write recovery info. 
*/ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && - !exist_written_data(sbi, ino, APPEND_INO)) { + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || - exist_written_data(sbi, ino, UPDATE_INO)) + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } @@ -272,7 +274,9 @@ go_write: goto out; } sync_nodes: - ret = fsync_node_pages(sbi, inode, &wbc, atomic); + atomic_inc(&sbi->wb_sync_req[NODE]); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; @@ -282,7 +286,7 @@ sync_nodes: goto out; } - if (need_inode_block_update(sbi, ino)) { + if (f2fs_need_inode_block_update(sbi, ino)) { f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; @@ -297,21 +301,21 @@ sync_nodes: * given fsync mark. */ if (!atomic) { - ret = wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; } /* once recovery info is written, don't need to tack this */ - remove_ino_entry(sbi, ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic) + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { - remove_ino_entry(sbi, ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - remove_ino_entry(sbi, ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); } f2fs_update_time(sbi, REQ_TIME); out: @@ -352,7 +356,7 @@ static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + is_valid_blkaddr(blkaddr)) return true; break; case SEEK_HOLE: @@ 
-392,13 +396,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = get_next_page_offset(&dn, pgofs); + pgofs = f2fs_get_next_page_offset(&dn, pgofs); continue; } else { goto found; @@ -412,6 +416,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); @@ -486,7 +491,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return dquot_file_open(inode, filp); } -void truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -502,12 +507,13 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); + if (blkaddr == NULL_ADDR) continue; dn->data_blkaddr = NULL_ADDR; - set_data_blkaddr(dn); - invalidate_blocks(sbi, blkaddr); + f2fs_set_data_blkaddr(dn); + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; @@ -519,7 +525,7 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. 
*/ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); @@ -531,15 +537,15 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->ofs_in_node, nr_free); } -void truncate_data_blocks(struct dnode_of_data *dn) +void f2fs_truncate_data_blocks(struct dnode_of_data *dn) { - truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_SIZE - 1); + loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -555,7 +561,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 
0 : PTR_ERR(page); truncate_out: @@ -570,7 +576,7 @@ truncate_out: return 0; } -int truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -589,21 +595,21 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(inode, ipage, from); + f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; @@ -616,13 +622,13 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) f2fs_bug_on(sbi, count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } f2fs_put_dnode(&dn); free_next: - err = truncate_inode_blocks(inode, free_from); + err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) f2fs_unlock_op(sbi); @@ -661,7 +667,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = truncate_blocks(inode, i_size_read(inode), true); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -686,16 +692,16 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } - flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); - if (flags & FS_APPEND_FL) + flags = fi->i_flags & F2FS_FL_USER_VISIBLE; + if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; - if (flags & FS_COMPR_FL) + if (flags & F2FS_COMPR_FL) 
stat->attributes |= STATX_ATTR_COMPRESSED; if (f2fs_encrypted_inode(inode)) stat->attributes |= STATX_ATTR_ENCRYPTED; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; - if (flags & FS_NODUMP_FL) + if (flags & F2FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | @@ -724,14 +730,14 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -811,7 +817,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, get_inode_mode(inode)); + err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); @@ -850,7 +856,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - page = get_new_data_page(inode, NULL, index, false); + page = f2fs_get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); if (IS_ERR(page)) @@ -863,7 +869,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, return 0; } -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +int f2fs_truncate_hole(struct 
inode *inode, pgoff_t pg_start, pgoff_t pg_end) { int err; @@ -872,10 +878,11 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start = get_next_page_offset(&dn, pg_start); + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); continue; } return err; @@ -886,7 +893,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); pg_start += count; @@ -942,7 +949,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end - 1); f2fs_lock_op(sbi); - ret = truncate_hole(inode, pg_start, pg_end); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); up_write(&F2FS_I(inode)->i_mmap_sem); } @@ -960,7 +967,7 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { @@ -977,7 +984,7 @@ next_dnode: for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (!is_checkpointed_data(sbi, *blkaddr)) { + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { f2fs_put_dnode(&dn); @@ -1010,10 +1017,10 @@ static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, continue; set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); if (ret) { 
dec_valid_block_count(sbi, inode, 1); - invalidate_blocks(sbi, *blkaddr); + f2fs_invalidate_blocks(sbi, *blkaddr); } else { f2fs_update_data_blkaddr(&dn, *blkaddr); } @@ -1043,18 +1050,18 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, pgoff_t ilen; set_new_dnode(&dn, dst_inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); if (ret) return ret; - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); ilen = min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, @@ -1077,10 +1084,11 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } else { struct page *psrc, *pdst; - psrc = get_lock_data_page(src_inode, src + i, true); + psrc = f2fs_get_lock_data_page(src_inode, + src + i, true); if (IS_ERR(psrc)) return PTR_ERR(psrc); - pdst = get_new_data_page(dst_inode, NULL, dst + i, + pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, true); if (IS_ERR(pdst)) { f2fs_put_page(psrc, 1); @@ -1091,7 +1099,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - ret = truncate_hole(src_inode, src + i, src + i + 1); + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); if (ret) return ret; i++; @@ -1113,12 +1122,14 @@ static int __exchange_data_block(struct inode *src_inode, olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(block_t) * olen, GFP_KERNEL); + array_size(olen, sizeof(block_t)), + GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(int) * olen, GFP_KERNEL); + array_size(olen, 
sizeof(int)), + GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1144,7 +1155,7 @@ static int __exchange_data_block(struct inode *src_inode, return 0; roll_back: - __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); kvfree(src_blkaddr); kvfree(do_replace); return ret; @@ -1187,7 +1198,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_end = (offset + len) >> PAGE_SHIFT; /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ @@ -1208,12 +1219,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = truncate_blocks(inode, new_size, true); + ret = f2fs_truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1233,7 +1244,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } dn->ofs_in_node = ofs_in_node; - ret = reserve_new_blocks(dn, count); + ret = f2fs_reserve_new_blocks(dn, count); if (ret) return ret; @@ -1242,7 +1253,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); /* - * reserve_new_blocks will not guarantee entire block + * f2fs_reserve_new_blocks will not guarantee entire block * allocation. 
*/ if (dn->data_blkaddr == NULL_ADDR) { @@ -1250,9 +1261,9 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, break; } if (dn->data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn->data_blkaddr); + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); } } @@ -1318,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; @@ -1389,10 +1400,10 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = truncate_blocks(inode, i_size_read(inode), true); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) goto out; @@ -1430,7 +1441,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1473,7 +1484,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, last_off = map.m_lblk + map.m_len - 1; /* update new size to the failed position */ - new_size = (last_off == pg_end) ? offset + len: + new_size = (last_off == pg_end) ? 
offset + len : (loff_t)(last_off + 1) << PAGE_SHIFT; } else { new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; @@ -1553,13 +1564,13 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); } return 0; } @@ -1576,7 +1587,7 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) */ if (f2fs_is_atomic_file(inode) && F2FS_I(inode)->inmem_task == current) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); return 0; } @@ -1584,8 +1595,15 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & - (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); + unsigned int flags = fi->i_flags; + + if (file_is_encrypt(inode)) + flags |= F2FS_ENCRYPT_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + flags |= F2FS_INLINE_DATA_FL; + + flags &= F2FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); } @@ -1602,15 +1620,15 @@ static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) oldflags = fi->i_flags; - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) + if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL)) if (!capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - flags = flags & (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); - flags |= oldflags & ~(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + flags = flags & F2FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~F2FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - if (fi->i_flags & 
FS_PROJINHERIT_FL) + if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); else clear_inode_flag(inode, FI_PROJ_INHERIT); @@ -1670,6 +1688,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (f2fs_is_atomic_file(inode)) goto out; @@ -1677,28 +1697,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(inode, FI_HOT_DATA); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - if (!get_dirty_pages(inode)) - goto inc_stat; + goto skip_flush; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); + if (ret) goto out; - } +skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -inc_stat: F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1718,27 +1735,33 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_volatile_file(inode)) + if (f2fs_is_volatile_file(inode)) { + ret = -EINVAL; goto err_out; + } if (f2fs_is_atomic_file(inode)) { - ret = commit_inmem_pages(inode); + ret = f2fs_commit_inmem_pages(inode); if (ret) goto err_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0; 
stat_dec_atomic_write(inode); } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + ret = -EINVAL; + } + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1823,7 +1846,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); @@ -1851,9 +1874,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) + return ret; + } switch (in) { case F2FS_GOING_DOWN_FULLSYNC: @@ -1878,7 +1903,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1886,15 +1911,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) goto out; } - stop_gc_thread(sbi); - stop_discard_thread(sbi); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); - drop_discard_cmd(sbi); + f2fs_drop_discard_cmd(sbi); clear_opt(sbi, DISCARD); f2fs_update_time(sbi, REQ_TIME); out: - mnt_drop_write_file(filp); + if (in != F2FS_GOING_DOWN_FULLSYNC) + mnt_drop_write_file(filp); return ret; } @@ -2053,15 +2079,15 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + end = range.start + range.len; + if (range.start 
< MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + return -EINVAL; + } + ret = mnt_want_write_file(filp); if (ret) return ret; - end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { - ret = -EINVAL; - goto out; - } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { @@ -2081,7 +2107,7 @@ out: return ret; } -static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2110,7 +2136,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2119,7 +2145,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (should_update_inplace(inode, NULL)) + if (f2fs_should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2214,7 +2240,7 @@ do_map: while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { struct page *page; - page = get_lock_data_page(inode, idx, true); + page = f2fs_get_lock_data_page(inode, idx, true); if (IS_ERR(page)) { err = PTR_ERR(page); goto clear_out; @@ -2325,12 +2351,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->dio_rwsem[WRITE]); + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { inode_unlock(dst); goto out; 
} @@ -2392,11 +2418,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); out_unlock: if (src != dst) { - up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); inode_unlock(dst); } out: - up_write(&F2FS_I(src)->dio_rwsem[WRITE]); + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2554,7 +2580,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (IS_NOQUOTA(inode)) goto out_unlock; - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out_unlock; @@ -2568,7 +2594,9 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) } f2fs_put_page(ipage, 1); - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + goto out_unlock; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { @@ -2601,17 +2629,17 @@ static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) { __u32 xflags = 0; - if (iflags & FS_SYNC_FL) + if (iflags & F2FS_SYNC_FL) xflags |= FS_XFLAG_SYNC; - if (iflags & FS_IMMUTABLE_FL) + if (iflags & F2FS_IMMUTABLE_FL) xflags |= FS_XFLAG_IMMUTABLE; - if (iflags & FS_APPEND_FL) + if (iflags & F2FS_APPEND_FL) xflags |= FS_XFLAG_APPEND; - if (iflags & FS_NODUMP_FL) + if (iflags & F2FS_NODUMP_FL) xflags |= FS_XFLAG_NODUMP; - if (iflags & FS_NOATIME_FL) + if (iflags & F2FS_NOATIME_FL) xflags |= FS_XFLAG_NOATIME; - if (iflags & FS_PROJINHERIT_FL) + if (iflags & F2FS_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; return xflags; } @@ -2620,31 +2648,23 @@ static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) -/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ -#define F2FS_FL_XFLAG_VISIBLE (FS_SYNC_FL | \ - FS_IMMUTABLE_FL | \ - FS_APPEND_FL | \ - FS_NODUMP_FL | \ - FS_NOATIME_FL | \ - 
FS_PROJINHERIT_FL) - /* Transfer xflags flags to internal */ static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags) { unsigned long iflags = 0; if (xflags & FS_XFLAG_SYNC) - iflags |= FS_SYNC_FL; + iflags |= F2FS_SYNC_FL; if (xflags & FS_XFLAG_IMMUTABLE) - iflags |= FS_IMMUTABLE_FL; + iflags |= F2FS_IMMUTABLE_FL; if (xflags & FS_XFLAG_APPEND) - iflags |= FS_APPEND_FL; + iflags |= F2FS_APPEND_FL; if (xflags & FS_XFLAG_NODUMP) - iflags |= FS_NODUMP_FL; + iflags |= F2FS_NODUMP_FL; if (xflags & FS_XFLAG_NOATIME) - iflags |= FS_NOATIME_FL; + iflags |= F2FS_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) - iflags |= FS_PROJINHERIT_FL; + iflags |= F2FS_PROJINHERIT_FL; return iflags; } @@ -2657,7 +2677,7 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) memset(&fa, 0, sizeof(struct fsxattr)); fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & - (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)); + F2FS_FL_USER_VISIBLE); if (f2fs_sb_has_project_quota(inode->i_sb)) fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, @@ -2717,12 +2737,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) /* Use i_gc_failures for normal file as a risk signal. 
*/ if (inc) - f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + f2fs_i_gc_failures_write(inode, + fi->i_gc_failures[GC_FAILURE_PIN] + 1); - if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: Enable GC = ino %lx after %x GC trials\n", - __func__, inode->i_ino, fi->i_gc_failures); + __func__, inode->i_ino, + fi->i_gc_failures[GC_FAILURE_PIN]); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; } @@ -2753,14 +2775,14 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (should_update_outplace(inode, NULL)) { + if (f2fs_should_update_outplace(inode, NULL)) { ret = -EINVAL; goto out; } if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures = 1; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; goto done; } @@ -2773,7 +2795,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) goto out; set_inode_flag(inode, FI_PIN_FILE); - ret = F2FS_I(inode)->i_gc_failures; + ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -2788,7 +2810,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) __u32 pin = 0; if (is_inode_flag_set(inode, FI_PIN_FILE)) - pin = F2FS_I(inode)->i_gc_failures; + pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; return put_user(pin, (u32 __user *)arg); } @@ -2812,9 +2834,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->dio_rwsem[WRITE]); + down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -2866,7 +2888,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, 
arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_write_checkpoint(filp, arg); + return f2fs_ioc_f2fs_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: @@ -2894,7 +2916,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct blk_plug plug; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) @@ -2924,6 +2945,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, WRITE)) { + clear_inode_flag(inode, + FI_NO_PREALLOC); inode_unlock(inode); return -EAGAIN; } @@ -2939,9 +2962,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; } } - blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9327411fd93b..9093be6e7a7d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,7 +76,7 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (gc_th->gc_urgent) { + if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; mutex_lock(&sbi->gc_mutex); goto do_gc; @@ -114,7 +114,7 @@ next: return 0; } -int start_gc_thread(struct f2fs_sb_info *sbi) +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -131,8 +131,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_idle = 0; - gc_th->gc_urgent = 0; gc_th->gc_wake= 0; sbi->gc_thread = gc_th; @@ -148,7 +146,7 @@ out: return err; } -void stop_gc_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; if (!gc_th) @@ -158,21 +156,19 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; - if (!gc_th) - return gc_mode; - - if (gc_th->gc_idle) { - if (gc_th->gc_idle == 1) - gc_mode = GC_CB; - else if (gc_th->gc_idle == 2) - gc_mode = GC_GREEDY; - } - if (gc_th->gc_urgent) + switch (sbi->gc_mode) { + case GC_IDLE_CB: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT: gc_mode = GC_GREEDY; + break; + } return gc_mode; } @@ -187,7 +183,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); + p->gc_mode = select_gc_type(sbi, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; @@ -195,7 +191,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* we need to check every dirty segments in the FG_GC case */ if (gc_type != FG_GC && - (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + (sbi->gc_mode != GC_URGENT) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; @@ -234,10 +230,6 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; - - if (no_fggc_candidate(sbi, secno)) - continue; - clear_bit(secno, dirty_i->victim_secmap); return GET_SEG_FROM_SEC(sbi, secno); } @@ -377,9 +369,6 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; - if (gc_type == FG_GC && p.alloc_mode == LFS && - no_fggc_candidate(sbi, secno)) - goto next; cost = get_gc_cost(sbi, segno, &p); @@ -440,7 +429,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); new_ie->inode = inode; 
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); @@ -454,7 +443,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list) radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); list_del(&ie->list); - kmem_cache_free(inode_entry_slab, ie); + kmem_cache_free(f2fs_inode_entry_slab, ie); } } @@ -484,12 +473,16 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, block_t start_addr; int off; int phase = 0; + bool fggc = (gc_type == FG_GC); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -503,39 +496,42 @@ next_step: continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } /* phase == 2 */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; - /* block may become invalid during get_node_page */ + /* block may become invalid during f2fs_get_node_page */ if (check_valid_map(sbi, segno, off) == 0) { f2fs_put_page(node_page, 1); continue; } - get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; } - move_node_page(node_page, gc_type); + f2fs_move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } if (++phase < 3) goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); } /* @@ -545,7 +541,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. 
*/ -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -576,11 +572,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return false; - get_node_info(sbi, nid, dni); + f2fs_get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -603,7 +599,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, * This can be used to move blocks, aka LBAs, directly on disk. */ static void move_data_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) + int gc_type, unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -614,6 +610,7 @@ static void move_data_block(struct inode *inode, block_t bidx, .op_flags = 0, .encrypted_page = NULL, .in_list = false, + .retry = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -621,6 +618,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct page *page; block_t newaddr; int err; + bool lfs_mode = test_opt(fio.sbi, LFS); /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -630,8 +628,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { f2fs_pin_file_control(inode, true); @@ -639,7 +640,7 @@ static void move_data_block(struct inode *inode, block_t bidx, } set_new_dnode(&dn, inode, NULL, NULL, 0); 
- err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) goto out; @@ -654,14 +655,17 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - get_node_info(fio.sbi, dn.nid, &ni); + f2fs_get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + if (lfs_mode) + down_write(&fio.sbi->io_order_lock); + + f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), @@ -693,6 +697,7 @@ static void move_data_block(struct inode *inode, block_t bidx, dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); + ClearPageError(page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); @@ -700,8 +705,8 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; - err = f2fs_submit_page_write(&fio); - if (err) { + f2fs_submit_page_write(&fio); + if (fio.retry) { if (PageWriteback(fio.encrypted_page)) end_page_writeback(fio.encrypted_page); goto put_page_out; @@ -716,8 +721,10 @@ static void move_data_block(struct inode *inode, block_t bidx, put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: + if (lfs_mode) + up_write(&fio.sbi->io_order_lock); if (err) - __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true); put_out: f2fs_put_dnode(&dn); @@ -730,15 +737,18 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, { struct page *page; - page = get_lock_data_page(inode, bidx, true); + page = 
f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { if (gc_type == FG_GC) f2fs_pin_file_control(inode, true); @@ -772,15 +782,20 @@ retry: f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } set_cold_data(page); - err = do_write_data_page(&fio); - if (err == -ENOMEM && is_dirty) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; + err = f2fs_do_write_data_page(&fio); + if (err) { + clear_cold_data(page); + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (is_dirty) + set_page_dirty(page); } } out: @@ -824,13 +839,13 @@ next_step: continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } @@ -839,7 +854,7 @@ next_step: continue; if (phase == 2) { - ra_node_page(sbi, dni.ino); + f2fs_ra_node_page(sbi, dni.ino); continue; } @@ -850,23 +865,23 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_file(inode)) { + /* if inode uses special I/O path, let's go phase 3 */ + if (f2fs_post_read_required(inode)) { add_gc_inode(gc_list, inode); continue; } if (!down_write_trylock( - &F2FS_I(inode)->dio_rwsem[WRITE])) { + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); continue; } - start_bidx = start_bidx_of_node(nofs, inode); - data_page = get_read_data_page(inode, + start_bidx = f2fs_start_bidx_of_node(nofs, inode); + data_page = 
f2fs_get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -884,11 +899,11 @@ next_step: bool locked = false; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->dio_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) continue; if (!down_write_trylock( - &fi->dio_rwsem[WRITE])) { - up_write(&fi->dio_rwsem[READ]); + &fi->i_gc_rwsem[WRITE])) { + up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -897,17 +912,18 @@ next_step: inode_dio_wait(inode); } - start_bidx = start_bidx_of_node(nofs, inode) + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_file(inode)) - move_data_block(inode, start_bidx, segno, off); + if (f2fs_post_read_required(inode)) + move_data_block(inode, start_bidx, gc_type, + segno, off); else move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { - up_write(&fi->dio_rwsem[WRITE]); - up_write(&fi->dio_rwsem[READ]); + up_write(&fi->i_gc_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); @@ -946,12 +962,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { - sum_page = get_sum_page(sbi, segno++); + sum_page = f2fs_get_sum_page(sbi, segno++); unlock_page(sum_page); } @@ -1017,6 +1033,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, 
background, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1045,7 +1063,7 @@ gc_more: * secure free segments which doesn't need fggc any more. */ if (prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; } @@ -1068,17 +1086,27 @@ gc_more: sec_freed++; total_freed += seg_freed; + if (gc_type == FG_GC) { + if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + skipped_round++; + last_skipped = sbi->skipped_atomic_files[FG_GC]; + round++; + } + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round > MAX_SKIP_ATOMIC_COUNT && + skipped_round * 2 >= round) + f2fs_drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; } if (gc_type == FG_GC) - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); } stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; @@ -1102,19 +1130,10 @@ stop: return ret; } -void build_gc_manager(struct f2fs_sb_info *sbi) +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count; - DIRTY_I(sbi)->v_ops = &default_v_ops; - /* threshold of # of valid blocks in a section for victims of FG_GC */ - main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; - resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * - BLKS_PER_SEC(sbi), (main_count - resv_count)); sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b0045d4c8d1e..c8619e408009 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -36,8 +36,6 @@ struct f2fs_gc_kthread { unsigned int no_gc_sleep_time; /* for changing gc mode */ - unsigned int gc_idle; - unsigned int gc_urgent; unsigned int gc_wake; }; diff --git a/fs/f2fs/inline.c 
b/fs/f2fs/inline.c index 265da200daa8..043830be5662 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -25,7 +25,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_file(inode)) + if (f2fs_post_read_required(inode)) return false; return true; @@ -42,7 +42,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) return true; } -void read_inline_data(struct page *page, struct page *ipage) +void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; @@ -64,7 +64,8 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from) { void *addr; @@ -85,7 +86,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { unlock_page(page); return PTR_ERR(ipage); @@ -99,7 +100,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) if (page->index) zero_user_segment(page, 0, PAGE_SIZE); else - read_inline_data(page, ipage); + f2fs_do_read_inline_data(page, ipage); if (!PageUptodate(page)) SetPageUptodate(page); @@ -131,7 +132,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - read_inline_data(page, dn->inode_page); + f2fs_do_read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -139,20 +140,21 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; 
set_inode_flag(dn->inode, FI_HOT_DATA); - write_data_page(dn, &fio); + f2fs_outplace_write_data(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { inode_dec_dirty_pages(dn->inode); - remove_dirty_inode(dn->inode); + f2fs_remove_dirty_inode(dn->inode); } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode, dn->inode_page, 0); + f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); @@ -177,7 +179,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -203,12 +205,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; - struct address_space *mapping = page_mapping(page); - unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; @@ -226,10 +226,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); + f2fs_clear_radix_tree_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -239,7 +236,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) return 0; } -bool recover_inline_data(struct inode *inode, struct page *npage) +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri 
= NULL; @@ -260,7 +257,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -278,20 +275,20 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(inode, ipage, 0); + f2fs_truncate_inline_inode(inode, ipage, 0); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } return false; } -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); @@ -302,7 +299,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, void *inline_dentry; f2fs_hash_t namehash; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -313,7 +310,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - de = find_target_dentry(fname, namehash, NULL, &d); + de = f2fs_find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) *res_page = ipage; @@ -323,7 +320,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return de; } -int make_empty_inline_dir(struct inode *inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) 
{ struct f2fs_dentry_ptr d; @@ -332,7 +329,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -367,7 +364,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = page_address(page); @@ -391,7 +387,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -434,7 +430,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); - fake_mode = get_de_type(de) << S_SHIFT; + fake_mode = f2fs_get_de_type(de) << S_SHIFT; err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, ino, fake_mode); @@ -446,8 +442,8 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - truncate_blocks(dir, 0, false); - remove_dirty_inode(dir); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); return err; } @@ -465,7 +461,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -514,14 +510,14 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *page = NULL; int err = 0; - ipage = get_node_page(sbi, dir->i_ino); + ipage = 
f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - bit_pos = room_for_filename(d.bitmap, slots, d.max); + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) @@ -532,7 +528,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -553,7 +549,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_put_page(page, 1); } - update_parent_metadata(dir, inode, 0); + f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -599,7 +595,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -630,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -656,7 +652,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -672,7 +668,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << 
inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e0d9e8f27ed2..f121c864f4c0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -36,15 +36,15 @@ void f2fs_set_inode_flags(struct inode *inode) unsigned int flags = F2FS_I(inode)->i_flags; unsigned int new_fl = 0; - if (flags & FS_SYNC_FL) + if (flags & F2FS_SYNC_FL) new_fl |= S_SYNC; - if (flags & FS_APPEND_FL) + if (flags & F2FS_APPEND_FL) new_fl |= S_APPEND; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; - if (flags & FS_NOATIME_FL) + if (flags & F2FS_NOATIME_FL) new_fl |= S_NOATIME; - if (flags & FS_DIRSYNC_FL) + if (flags & F2FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; if (f2fs_encrypted_inode(inode)) new_fl |= S_ENCRYPTED; @@ -72,7 +72,7 @@ static bool __written_first_block(struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (addr != NEW_ADDR && addr != NULL_ADDR) + if (is_valid_blkaddr(addr)) return true; return false; } @@ -117,7 +117,6 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *ri = &F2FS_NODE(page)->i; - int extra_isize = le32_to_cpu(ri->i_extra_isize); if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; @@ -125,7 +124,8 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; - if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) return false; return true; @@ -185,6 +185,21 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } +static bool sanity_check_inode(struct inode *inode) +{ + struct 
f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + && !f2fs_has_extra_attr(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode ino=%lx, run fsck to fix.", + __func__, inode->i_ino); + return false; + } + return true; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -194,14 +209,10 @@ static int do_read_inode(struct inode *inode) projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) { - f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", - (unsigned long) inode->i_ino); - WARN_ON(1); + if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - } - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -221,8 +232,11 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - - fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; @@ -239,7 +253,6 @@ static int do_read_inode(struct inode *inode) le16_to_cpu(ri->i_extra_isize) : 0; if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { - f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -265,10 +278,10 @@ static int do_read_inode(struct inode *inode) if (__written_first_block(ri)) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - if 
(!need_inode_block_update(sbi, inode->i_ino)) + if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; - if (fi->i_flags & FS_PROJINHERIT_FL) + if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && @@ -284,9 +297,9 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); + F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); + F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); @@ -317,13 +330,17 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; + if (!sanity_check_inode(inode)) { + ret = -EINVAL; + goto bad_inode; + } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -373,7 +390,7 @@ retry: return inode; } -void update_inode(struct inode *inode, struct page *node_page) +void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; @@ -408,7 +425,12 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_atime_nsec = 
cpu_to_le32(inode->i_atime.tv_nsec); ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = + cpu_to_le32(F2FS_I(inode)->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = + cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); @@ -448,18 +470,18 @@ void update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); + F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); + F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } -void update_inode_page(struct inode *inode) +void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; retry: - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); if (err == -ENOMEM) { @@ -470,7 +492,7 @@ retry: } return; } - update_inode(inode, node_page); + f2fs_update_inode(inode, node_page); f2fs_put_page(node_page, 1); } @@ -489,7 +511,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); + f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; @@ -506,7 +528,7 @@ void f2fs_evict_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -516,7 +538,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -525,9 +547,9 @@ void f2fs_evict_inode(struct inode *inode) dquot_initialize(inode); - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); - remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); - remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -544,7 +566,7 @@ retry: #endif if (!err) { f2fs_lock_op(sbi); - err = remove_inode_page(inode); + err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); if (err == -ENOENT) err = 0; @@ -557,7 +579,7 @@ retry: } if (err) - update_inode_page(inode); + f2fs_update_inode_page(inode); dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: @@ -580,16 +602,19 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); } if (is_inode_flag_set(inode, FI_FREE_NID)) { - alloc_nid_failed(sbi, inode->i_ino); + f2fs_alloc_nid_failed(sbi, inode->i_ino); 
clear_inode_flag(inode, FI_FREE_NID); } else { - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. + */ } out_clear: fscrypt_put_encryption_info(inode); @@ -597,7 +622,7 @@ out_clear: } /* caller should call f2fs_lock_op() */ -void handle_failed_inode(struct inode *inode) +void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; @@ -612,7 +637,7 @@ void handle_failed_inode(struct inode *inode) * we must call this to avoid inode being remained as dirty, resulting * in a panic when flushing dirty inodes in gdirty_list. */ - update_inode_page(inode); + f2fs_update_inode_page(inode); f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ @@ -623,18 +648,18 @@ void handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); if (ni.blk_addr != NULL_ADDR) { - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, "Too many orphan inodes, run fsck to fix."); } else { - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); } - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); } else { set_inode_flag(inode, FI_FREE_NID); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d5098efe577c..231b7f3ea7d3 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,7 +37,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(-ENOMEM); f2fs_lock_op(sbi); - if (!alloc_nid(sbi, &ino)) { + if (!f2fs_alloc_nid(sbi, &ino)) { f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; @@ -50,10 +50,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = - F2FS_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = timespec64_to_timespec(inode->i_mtime); inode->i_generation = sbi->s_next_generation++; + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + err = insert_inode_locked(inode); if (err) { err = -EINVAL; @@ -61,7 +64,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } if (f2fs_sb_has_project_quota(sbi->sb) && - (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, @@ -116,9 +119,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_flags |= 
FS_INDEX_FL; + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; - if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); trace_f2fs_new_inode(inode, 0); @@ -193,7 +196,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * up_read(&sbi->sb_lock); } -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; @@ -292,10 +295,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, ino); + f2fs_alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -303,7 +305,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -398,7 +400,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) err = PTR_ERR(page); goto out; } else { - err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) goto out; } @@ -409,7 +411,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) else if (IS_ERR(page)) err = PTR_ERR(page); else - err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -521,7 +523,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); f2fs_put_page(page, 0); @@ 
-586,9 +588,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out_handle_failed_inode; + goto out_f2fs_handle_failed_inode; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) @@ -597,8 +599,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -622,8 +623,8 @@ err_out: f2fs_balance_fs(sbi, true); goto out_free_encrypted_link; -out_handle_failed_inode: - handle_failed_inode(inode); +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); @@ -659,10 +660,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -672,7 +672,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(inode, FI_INC_LINK); - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -711,10 +711,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -722,7 +721,7 @@ static int f2fs_mknod(struct inode *dir, 
struct dentry *dentry, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -751,7 +750,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, } f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto out; @@ -763,8 +762,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(inode); - alloc_nid_done(sbi, inode->i_ino); + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); if (whiteout) { f2fs_i_links_write(inode, false); @@ -780,9 +779,9 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, return 0; release_out: - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -889,7 +888,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -903,9 +902,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) - add_orphan_inode(new_inode); + f2fs_add_orphan_inode(new_inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } else { f2fs_balance_fs(sbi, true); @@ -973,8 +972,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + 
f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); + } f2fs_unlock_op(sbi); @@ -1125,8 +1128,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(new_dir, false); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f202398e20ea..10643b11bd59 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -23,13 +23,28 @@ #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) +#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; -bool available_free_memory(struct f2fs_sb_info *sbi, int type) +/* + * Check whether the given nid is within node id range. 
+ */ +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + return -EINVAL; + } + return 0; +} + +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct sysinfo val; @@ -87,18 +102,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - unsigned int long flags; - if (PageDirty(page)) { - xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); - + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } ClearPageUptodate(page); } @@ -106,7 +113,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { pgoff_t index = current_nat_addr(sbi, nid); - return get_meta_page(sbi, index); + return f2fs_get_meta_page(sbi, index); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -123,8 +130,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) dst_off = next_nat_addr(sbi, src_off); /* get current nat block page with lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); + src_page = f2fs_get_meta_page(sbi, src_off); + dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); @@ -260,7 +267,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -int need_dentry_mark(struct 
f2fs_sb_info *sbi, nid_t nid) +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -277,7 +284,7 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) return need; } -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -291,7 +298,7 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -364,8 +371,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && - nat_get_blkaddr(e) != NULL_ADDR && + f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -376,7 +382,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); @@ -391,7 +397,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, up_write(&nm_i->nat_tree_lock); } -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; @@ -413,7 +419,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) /* * This function always returns success */ -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct 
node_info *ni) +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -443,7 +450,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) /* Check current segment summary */ down_read(&curseg->journal_rwsem); - i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); @@ -458,7 +465,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) index = current_nat_addr(sbi, nid); up_read(&nm_i->nat_tree_lock); - page = get_meta_page(sbi, index); + page = f2fs_get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -471,7 +478,7 @@ cache: /* * readahead MAX_RA_NODE number of node pages. */ -static void ra_node_pages(struct page *parent, int start, int n) +static void f2fs_ra_node_pages(struct page *parent, int start, int n) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); struct blk_plug plug; @@ -485,13 +492,13 @@ static void ra_node_pages(struct page *parent, int start, int n) end = min(end, NIDS_PER_BLOCK); for (i = start; i < end; i++) { nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); } blk_finish_plug(&plug); } -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) { const long direct_index = ADDRS_PER_INODE(dn->inode); const long direct_blks = ADDRS_PER_BLOCK; @@ -606,7 +613,7 @@ got: * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. 
*/ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; @@ -625,7 +632,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_node_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -649,24 +656,24 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ - if (!alloc_nid(sbi, &(nids[i]))) { + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = f2fs_new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { - alloc_nid_failed(sbi, nids[i]); + f2fs_alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); - alloc_nid_done(sbi, nids[i]); + f2fs_alloc_nid_done(sbi, nids[i]); done = true; } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { - npage[i] = get_node_page_ra(parent, offset[i - 1]); + npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); goto release_pages; @@ -681,7 +688,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - npage[i] = get_node_page(sbi, nids[i]); + npage[i] = f2fs_get_node_page(sbi, nids[i]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); f2fs_put_page(npage[0], 0); @@ -720,15 +727,15 @@ static void truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); /* Deallocate node address */ - invalidate_blocks(sbi, ni.blk_addr); + 
f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { - remove_orphan_inode(sbi, dn->nid); + f2fs_remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } @@ -753,7 +760,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) @@ -762,7 +769,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* Make dnode_of_data for parameter */ dn->node_page = page; dn->ofs_in_node = 0; - truncate_data_blocks(dn); + f2fs_truncate_data_blocks(dn); truncate_node(dn); return 1; } @@ -783,13 +790,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); } - ra_node_pages(page, ofs, NIDS_PER_BLOCK); + f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); rn = F2FS_NODE(page); if (depth < 3) { @@ -859,7 +866,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); + pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); idx = i - 1; @@ -868,7 +875,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } - ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + f2fs_ra_node_pages(pages[idx], offset[idx + 
1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { @@ -905,7 +912,7 @@ fail: /* * All the block addresses of data and nodes should be nullified. */ -int truncate_inode_blocks(struct inode *inode, pgoff_t from) +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; @@ -921,7 +928,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) if (level < 0) return level; - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); @@ -1001,7 +1008,7 @@ fail: } /* caller must lock inode page */ -int truncate_xattr_node(struct inode *inode) +int f2fs_truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -1011,7 +1018,7 @@ int truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = get_node_page(sbi, nid); + npage = f2fs_get_node_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1026,17 +1033,17 @@ int truncate_xattr_node(struct inode *inode) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -int remove_inode_page(struct inode *inode) +int f2fs_remove_inode_page(struct inode *inode) { struct dnode_of_data dn; int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; - err = truncate_xattr_node(inode); + err = f2fs_truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1045,7 +1052,7 @@ int remove_inode_page(struct inode *inode) /* remove potential inline_data blocks */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), @@ -1056,7 +1063,7 @@ int remove_inode_page(struct inode *inode) return 0; } -struct page *new_inode_page(struct inode *inode) +struct page *f2fs_new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -1064,10 +1071,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0); + return f2fs_new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1085,7 +1092,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - get_node_info(sbi, dn->nid, &new_ni); + f2fs_get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1137,7 +1144,7 @@ static int read_node_page(struct page *page, int op_flags) if (PageUptodate(page)) return LOCKED_PAGE; - get_node_info(sbi, page->index, &ni); + f2fs_get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) 
{ ClearPageUptodate(page); @@ -1151,14 +1158,15 @@ static int read_node_page(struct page *page, int op_flags) /* * Readahead a node page */ -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *apage; int err; if (!nid) return; - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (f2fs_check_nid_range(sbi, nid)) + return; rcu_read_lock(); apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); @@ -1182,7 +1190,8 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!nid) return ERR_PTR(-ENOENT); - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (f2fs_check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); repeat: page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) @@ -1198,7 +1207,7 @@ repeat: } if (parent) - ra_node_pages(parent, start + 1, MAX_RA_NODE); + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); @@ -1232,12 +1241,12 @@ out_err: return page; } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { return __get_node_page(sbi, nid, NULL, 0); } -struct page *get_node_page_ra(struct page *parent, int start) +struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); @@ -1272,7 +1281,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1359,11 +1368,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if 
(unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1379,7 +1385,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1394,8 +1400,9 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= REQ_PREFLUSH | REQ_FUA; set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); + f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); @@ -1424,7 +1431,7 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -void move_node_page(struct page *node_page, int gc_type) +void f2fs_move_node_page(struct page *node_page, int gc_type) { if (gc_type == FG_GC) { struct writeback_control wbc = { @@ -1461,7 +1468,7 @@ static int f2fs_write_node_page(struct page *page, return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index; @@ -1528,9 +1535,9 @@ continue_unlock: if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode(inode, page); + f2fs_update_inode(inode, page); set_dentry_mark(page, - need_dentry_mark(sbi, ino)); + f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ if (!PageDirty(page)) @@ -1580,7 +1587,8 @@ out: return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { pgoff_t index; @@ -1588,21 +1596,28 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, int step = 0; int nwritten = 0; int ret = 0; - int nr_pages; + int nr_pages, done = 0; pagevec_init(&pvec); next_step: index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1681,7 +1696,7 @@ continue_unlock: return ret; } -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0; struct pagevec pvec; @@ -1730,14 +1745,21 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) + goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); diff = nr_pages_to_write(sbi, NODE, wbc); - wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc, true, FS_NODE_IO); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); return 0; skip_write: @@ -1753,7 +1775,7 @@ static 
int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1883,20 +1905,20 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * Thread A Thread B * - f2fs_create * - f2fs_new_inode - * - alloc_nid + * - f2fs_alloc_nid * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg - * - build_free_nids - * - __build_free_nids + * - f2fs_build_free_nids + * - __f2fs_build_free_nids * - scan_nat_page * - add_free_nid * - __lookup_nat_cache * - f2fs_add_link - * - init_inode_metadata - * - new_inode_page - * - new_node_page + * - f2fs_init_inode_metadata + * - f2fs_new_inode_page + * - f2fs_new_node_page * - set_node_addr - * - alloc_nid_done + * - f2fs_alloc_nid_done * - __remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ @@ -2028,7 +2050,8 @@ out: up_read(&nm_i->nat_tree_lock); } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); int i = 0; @@ -2041,7 +2064,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; - if (!sync && !available_free_memory(sbi, FREE_NIDS)) + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) return; if (!mount) { @@ -2053,7 +2076,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) } /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); down_read(&nm_i->nat_tree_lock); @@ -2083,14 +2106,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) 
up_read(&nm_i->nat_tree_lock); - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync, mount); + __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -2099,7 +2122,7 @@ void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) * from second parameter of this function. * The returned nid could be used ino as well as nid when inode is created. */ -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; @@ -2117,8 +2140,8 @@ retry: return false; } - /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); @@ -2135,14 +2158,14 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true, false); + f2fs_build_free_nids(sbi, true, false); goto retry; } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. */ -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2157,9 +2180,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) } /* - * alloc_nid() should be called prior to this function. 
+ * f2fs_alloc_nid() should be called prior to this function. */ -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2172,7 +2195,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - if (!available_free_memory(sbi, FREE_NIDS)) { + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { @@ -2189,7 +2212,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next; @@ -2217,14 +2240,14 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -void recover_inline_xattr(struct inode *inode, struct page *page) +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; size_t inline_size; struct page *ipage; struct f2fs_inode *ri; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); @@ -2242,11 +2265,11 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: - update_inode(inode, ipage); + f2fs_update_inode(inode, ipage); f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2259,25 +2282,25 @@ int recover_xattr_data(struct inode *inode, struct page 
*page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - get_node_info(sbi, prev_xnid, &ni); - invalidate_blocks(sbi, ni.blk_addr); + f2fs_get_node_info(sbi, prev_xnid, &ni); + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ - if (!alloc_nid(sbi, &new_xnid)) + if (!f2fs_alloc_nid(sbi, &new_xnid)) return -ENOSPC; set_new_dnode(&dn, inode, NULL, NULL, new_xnid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { - alloc_nid_failed(sbi, new_xnid); + f2fs_alloc_nid_failed(sbi, new_xnid); return PTR_ERR(xpage); } - alloc_nid_done(sbi, new_xnid); - update_inode_page(inode); + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); @@ -2288,14 +2311,14 @@ recover_xnid: return 0; } -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - get_node_info(sbi, ino, &old_ni); + f2fs_get_node_info(sbi, ino, &old_ni); if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2349,7 +2372,7 @@ retry: return 0; } -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2366,10 +2389,10 @@ void restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR, true); + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_tmp_page(sbi, 
idx); + struct page *page = f2fs_get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; @@ -2511,7 +2534,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); raw_ne = &nat_in_journal(journal, offset); @@ -2548,7 +2571,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2611,7 +2634,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = get_meta_page(sbi, nat_bits_addr++); + struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); @@ -2730,8 +2753,10 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); int i; - nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned char *), GFP_KERNEL); + nm_i->free_nid_bitmap = + f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; @@ -2747,14 +2772,16 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned short), GFP_KERNEL); + nm_i->free_nid_count = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + 
GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; return 0; } -int build_node_manager(struct f2fs_sb_info *sbi) +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2774,11 +2801,11 @@ int build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - build_free_nids(sbi, true, true); + f2fs_build_free_nids(sbi, true, true); return 0; } -void destroy_node_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; @@ -2850,7 +2877,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int __init create_node_manager_caches(void) +int __init f2fs_create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); @@ -2876,7 +2903,7 @@ fail: return -ENOMEM; } -void destroy_node_manager_caches(void) +void f2fs_destroy_node_manager_caches(void) { kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 1b23d3febe4c..38f25f0b193a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -47,7 +47,7 @@ static struct kmem_cache *fsync_entry_slab; -bool space_for_roll_forward(struct f2fs_sb_info *sbi) +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -162,7 +162,7 @@ retry: goto out_put; } - err = acquire_orphan_inode(F2FS_I_SB(inode)); + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_put; @@ -173,7 +173,7 @@ retry: } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, + err = f2fs_add_dentry(dir, &fname, inode, inode->i_ino, inode->i_mode); } if (err == -ENOMEM) @@ -204,8 +204,6 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) set_inode_flag(inode, 
FI_DATA_EXIST); else clear_inode_flag(inode, FI_DATA_EXIST); - if (!(ri->i_inline & F2FS_INLINE_DOTS)) - clear_inode_flag(inode, FI_INLINE_DOTS); } static void recover_inode(struct inode *inode, struct page *page) @@ -254,10 +252,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if (!is_recoverable_dnode(page)) break; @@ -271,7 +269,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { - err = recover_inode_page(sbi, page); + err = f2fs_recover_inode_page(sbi, page); if (err) break; quota_inode = true; @@ -312,7 +310,7 @@ next: blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); } f2fs_put_page(page, 1); return err; @@ -355,7 +353,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } } - sum_page = get_sum_page(sbi, segno); + sum_page = f2fs_get_sum_page(sbi, segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); sum = sum_node->entries[blkoff]; f2fs_put_page(sum_page, 1); @@ -375,7 +373,7 @@ got_it: } /* Get the node page */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -400,7 +398,8 @@ got_it: inode = dn->inode; } - bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -410,11 +409,11 @@ got_it: unlock_page(dn->inode_page); set_new_dnode(&tdn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + if 
(f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) goto out; if (tdn.data_blkaddr == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); f2fs_put_dnode(&tdn); out: @@ -427,7 +426,7 @@ out: truncate_out: if (datablock_addr(tdn.inode, tdn.node_page, tdn.ofs_in_node) == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); return 0; @@ -443,25 +442,25 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 1: recover xattr */ if (IS_INODE(page)) { - recover_inline_xattr(inode, page); + f2fs_recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page); + err = f2fs_recover_xattr_data(inode, page); if (!err) recovered++; goto out; } /* step 2: recover inline data */ - if (recover_inline_data(inode, page)) + if (f2fs_recover_inline_data(inode, page)) goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), inode); + start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: - err = get_dnode_of_data(&dn, start, ALLOC_NODE); + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -472,7 +471,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -488,7 +487,7 @@ retry_dn: /* dest is invalid, just invalidate src block */ if (dest == NULL_ADDR) { - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); continue; } @@ -502,19 +501,19 @@ retry_dn: * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { - truncate_data_blocks_range(&dn, 1); - reserve_new_block(&dn); + f2fs_truncate_data_blocks_range(&dn, 1); + f2fs_reserve_new_block(&dn); continue; } /* dest is valid block, try to recover from src to dest */ - if (is_valid_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #ifdef CONFIG_F2FS_FAULT_INJECTION while (err) - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); @@ -569,12 +568,12 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) break; - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); @@ -612,11 +611,11 @@ next: f2fs_put_page(page, 1); } if (!err) - allocate_new_segments(sbi); + f2fs_allocate_new_segments(sbi); return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct list_head inode_list; struct list_head dir_list; @@ -691,7 +690,7 @@ skip: struct cp_control cpc = { .reason = CP_RECOVERY, }; - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } kmem_cache_destroy(fsync_entry_slab); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5854cc4e1d67..9efce174c51a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -169,7 +169,7 @@ found: return result - size + __reverse_ffz(tmp); } -bool need_SSR(struct f2fs_sb_info *sbi) +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = 
get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); @@ -177,14 +177,14 @@ bool need_SSR(struct f2fs_sb_info *sbi) if (test_opt(sbi, LFS)) return false; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + if (sbi->gc_mode == GC_URGENT) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void register_inmem_page(struct inode *inode, struct page *page) +void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -230,6 +230,8 @@ static int __revoke_inmem_pages(struct inode *inode, lock_page(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (recover) { struct dnode_of_data dn; struct node_info ni; @@ -237,7 +239,8 @@ static int __revoke_inmem_pages(struct inode *inode, trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, + LOOKUP_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -247,9 +250,9 @@ retry: err = -EAGAIN; goto next; } - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); if (cur->old_addr == NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); } else f2fs_replace_block(sbi, &dn, dn.data_blkaddr, @@ -271,7 +274,7 @@ next: return err; } -void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) { struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; @@ -287,15 +290,23 @@ next: spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); if (inode) { - drop_inmem_pages(inode); + if (gc_failure) { + if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto drop; + goto skip; + } +drop: + set_inode_flag(inode, 
FI_ATOMIC_REVOKE_REQUEST); + f2fs_drop_inmem_pages(inode); iput(inode); } +skip: congestion_wait(BLK_RW_ASYNC, HZ/50); cond_resched(); goto next; } -void drop_inmem_pages(struct inode *inode) +void f2fs_drop_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -309,11 +320,11 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); } -void drop_inmem_page(struct inode *inode, struct page *page) +void f2fs_drop_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -328,7 +339,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) break; } - f2fs_bug_on(sbi, !cur || cur->page != page); + f2fs_bug_on(sbi, list_empty(head) || cur->page != page); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); @@ -343,8 +354,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __commit_inmem_pages(struct inode *inode, - struct list_head *revoke_list) +static int __f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -357,9 +367,12 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_PRIO, .io_type = FS_DATA_IO, }; + struct list_head revoke_list; pgoff_t last_idx = ULONG_MAX; int err = 0; + INIT_LIST_HEAD(&revoke_list); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { struct page *page = cur->page; @@ -371,14 +384,14 @@ static int __commit_inmem_pages(struct inode *inode, f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + 
f2fs_remove_dirty_inode(inode); } retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -393,50 +406,46 @@ retry: last_idx = page->index; } unlock_page(page); - list_move_tail(&cur->list, revoke_list); + list_move_tail(&cur->list, &revoke_list); } if (last_idx != ULONG_MAX) f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); - if (!err) - __revoke_inmem_pages(inode, revoke_list, false, false); + if (err) { + /* + * try to revoke all committed pages, but still we could fail + * due to no memory or other reason, if that happened, EAGAIN + * will be returned, which means in such case, transaction is + * already not integrity, caller should use journal to do the + * recovery or rewrite & commit last transaction. For other + * error number, revoking was done by filesystem itself. + */ + err = __revoke_inmem_pages(inode, &revoke_list, false, true); + + /* drop all uncommitted pages */ + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + } else { + __revoke_inmem_pages(inode, &revoke_list, false, false); + } return err; } -int commit_inmem_pages(struct inode *inode) +int f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct list_head revoke_list; int err; - INIT_LIST_HEAD(&revoke_list); f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); - err = __commit_inmem_pages(inode, &revoke_list); - if (err) { - int ret; - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last 
transaction. For other - * error number, revoking was done by filesystem itself. - */ - ret = __revoke_inmem_pages(inode, &revoke_list, false, true); - if (ret) - err = ret; + err = __f2fs_commit_inmem_pages(inode); - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - } spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) list_del_init(&fi->inmem_ilist); @@ -478,25 +487,28 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + /* try to shrink extent cache when there is no enough memory */ - if (!available_free_memory(sbi, EXTENT_CACHE)) + if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ - if (!available_free_memory(sbi, NAT_ENTRIES)) - try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); - if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, MAX_FREE_NIDS); + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false, false); + f2fs_build_free_nids(sbi, false, false); if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ - if (!available_free_memory(sbi, NAT_ENTRIES) || - !available_free_memory(sbi, INO_ENTRIES) || + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || + !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || f2fs_time_over(sbi, CP_TIME)) { @@ -504,7 +516,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) struct blk_plug plug; blk_start_plug(&plug); - sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); blk_finish_plug(&plug); } 
f2fs_sync_fs(sbi->sb, true); @@ -537,7 +549,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { - if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) @@ -648,7 +660,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return cmd.ret; } -int create_flush_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; @@ -685,7 +697,7 @@ init_thread: return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; @@ -915,6 +927,42 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, #endif } +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = true; + dpolicy->sync = false; + if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = false; + } else if (discard_type == 
DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = UINT_MAX; + dpolicy->io_aware = false; + } +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -929,6 +977,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) return; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return; + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); dc->error = __blkdev_issue_discard(dc->bdev, @@ -972,7 +1023,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); do_insert: dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); if (!dc) @@ -1037,7 +1088,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, @@ -1130,68 +1181,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - unsigned int start, unsigned int end) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct discard_cmd *prev_dc = NULL, *next_dc = NULL; - struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct discard_cmd *dc; - struct blk_plug plug; - int issued; - -next: - issued = 0; - - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); - - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, - NULL, start, - (struct rb_entry **)&prev_dc, - (struct rb_entry 
**)&next_dc, - &insert_p, &insert_parent, true); - if (!dc) - dc = next_dc; - - blk_start_plug(&plug); - - while (dc && dc->lstart <= end) { - struct rb_node *node; - - if (dc->len < dpolicy->granularity) - goto skip; - - if (dc->state != D_PREP) { - list_move_tail(&dc->list, &dcc->fstrim_list); - goto skip; - } - - __submit_discard_cmd(sbi, dpolicy, dc); - - if (++issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); - - schedule(); - - goto next; - } -skip: - node = rb_next(&dc->rb_node); - dc = rb_entry_safe(node, struct discard_cmd, rb_node); - - if (fatal_signal_pending(current)) - break; - } - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); -} - static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1210,7 +1199,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + f2fs_bug_on(sbi, + !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1263,7 +1253,7 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } -void drop_discard_cmd(struct f2fs_sb_info *sbi) +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } @@ -1332,7 +1322,18 @@ next: static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + struct discard_policy dp; + + if (dpolicy) { + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + return; + } + + /* wait all */ + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } /* This 
should be covered by global mutex, &sit_i->sentry_lock */ @@ -1343,7 +1344,8 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, + NULL, blkaddr); if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1358,7 +1360,7 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __wait_one_discard_bio(sbi, dc); } -void stop_discard_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1377,11 +1379,13 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; - init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, &dpolicy); + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); return dropped; } @@ -1397,32 +1401,39 @@ static int issue_discard_thread(void *data) set_freezable(); do { - init_discard_policy(&dpolicy, DPOLICY_BG, + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, msecs_to_jiffies(wait_ms)); + + if (dcc->discard_wake) + dcc->discard_wake = 0; + if (try_to_freeze()) continue; if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + wait_ms = dpolicy.max_interval; + continue; + } - if (dcc->discard_wake) - dcc->discard_wake = 0; - - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, DPOLICY_FORCE, 
1); + if (sbi->gc_mode == GC_URGENT) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); - if (issued) { + if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; + } else if (issued == -1){ + wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } @@ -1591,20 +1602,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; } -void release_discard_addrs(struct f2fs_sb_info *sbi) +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ - list_for_each_entry_safe(entry, this, head, list) { - list_del(&entry->list); - kmem_cache_free(discard_entry_slab, entry); - } + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); } /* - * Should call clear_prefree_segments after checkpoint is done. + * Should call f2fs_clear_prefree_segments after checkpoint is done. 
*/ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { @@ -1617,7 +1632,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; @@ -1700,40 +1716,13 @@ skip: if (cur_pos < sbi->blocks_per_seg) goto find_next; - list_del(&entry->list); + release_discard_addr(entry); dcc->nr_discards -= total_len; - kmem_cache_free(discard_entry_slab, entry); } wake_up_discard_thread(sbi, false); } -void init_discard_policy(struct discard_policy *dpolicy, - int discard_type, unsigned int granularity) -{ - /* common policy */ - dpolicy->type = discard_type; - dpolicy->sync = true; - dpolicy->granularity = granularity; - - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - - if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; - } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->io_aware = false; - } -} - static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1786,7 +1775,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - stop_discard_thread(sbi); + f2fs_stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -1833,8 +1822,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) (new_vblocks > 
sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi); - SIT_I(sbi)->max_mtime = se->mtime; + se->mtime = get_mtime(sbi, false); + if (se->mtime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -1902,7 +1892,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); @@ -1922,14 +1912,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) up_write(&sit_i->sentry_lock); } -bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; struct seg_entry *se; bool is_cp = false; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -1961,7 +1951,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, /* * Calculate the number of current summary pages for writing */ -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; @@ -1991,14 +1981,15 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) /* * Caller should put this summary page */ -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); } -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) +void 
f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) { - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); @@ -2008,18 +1999,19 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { - update_meta_page(sbi, (void *)sum_blk, blk_addr); + f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; dst = (struct f2fs_summary_block *)page_address(page); + memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); @@ -2259,7 +2251,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - sum_page = get_sum_page(sbi, new_segno); + sum_page = f2fs_get_sum_page(sbi, new_segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -2273,7 +2265,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) int i, cnt; bool reversed = false; - /* need_SSR() already forces to do this */ + /* f2fs_need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { curseg->next_segno = segno; return 1; @@ -2325,7 +2317,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); - else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) + 
else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2333,7 +2325,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -void allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { struct curseg_info *curseg; unsigned int old_segno; @@ -2355,7 +2347,8 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; @@ -2373,11 +2366,72 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) return has_candidate; } +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + 
mutex_unlock(&dcc->cmd_lock); + __wait_all_discard_cmd(sbi, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno, cur_segno; + unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; @@ -2388,12 +2442,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) return -EINVAL; if (end <= MAIN_BLKADDR(sbi)) - goto out; + return -EINVAL; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, "Found FS corruption, run fsck to fix."); - goto out; + return -EIO; } /* start/end segment number in main_area */ @@ -2403,40 +2457,36 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (cur_segno = start_segno; cur_segno <= end_segno; - cur_segno = cpc.trim_end + 1) { - cpc.trim_start = cur_segno; - - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(cur_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; + if (sbi->discard_blks == 0) + goto out; - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = f2fs_write_checkpoint(sbi, &cpc); + 
mutex_unlock(&sbi->gc_mutex); + if (err) + goto out; start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + end_block = START_BLOCK(sbi, end_segno + 1); - init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (!test_opt(sbi, DISCARD)) { + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + range->len = F2FS_BLK_TO_BYTES(trimmed); + } out: - range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -2448,7 +2498,7 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -int rw_hint_to_seg_type(enum rw_hint hint) +int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { case WRITE_LIFE_SHORT: @@ -2521,7 +2571,7 @@ int rw_hint_to_seg_type(enum rw_hint hint) * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { @@ -2588,9 +2638,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || - is_inode_flag_set(inode, FI_HOT_DATA)) + is_inode_flag_set(inode, FI_HOT_DATA) || + is_inode_flag_set(inode, FI_ATOMIC_FILE) || + is_inode_flag_set(inode, FI_VOLATILE_FILE)) return CURSEG_HOT_DATA; - return rw_hint_to_seg_type(inode->i_write_hint); + return 
f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : @@ -2626,7 +2678,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) return type; } -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list) @@ -2686,6 +2738,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, INIT_LIST_HEAD(&fio->list); fio->in_list = true; + fio->retry = false; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); @@ -2708,7 +2761,7 @@ static void update_device_state(struct f2fs_io_info *fio) devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); /* update device state for fsync */ - set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { @@ -2721,23 +2774,28 @@ static void update_device_state(struct f2fs_io_info *fio) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - int err; + bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + if (keep_order) + down_read(&fio->sbi->io_order_lock); reallocate: - allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_write(fio); - if (err == -EAGAIN) { + f2fs_submit_page_write(fio); + if (fio->retry) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; - } else if (!err) { - update_device_state(fio); } + + update_device_state(fio); + + if (keep_order) + 
up_read(&fio->sbi->io_order_lock); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type) { struct f2fs_io_info fio = { @@ -2757,12 +2815,13 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, fio.op_flags &= ~REQ_META; set_page_writeback(page); + ClearPageError(page); f2fs_submit_page_write(&fio); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } -void write_node_page(unsigned int nid, struct f2fs_io_info *fio) +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; @@ -2772,14 +2831,15 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -2787,7 +2847,7 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -int rewrite_data_page(struct f2fs_io_info *fio) +int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; @@ -2822,7 +2882,7 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, return i; } -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) { @@ -2907,7 +2967,7 @@ void f2fs_replace_block(struct 
f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr); f2fs_update_data_blkaddr(dn, new_addr); @@ -2932,7 +2992,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); @@ -2953,7 +3013,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -2993,7 +3053,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); page = NULL; - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = (unsigned char *)page_address(page); offset = 0; } @@ -3032,7 +3092,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_addr = GET_SUM_BLOCK(sbi, segno); } - new = get_meta_page(sbi, blk_addr); + new = f2fs_get_meta_page(sbi, blk_addr); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3044,7 +3104,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - restore_node_summary(sbi, segno, sum); + f2fs_restore_node_summary(sbi, segno, sum); } } @@ -3076,10 +3136,10 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { - int npages = npages_for_summary_flush(sbi, true); + int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) - ra_meta_pages(sbi, start_sum_block(sbi), npages, + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for 
compacted data summary */ @@ -3088,7 +3148,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { @@ -3114,8 +3174,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) int written_size = 0; int i, j; - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -3138,8 +3199,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) for (j = 0; j < blkoff; j++) { if (!page) { - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); @@ -3174,7 +3236,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, write_current_sum_page(sbi, i, blkaddr + (i - type)); } -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); @@ -3182,12 +3244,12 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int 
type, unsigned int val, int alloc) { int i; @@ -3212,7 +3274,7 @@ int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -3225,7 +3287,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - page = grab_meta_page(sbi, dst_off); + page = f2fs_grab_meta_page(sbi, dst_off); seg_info_to_sit_page(sbi, page, start); set_page_dirty(page); @@ -3321,7 +3383,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi) * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; @@ -3380,6 +3442,11 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) int offset, sit_offset; se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { @@ -3388,17 +3455,21 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); } else { sit_offset = 
SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + &raw_sit->entries[sit_offset]); } __clear_bit(segno, bitmap); @@ -3446,8 +3517,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * - sizeof(struct seg_entry), GFP_KERNEL); + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; @@ -3487,8 +3560,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * - sizeof(struct sec_entry), GFP_KERNEL); + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -3564,7 +3639,8 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), + GFP_KERNEL); if (!array) return -ENOMEM; @@ -3597,9 +3673,10 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; + block_t total_node_blocks = 0; do { - readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, META_SIT, true); start = start_blk * sit_i->sents_per_block; @@ -3619,6 +3696,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; /* build discard map only one time */ if (f2fs_discard_en(sbi)) { @@ -3647,15 +3726,28 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int old_valid_blocks; start = 
le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong journal entry on segno %u", + start); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + break; + } + se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; + if (IS_NODESEG(se->type)) + total_node_blocks -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; if (f2fs_discard_en(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -3664,16 +3756,28 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } } - if (sbi->segs_per_sec > 1) + if (sbi->segs_per_sec > 1) { get_sec_entry(sbi, start)->valid_blocks += - se->valid_blocks - old_valid_blocks; + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } } up_read(&curseg->journal_rwsem); + + if (!err && total_node_blocks != valid_node_count(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "SIT is corrupted node# %u vs %u", + total_node_blocks, valid_node_count(sbi)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + } + return err; } @@ -3772,7 +3876,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) down_write(&sit_i->sentry_lock); - sit_i->min_mtime = LLONG_MAX; + sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { unsigned int i; @@ -3786,11 +3890,11 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } - sit_i->max_mtime = get_mtime(sbi); + sit_i->max_mtime = get_mtime(sbi, false); up_write(&sit_i->sentry_lock); } -int build_segment_manager(struct 
f2fs_sb_info *sbi) +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -3822,14 +3926,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); init_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { - err = create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) return err; } @@ -3954,13 +4056,13 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kfree(sit_i); } -void destroy_segment_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; - destroy_flush_cmd_control(sbi, true); + f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); @@ -3970,7 +4072,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) kfree(sm_info); } -int __init create_segment_manager_caches(void) +int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", sizeof(struct discard_entry)); @@ -4003,7 +4105,7 @@ fail: return -ENOMEM; } -void destroy_segment_manager_caches(void) +void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3325d0769723..f18fc82fbe99 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ + ((!is_valid_blkaddr(blk_addr)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ @@ -215,6 +215,8 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) +#define MAX_SKIP_ATOMIC_COUNT 16 + struct inmem_pages { struct list_head list; struct page *page; @@ -375,6 +377,7 @@ static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, int i; raw_sit = (struct f2fs_sit_block *)page_address(page); + memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); @@ -742,12 +745,23 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) #endif } -static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - time64_t now = ktime_get_real_seconds(); + time64_t diff, now = ktime_get_real_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; - return sit_i->elapsed_time + now - sit_i->mounted_time; + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, @@ -771,15 +785,6 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } -static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, - unsigned int secno) -{ - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > - sbi->fggc_threshold) - return true; - return false; -} - static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) diff --git a/fs/f2fs/shrinker.c 
b/fs/f2fs/shrinker.c index 0b5664a1a6cc..36cfd816c160 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -109,11 +109,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, /* shrink clean nat cache entries */ if (freed < nr) - freed += try_to_free_nats(sbi, nr - freed); + freed += f2fs_try_to_free_nats(sbi, nr - freed); /* shrink free nids cache entries */ if (freed < nr) - freed += try_to_free_nids(sbi, nr - freed); + freed += f2fs_try_to_free_nids(sbi, nr - freed); spin_lock(&f2fs_list_lock); p = p->next; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 42d564c5ccd0..3995e926ba3a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -740,6 +740,10 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (strlen(name) == 9 && + !strncmp(name, "nobarrier", 9)) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; } else { kfree(name); return -EINVAL; @@ -826,15 +830,14 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); - fi->i_current_depth = 1; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - init_rwsem(&fi->dio_rwsem[READ]); - init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_gc_rwsem[READ]); + init_rwsem(&fi->i_gc_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); @@ -862,7 +865,7 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -999,7 +1002,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { 
.reason = CP_UMOUNT, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ @@ -1009,17 +1012,17 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } - /* write_checkpoint can update stat informaion */ + /* f2fs_write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -1031,8 +1034,8 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->meta_inode); /* destroy f2fs internal modules */ - destroy_node_manager(sbi); - destroy_segment_manager(sbi); + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); kfree(sbi->ckpt); @@ -1074,7 +1077,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } f2fs_trace_ios(NULL, 1); @@ -1477,11 +1480,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); need_restart_gc = true; } } else if (!sbi->gc_thread) { - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto restore_opts; need_stop_gc = true; @@ -1504,9 +1507,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); - destroy_flush_cmd_control(sbi, false); + f2fs_destroy_flush_cmd_control(sbi, false); } else { - err = 
create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) goto restore_gc; } @@ -1524,11 +1527,11 @@ skip: return 0; restore_gc: if (need_restart_gc) { - if (start_gc_thread(sbi)) + if (f2fs_start_gc_thread(sbi)) f2fs_msg(sbi->sb, KERN_WARNING, "background gc thread has stopped"); } else if (need_stop_gc) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); } restore_opts: #ifdef CONFIG_QUOTA @@ -1800,7 +1803,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, inode = d_inode(path->dentry); inode_lock(inode); - F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); @@ -1824,7 +1827,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) goto out_put; inode_lock(inode); - F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1930,19 +1933,13 @@ static bool f2fs_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); } -static unsigned f2fs_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? 
- inode->i_sb->s_blocksize : F2FS_NAME_LEN; -} - static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, - .max_namelen = f2fs_max_namelen, + .max_namelen = F2FS_NAME_LEN, }; #endif @@ -1952,7 +1949,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (check_nid_range(sbi, ino)) + if (f2fs_check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -2135,6 +2132,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { + block_t segment_count, segs_per_sec, secs_per_zone; + block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; @@ -2191,6 +2190,72 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + segment_count = le32_to_cpu(raw_super->segment_count); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + segment_count); + return 1; + } + + if (total_sections > segment_count || + total_sections < F2FS_MIN_SEGMENTS || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return 1; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_msg(sb, KERN_INFO, + "Small segment_count 
(%u < %u * %u)", + segment_count, segs_per_sec, total_sections); + return 1; + } + + if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + f2fs_msg(sb, KERN_INFO, + "Wrong segment_count / block_count (%u > %u)", + segment_count, le32_to_cpu(raw_super->block_count)); + return 1; + } + + if (secs_per_zone > total_sections) { + f2fs_msg(sb, KERN_INFO, + "Wrong secs_per_zone (%u > %u)", + secs_per_zone, total_sections); + return 1; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_msg(sb, KERN_INFO, + "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return 1; + } + + if (le32_to_cpu(raw_super->cp_payload) > + (blocks_per_seg - F2FS_CP_PACKS)) { + f2fs_msg(sb, KERN_INFO, + "Insane cp_payload (%u > %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS); + return 1; + } + /* check reserved ino info */ if (le32_to_cpu(raw_super->node_ino) != 1 || le32_to_cpu(raw_super->meta_ino) != 2 || @@ -2203,13 +2268,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -2217,7 +2275,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 0; } -int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -2298,13 +2356,15 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) 
atomic_set(&sbi->nr_pages[i], 0); - atomic_set(&sbi->wb_sync_req, 0); + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); for (i = 0; i < NR_PAGE_TYPE - 1; i++) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); + init_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; @@ -2359,8 +2419,10 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) #define F2FS_REPORT_NR_ZONES 4096 - zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * - F2FS_REPORT_NR_ZONES, GFP_KERNEL); + zones = f2fs_kzalloc(sbi, + array_size(F2FS_REPORT_NR_ZONES, + sizeof(struct blk_zone)), + GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2500,8 +2562,10 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * - max_devices, GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); if (!sbi->devs) return -ENOMEM; @@ -2723,9 +2787,11 @@ try_onemore: int n = (i == META) ? 
1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = f2fs_kmalloc(sbi, - n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = + f2fs_kmalloc(sbi, + array_size(n, + sizeof(struct f2fs_bio_info)), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; @@ -2765,7 +2831,7 @@ try_onemore: goto free_io_dummy; } - err = get_valid_checkpoint(sbi); + err = f2fs_get_valid_checkpoint(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; @@ -2795,18 +2861,18 @@ try_onemore: spin_lock_init(&sbi->inode_lock[i]); } - init_extent_cache_info(sbi); + f2fs_init_extent_cache_info(sbi); - init_ino_entry_info(sbi); + f2fs_init_ino_entry_info(sbi); /* setup f2fs internal modules */ - err = build_segment_manager(sbi); + err = f2fs_build_segment_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS segment manager"); goto free_sm; } - err = build_node_manager(sbi); + err = f2fs_build_node_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS node manager"); @@ -2824,7 +2890,7 @@ try_onemore: sbi->kbytes_written = le64_to_cpu(seg_i->journal->info.kbytes_written); - build_gc_manager(sbi); + f2fs_build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); @@ -2876,7 +2942,7 @@ try_onemore: } #endif /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -2898,7 +2964,7 @@ try_onemore: if (!retry) goto skip_recovery; - err = recover_fsync_data(sbi, false); + err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, @@ -2906,7 +2972,7 @@ try_onemore: goto free_meta; } } else { - err = recover_fsync_data(sbi, true); + err = f2fs_recover_fsync_data(sbi, true); if (!f2fs_readonly(sb) && err > 0) { err = -EINVAL; @@ -2916,7 +2982,7 @@ try_onemore: } } skip_recovery: - /* recover_fsync_data() cleared 
this already */ + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); /* @@ -2925,7 +2991,7 @@ skip_recovery: */ if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto free_meta; } @@ -2956,10 +3022,10 @@ free_meta: #endif f2fs_sync_inode_meta(sbi); /* - * Some dirty meta pages can be produced by recover_orphan_inodes() + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); #ifdef CONFIG_QUOTA @@ -2972,13 +3038,13 @@ free_root_inode: free_stats: f2fs_destroy_stats(sbi); free_node_inode: - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); free_nm: - destroy_node_manager(sbi); + f2fs_destroy_node_manager(sbi); free_sm: - destroy_segment_manager(sbi); + f2fs_destroy_segment_manager(sbi); free_devices: destroy_device_list(sbi); kfree(sbi->ckpt); @@ -3024,8 +3090,8 @@ static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - stop_gc_thread(F2FS_SB(sb)); - stop_discard_thread(F2FS_SB(sb)); + f2fs_stop_gc_thread(F2FS_SB(sb)); + f2fs_stop_discard_thread(F2FS_SB(sb)); } kill_block_super(sb); } @@ -3063,21 +3129,27 @@ static int __init init_f2fs_fs(void) { int err; + if (PAGE_SIZE != F2FS_BLKSIZE) { + printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + PAGE_SIZE, F2FS_BLKSIZE); + return -EINVAL; + } + f2fs_build_trace_ios(); err = init_inodecache(); if (err) goto fail; - err = 
create_node_manager_caches(); + err = f2fs_create_node_manager_caches(); if (err) goto free_inodecache; - err = create_segment_manager_caches(); + err = f2fs_create_segment_manager_caches(); if (err) goto free_node_manager_caches; - err = create_checkpoint_caches(); + err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = create_extent_cache(); + err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; err = f2fs_init_sysfs(); @@ -3092,8 +3164,13 @@ static int __init init_f2fs_fs(void) err = f2fs_create_root_stats(); if (err) goto free_filesystem; + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; return 0; +free_root_stats: + f2fs_destroy_root_stats(); free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: @@ -3101,13 +3178,13 @@ free_shrinker: free_sysfs: f2fs_exit_sysfs(); free_extent_cache: - destroy_extent_cache(); + f2fs_destroy_extent_cache(); free_checkpoint_caches: - destroy_checkpoint_caches(); + f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: - destroy_segment_manager_caches(); + f2fs_destroy_segment_manager_caches(); free_node_manager_caches: - destroy_node_manager_caches(); + f2fs_destroy_node_manager_caches(); free_inodecache: destroy_inodecache(); fail: @@ -3116,14 +3193,15 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); - destroy_extent_cache(); - destroy_checkpoint_caches(); - destroy_segment_manager_caches(); - destroy_node_manager_caches(); + f2fs_destroy_extent_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); destroy_inodecache(); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..2e7e611deaef 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -147,13 +147,13 
@@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int len = 0, i; len += snprintf(buf + len, PAGE_SIZE - len, - "cold file extenstion:\n"); + "cold file extension:\n"); for (i = 0; i < cold_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); len += snprintf(buf + len, PAGE_SIZE - len, - "hot file extenstion:\n"); + "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); @@ -165,7 +165,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, +static ssize_t __sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { @@ -201,13 +201,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, down_write(&sbi->sb_lock); - ret = update_extension_list(sbi, name, hot, set); + ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) - update_extension_list(sbi, name, hot, !set); + f2fs_update_extension_list(sbi, name, hot, !set); out: up_write(&sbi->sb_lock); return ret ? 
ret : count; @@ -245,19 +245,56 @@ out: return count; } + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t >= 1) { + sbi->gc_mode = GC_URGENT; + if (sbi->gc_thread) { + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) + sbi->gc_mode = GC_IDLE_CB; + else if (t == GC_IDLE_GREEDY) + sbi->gc_mode = GC_IDLE_GREEDY; + else + sbi->gc_mode = GC_NORMAL; + return count; + } + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); - if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; - wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); - wake_up_discard_thread(sbi, true); - } - return count; } +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) + down_read(&sbi->sb->s_umount); + ret = __sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + static ssize_t f2fs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -346,8 +383,8 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, 
f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); @@ -572,23 +609,6 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); -F2FS_PROC_FILE_DEF(iostat_info); - int __init f2fs_init_sysfs(void) { int ret; @@ -632,12 +652,12 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_iostat_info_fops, sb); + proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + segment_bits_seq_show, sb); + proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + iostat_info_seq_show, sb); } return 0; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ae2dfa709f5d..708271871f94 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -252,7 +252,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -273,7 +273,7 @@ static 
int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); + xpage = f2fs_get_node_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -397,7 +397,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) - if (!alloc_nid(sbi, &new_nid)) + if (!f2fs_alloc_nid(sbi, &new_nid)) return -ENOSPC; /* write to inline xattr */ @@ -405,9 +405,9 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); } inline_addr = inline_xattr_addr(inode, in_page); @@ -417,8 +417,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode); - alloc_nid_failed(sbi, new_nid); + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); if (err) { f2fs_put_page(in_page, 1); return err; @@ -431,10 +431,10 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } f2fs_bug_on(sbi, new_nid); @@ -442,13 +442,13 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = 
f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } - alloc_nid_done(sbi, new_nid); + f2fs_alloc_nid_done(sbi, new_nid); } xattr_addr = page_address(xpage); @@ -693,7 +693,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (err) return err; - /* this case is only from init_inode_metadata */ + /* this case is only from f2fs_init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index ffbbf0520d9e..bfd589ea74c0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -158,8 +158,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; + if (!phys) { + fat_fs_error(sb, + "invalid FAT chain (i_pos %lld, last_block %llu)", + MSDOS_I(inode)->i_pos, + (unsigned long long)last_block); + return -EIO; + } - BUG_ON(!phys); BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); @@ -502,6 +508,7 @@ static int fat_validate_dir(struct inode *dir) /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { + struct timespec ts; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -552,11 +559,14 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + fat_time_fat2unix(sbi, &ts, de->time, de->date, 0); + inode->i_mtime = timespec_to_timespec64(ts); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, + fat_time_fat2unix(sbi, &ts, de->ctime, de->cdate, de->ctime_cs); - fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + 
inode->i_ctime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &ts, 0, de->adate, 0); + inode->i_atime = timespec_to_timespec64(ts); } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -697,13 +707,21 @@ static void fat_set_state(struct super_block *sb, brelse(bh); } +static void fat_reset_iocharset(struct fat_mount_options *opts) +{ + if (opts->iocharset != fat_default_iocharset) { + /* Note: opts->iocharset can be NULL here */ + kfree(opts->iocharset); + opts->iocharset = fat_default_iocharset; + } +} + static void delayed_free(struct rcu_head *p) { struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); unload_nls(sbi->nls_disk); unload_nls(sbi->nls_io); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); kfree(sbi); } @@ -825,6 +843,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) static int __fat_write_inode(struct inode *inode, int wait) { + struct timespec ts; struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; @@ -862,13 +881,16 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + ts = timespec64_to_timespec(inode->i_mtime); + fat_time_unix2fat(sbi, &ts, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, + ts = timespec64_to_timespec(inode->i_ctime); + fat_time_unix2fat(sbi, &ts, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); - fat_time_unix2fat(sbi, &inode->i_atime, &atime, + ts = timespec64_to_timespec(inode->i_atime); + fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL); } spin_unlock(&sbi->inode_hash_lock); @@ -1118,7 +1140,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, 
opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; - opts->iocharset = fat_default_iocharset; + fat_reset_iocharset(opts); if (is_vfat) { opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; @@ -1275,8 +1297,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, /* vfat specific */ case Opt_charset: - if (opts->iocharset != fat_default_iocharset) - kfree(opts->iocharset); + fat_reset_iocharset(opts); iocharset = match_strdup(&args[0]); if (!iocharset) return -ENOMEM; @@ -1867,8 +1888,7 @@ out_fail: iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); - if (sbi->options.iocharset != fat_default_iocharset) - kfree(sbi->options.iocharset); + fat_reset_iocharset(&sbi->options); sb->s_fs_info = NULL; kfree(sbi); return error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 582ca731a6c9..16a832c37d66 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = *ts; + dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -266,7 +266,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct super_block *sb = dir->i_sb; struct inode *inode = NULL; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; @@ -285,7 +286,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, } ts = current_time(dir); - err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); + t = timespec64_to_timespec(ts); + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo); if (err) goto out; inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); @@ -314,10 +316,6 @@ static int 
msdos_rmdir(struct inode *dir, struct dentry *dentry) int err; mutex_lock(&MSDOS_SB(sb)->s_lock); - /* - * Check whether the directory is not in use, then check - * whether it is empty. - */ err = fat_dir_empty(inode); if (err) goto out; @@ -348,7 +346,8 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct fat_slot_info sinfo; struct inode *inode; unsigned char msdos_name[MSDOS_NAME]; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err, is_hid, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); @@ -366,12 +365,13 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } ts = current_time(dir); - cluster = fat_alloc_new_dir(dir, &ts); + t = timespec64_to_timespec(ts); + cluster = fat_alloc_new_dir(dir, &t); if (cluster < 0) { err = cluster; goto out; } - err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo); if (err) goto out_free; inc_nlink(dir); @@ -436,7 +436,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, struct msdos_dir_entry *dotdot_de; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; - struct timespec ts; + struct timespec64 ts; loff_t new_i_pos; int err, old_attrs, is_dir, update_dotdot, corrupt = 0; @@ -503,8 +503,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { + struct timespec t = timespec64_to_timespec(ts); err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, - &ts, &sinfo); + &t, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 2649759c478a..9a5469120caa 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -664,7 +664,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, if (len == 0) return -ENOENT; - slots = 
kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); + slots = kmalloc_array(MSDOS_SLOTS, sizeof(*slots), GFP_NOFS); if (slots == NULL) return -ENOMEM; @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -697,15 +697,6 @@ static int vfat_find(struct inode *dir, const struct qstr *qname, return fat_search_long(dir, qname->name, len, sinfo); } -/* - * (nfsd's) anonymous disconnected dentry? - * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). - */ -static int vfat_d_anon_disconn(struct dentry *dentry) -{ - return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); -} - static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -738,8 +729,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, * Checking "alias->d_parent == dentry->d_parent" to make sure * FS is not corrupted (especially double linked dir). */ - if (alias && alias->d_parent == dentry->d_parent && - !vfat_d_anon_disconn(alias)) { + if (alias && alias->d_parent == dentry->d_parent) { /* * This inode has non anonymous-DCACHE_DISCONNECTED * dentry. This means, the user did ->lookup() by an @@ -747,7 +737,6 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, * * Switch to new one for reason of locality if possible. 
*/ - BUG_ON(d_unhashed(alias)); if (!S_ISDIR(inode->i_mode)) d_move(alias, dentry); iput(inode); @@ -772,13 +761,15 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct super_block *sb = dir->i_sb; struct inode *inode; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); + t = timespec64_to_timespec(ts); + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo); if (err) goto out; inode_inc_iversion(dir); @@ -861,18 +852,20 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct super_block *sb = dir->i_sb; struct inode *inode; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - cluster = fat_alloc_new_dir(dir, &ts); + t = timespec64_to_timespec(ts); + cluster = fat_alloc_new_dir(dir, &t); if (cluster < 0) { err = cluster; goto out; } - err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo); if (err) goto out_free; inode_inc_iversion(dir); @@ -910,7 +903,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct msdos_dir_entry *dotdot_de; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; loff_t new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; @@ -945,8 +939,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { + t = timespec64_to_timespec(ts); err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, - &ts, &sinfo); + &t, &sinfo); if (err) goto out; new_i_pos 
= sinfo.i_pos; diff --git a/fs/fcntl.c b/fs/fcntl.c index d737ff082472..12273b6ea56d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,7 +23,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> -#include <linux/shmem_fs.h> +#include <linux/memfd.h> #include <linux/compat.h> #include <linux/poll.h> @@ -871,9 +871,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) if (fa->fa_file != filp) continue; - spin_lock_irq(&fa->fa_lock); + write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; - spin_unlock_irq(&fa->fa_lock); + write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; call_rcu(&fa->fa_rcu, fasync_free_rcu); @@ -918,13 +918,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy if (fa->fa_file != filp) continue; - spin_lock_irq(&fa->fa_lock); + write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; - spin_unlock_irq(&fa->fa_lock); + write_unlock_irq(&fa->fa_lock); goto out; } - spin_lock_init(&new->fa_lock); + rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; @@ -987,14 +987,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; - unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - spin_lock_irqsave(&fa->fa_lock, flags); + read_lock(&fa->fa_lock); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1003,7 +1002,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - spin_unlock_irqrestore(&fa->fa_lock, flags); + read_unlock(&fa->fa_lock); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/filesystems.c b/fs/filesystems.c index f2728a4a03a1..b03f57b1105b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -238,21 +238,9 @@ 
static int filesystems_proc_show(struct seq_file *m, void *v) return 0; } -static int filesystems_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, filesystems_proc_show, NULL); -} - -static const struct file_operations filesystems_proc_fops = { - .open = filesystems_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_filesystems_init(void) { - proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index ce4785fd81c6..a51425634f65 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -193,13 +193,9 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) return ERR_PTR(-ENAMETOOLONG); ino = vxfs_inode_by_name(dip, dp); - if (ino) { + if (ino) ip = vxfs_iget(dip->i_sb, ino); - if (IS_ERR(ip)) - return ERR_CAST(ip); - } - d_add(dp, ip); - return NULL; + return d_splice_alias(ip, dp); } /** diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index c184c5a356ff..cdcb376ef8df 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -220,6 +220,7 @@ int fscache_add_cache(struct fscache_cache *cache, { struct fscache_cache_tag *tag; + ASSERTCMP(ifsdef->cookie, ==, &fscache_fsdef_index); BUG_ON(!cache->ops); BUG_ON(!ifsdef); @@ -248,7 +249,6 @@ int fscache_add_cache(struct fscache_cache *cache, if (!cache->kobj) goto error; - ifsdef->cookie = &fscache_fsdef_index; ifsdef->cache = cache; cache->fsdef = ifsdef; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 97137d7ec5ee..83bfe04456b6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -516,6 +516,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, goto error; } + ASSERTCMP(object->cookie, ==, cookie); fscache_stat(&fscache_n_object_alloc); object->debug_id = 
atomic_inc_return(&fscache_object_debug_id); @@ -571,6 +572,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie, _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); + ASSERTCMP(object->cookie, ==, cookie); + spin_lock(&cookie->lock); /* there may be multiple initial creations of this object, but we only @@ -610,9 +613,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, spin_unlock(&cache->object_list_lock); } - /* attach to the cookie */ - object->cookie = cookie; - fscache_cookie_get(cookie, fscache_cookie_get_attach_object); + /* Attach to the cookie. The object already has a ref on it. */ hlist_add_head(&object->cookie_link, &cookie->backing_objects); fscache_objlist_add(object); diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index 15a3d042247e..9a13e9e15b69 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -83,24 +83,9 @@ static void fscache_histogram_stop(struct seq_file *m, void *v) { } -static const struct seq_operations fscache_histogram_ops = { +const struct seq_operations fscache_histogram_ops = { .start = fscache_histogram_start, .stop = fscache_histogram_stop, .next = fscache_histogram_next, .show = fscache_histogram_show, }; - -/* - * open "/proc/fs/fscache/histogram" to provide latency data - */ -static int fscache_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &fscache_histogram_ops); -} - -const struct file_operations fscache_histogram_fops = { - .open = fscache_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 500650f938fe..f83328a7f048 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -31,6 +31,7 @@ #include <linux/fscache-cache.h> #include <trace/events/fscache.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 @@ -84,7 +85,7 @@ static inline void 
fscache_hist(atomic_t histogram[], unsigned long start_jif) atomic_inc(&histogram[jif]); } -extern const struct file_operations fscache_histogram_fops; +extern const struct seq_operations fscache_histogram_ops; #else #define fscache_hist(hist, start_jif) do {} while (0) @@ -294,7 +295,7 @@ static inline void fscache_stat_d(atomic_t *stat) #define __fscache_stat(stat) (stat) -extern const struct file_operations fscache_stats_fops; +int fscache_stats_show(struct seq_file *m, void *v); #else #define __fscache_stat(stat) (NULL) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 20e0d0a4dc8c..9edc920f651f 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -327,6 +327,7 @@ void fscache_object_init(struct fscache_object *object, object->store_limit_l = 0; object->cache = cache; object->cookie = cookie; + fscache_cookie_get(cookie, fscache_cookie_get_attach_object); object->parent = NULL; #ifdef CONFIG_FSCACHE_OBJECT_LIST RB_CLEAR_NODE(&object->objlist_link); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e30c5975ea58..8d265790374c 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -70,7 +70,8 @@ void fscache_enqueue_operation(struct fscache_operation *op) ASSERT(op->processor != NULL); ASSERT(fscache_object_is_available(op->object)); ASSERTCMP(atomic_read(&op->usage), >, 0); - ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); + ASSERTIFCMP(op->state != FSCACHE_OP_ST_IN_PROGRESS, + op->state, ==, FSCACHE_OP_ST_CANCELLED); fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { @@ -499,7 +500,8 @@ void fscache_put_operation(struct fscache_operation *op) struct fscache_cache *cache; _enter("{OBJ%x OP%x,%d}", - op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + op->object ? 
op->object->debug_id : 0, + op->debug_id, atomic_read(&op->usage)); ASSERTCMP(atomic_read(&op->usage), >, 0); diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 1d9e4951a597..49a8c90414bc 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -26,14 +26,14 @@ int __init fscache_proc_init(void) goto error_dir; #ifdef CONFIG_FSCACHE_STATS - if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, - &fscache_stats_fops)) + if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, + fscache_stats_show)) goto error_stats; #endif #ifdef CONFIG_FSCACHE_HISTOGRAM - if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, - &fscache_histogram_fops)) + if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_ops)) goto error_histogram; #endif diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index fcc8c2f2690e..00564a1dfd76 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -138,7 +138,7 @@ atomic_t fscache_n_cache_culled_objects; /* * display the general statistics */ -static int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m, void *v) { seq_puts(m, "FS-Cache statistics\n"); @@ -284,18 +284,3 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cache_culled_objects)); return 0; } - -/* - * open "/proc/fs/fscache/stats" allowing provision of a statistical summary - */ -static int fscache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, fscache_stats_show, NULL); -} - -const struct file_operations fscache_stats_fops = { - .open = fscache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index ec85765502f1..5a48cee6d7d3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -34,7 +34,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return ERR_PTR(-ENOMEM); size = fuse_getxattr(inode, name, value, PAGE_SIZE); if 
(size > 0) - acl = posix_acl_from_xattr(&init_user_ns, value, size); + acl = posix_acl_from_xattr(fc->user_ns, value, size); else if ((size == 0) || (size == -ENODATA) || (size == -EOPNOTSUPP && fc->no_getxattr)) acl = NULL; @@ -81,7 +81,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); if (ret < 0) { kfree(value); return ret; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b9ea99c5b5b3..0b694655d988 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc); + fuse_abort_conn(fc, true); fuse_conn_put(fc); } return count; @@ -211,10 +211,11 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (!dentry) return NULL; - fc->ctl_dentry[fc->ctl_ndents++] = dentry; inode = new_inode(fuse_control_sb); - if (!inode) + if (!inode) { + dput(dentry); return NULL; + } inode->i_ino = get_next_ino(); inode->i_mode = mode; @@ -228,6 +229,9 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + return dentry; } @@ -284,7 +288,10 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; d_inode(dentry)->i_private = NULL; - d_drop(dentry); + if (!i) { + /* Get rid of submounts: */ + d_invalidate(dentry); + } dput(dentry); } drop_nlink(d_inode(fuse_control_sb->s_root)); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e9e97803442a..8f68181256c0 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -48,6 +48,7 @@ #include <linux/stat.h> #include <linux/module.h> #include <linux/uio.h> +#include 
<linux/user_namespace.h> #include "fuse_i.h" @@ -406,7 +407,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); goto out; } @@ -498,7 +499,11 @@ static int cuse_channel_open(struct inode *inode, struct file *file) if (!cc) return -ENOMEM; - fuse_conn_init(&cc->fc); + /* + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. + */ + fuse_conn_init(&cc->fc, file->f_cred->user_ns); fud = fuse_dev_alloc(&cc->fc); if (!fud) { @@ -581,7 +586,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc); + fuse_abort_conn(&cc->fc, false); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d06384c2cae..c6b88fa85e2e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -64,9 +64,12 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) pages = req->inline_pages; page_descs = req->inline_page_descs; } else { - pages = kmalloc(sizeof(struct page *) * npages, flags); - page_descs = kmalloc(sizeof(struct fuse_page_desc) * - npages, flags); + pages = kmalloc_array(npages, sizeof(struct page *), + flags); + page_descs = + kmalloc_array(npages, + sizeof(struct fuse_page_desc), + flags); } if (!pages || !page_descs) { @@ -112,13 +115,6 @@ static void __fuse_put_request(struct fuse_req *req) refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) -{ - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); -} - void fuse_set_initialized(struct fuse_conn *fc) { /* Make sure stores before this are seen on another CPU */ @@ -163,11 +159,19 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto 
out; } - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); + if (unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(fc, req); + return ERR_PTR(-EOVERFLOW); + } return req; out: @@ -256,7 +260,10 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; @@ -381,8 +388,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); - if (fc->num_background == fc->congestion_threshold && - fc->connected && fc->sb) { + if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } @@ -1234,9 +1240,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (err) goto err_unlock; - err = -ENODEV; - if (!fiq->connected) + if (!fiq->connected) { + err = (fc->aborted && fc->abort_err) ? 
-ECONNABORTED : -ENODEV; goto err_unlock; + } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, @@ -1260,12 +1267,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; - if (task_active_pid_ns(current) != fc->pid_ns) { - rcu_read_lock(); - in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); - rcu_read_unlock(); - } - /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; @@ -1287,7 +1288,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = -ENODEV; + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -1361,7 +1362,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!fud) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc_array(pipe->buffers, sizeof(struct pipe_buffer), + GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1942,7 +1944,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (!fud) return -EPERM; - bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kmalloc_array(pipe->buffers, sizeof(struct pipe_buffer), + GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -2076,7 +2079,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. 
*/ -void fuse_abort_conn(struct fuse_conn *fc) +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) { struct fuse_iqueue *fiq = &fc->iq; @@ -2089,6 +2092,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2151,7 +2155,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); } fuse_dev_free(fud); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 24967382a7b1..56231b31f806 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(&init_user_ns, attr->uid); - stat->gid = make_kgid(&init_user_ns, attr->gid); + stat->uid = make_kuid(fc->user_ns, attr->uid); + stat->gid = make_kgid(fc->user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -924,12 +924,20 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat) + struct kstat *stat, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; + bool sync; - if (time_before64(fi->i_time, get_jiffies_64())) { + if (flags & AT_STATX_FORCE_SYNC) + sync = true; + else if (flags & AT_STATX_DONT_SYNC) + sync = false; + else + sync = time_before64(fi->i_time, get_jiffies_64()); + + if (sync) { forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { @@ -943,7 +951,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, 
int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL); + return fuse_update_get_attr(inode, file, NULL, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -1030,7 +1038,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) const struct cred *cred; if (fc->allow_other) - return 1; + return current_in_userns(fc->user_ns); cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && @@ -1475,17 +1483,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_cmtime) +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, + struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); + arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); + arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1629,8 +1637,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, return err; if (attr->ia_valid & ATTR_OPEN) { - if (fc->atomic_o_trunc) + /* This is coming from open(..., ... | O_TRUNC); */ + WARN_ON(!(attr->ia_valid & ATTR_SIZE)); + WARN_ON(attr->ia_size != 0); + if (fc->atomic_o_trunc) { + /* + * No need to send request to userspace, since actual + * truncation has already been done by OPEN. But still + * need to truncate page cache. 
+ */ + i_size_write(inode, 0); + truncate_pagecache(inode, 0); return 0; + } file = NULL; } @@ -1646,7 +1665,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_cmtime); + iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1783,7 +1802,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat); + return fuse_update_get_attr(inode, NULL, stat, flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c4c093bbf456..5256ad333b05 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -26,6 +26,7 @@ #include <linux/xattr.h> #include <linux/pid_namespace.h> #include <linux/refcount.h> +#include <linux/user_namespace.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -466,6 +467,9 @@ struct fuse_conn { /** The pid namespace for this mount */ struct pid_namespace *pid_ns; + /** The user namespace for this mount */ + struct user_namespace *user_ns; + /** Maximum read size */ unsigned max_read; @@ -515,6 +519,9 @@ struct fuse_conn { abort and device release */ unsigned connected; + /** Connection aborted via sysfs */ + bool aborted; + /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ @@ -526,6 +533,9 @@ struct fuse_conn { /** Do readpages asynchronously? Only set in INIT */ unsigned async_read:1; + /** Return an unique read error after abort. 
Only set in INIT */ + unsigned abort_err:1; + /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc:1; @@ -851,7 +861,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc); +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); /** * Invalidate inode attributes @@ -870,7 +880,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); /** * Release reference to fuse_conn @@ -975,6 +985,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; extern const struct xattr_handler *fuse_acl_xattr_handlers[]; +extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ef309958e060..a24df8861b40 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = make_kuid(&init_user_ns, attr->uid); - inode->i_gid = make_kgid(&init_user_ns, attr->gid); + inode->i_uid = make_kuid(fc->user_ns, attr->uid); + inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -217,7 +217,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = inode->i_mtime; + old_mtime = timespec64_to_timespec(inode->i_mtime); 
fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; @@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + fuse_abort_conn(get_fuse_conn_super(sb), false); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb) fuse_send_destroy(fc); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) return err; } -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + struct user_namespace *user_ns) { char *p; memset(d, 0, sizeof(struct fuse_mount_data)); @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), uv); + d->user_id = make_kuid(user_ns, uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_GROUP_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), uv); + d->group_id = make_kgid(user_ns, uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); + seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", 
from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) seq_puts(m, ",default_permissions"); if (fc->allow_other) @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc) fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->user_ns = get_user_ns(user_ns); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc) if (fc->destroy_req) fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); fc->release(fc); } } @@ -918,6 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -948,7 +953,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1061,7 +1067,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) goto err; if (is_bdev) { @@ -1089,16 +1095,27 @@ static int fuse_fill_super(struct super_block *sb, void 
*data, int silent) if (!file) goto err; - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != &init_user_ns)) + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if (file->f_op != &fuse_dev_operations || + file->f_cred->user_ns != sb->s_user_ns) goto err_fput; + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) goto err_fput; - fuse_conn_init(fc); + fuse_conn_init(fc, sb->s_user_ns); fc->release = fuse_free_conn; fud = fuse_dev_alloc(fc); @@ -1179,6 +1196,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_dev_free(fud); err_put_conn: fuse_conn_put(fc); + sb->s_fs_info = NULL; err_fput: fput(file); err: @@ -1208,7 +1226,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 3caac46b08b0..433717640f78 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -192,6 +192,26 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags); } +static bool no_xattr_list(struct dentry *dentry) +{ + return false; +} + +static int no_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int no_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *nodee, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + static const struct xattr_handler 
fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -209,3 +229,26 @@ const struct xattr_handler *fuse_acl_xattr_handlers[] = { &fuse_xattr_handler, NULL }; + +static const struct xattr_handler fuse_no_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +static const struct xattr_handler fuse_no_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { + &fuse_no_acl_access_xattr_handler, + &fuse_no_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f58716567972..35f5ee23566d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -54,8 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (start >= to) break; - if (gfs2_is_jdata(ip)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } } @@ -747,18 +746,21 @@ out: put_page(page); gfs2_trans_end(sdp); - if (pos + len > ip->i_inode.i_size) - gfs2_trim_blocks(&ip->i_inode); - goto out_trans_fail; + if (alloc_required) { + gfs2_inplace_release(ip); + if (pos + len > ip->i_inode.i_size) + gfs2_trim_blocks(&ip->i_inode); + } + goto out_qunlock; out_endtrans: gfs2_trans_end(sdp); out_trans_fail: - if (alloc_required) { + if (alloc_required) gfs2_inplace_release(ip); out_qunlock: + if (alloc_required) gfs2_quota_unlock(ip); - } out_unlock: if (&ip->i_inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); @@ -814,7 +816,6 @@ out: * @inode: The inode * @dibh: The buffer_head containing the on-disk inode * @pos: The file position - * @len: The length of the write * @copied: How much was actually copied by the VFS * @page: The page * @@ -824,17 +825,15 @@ out: * Returns: errno */ 
static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, - loff_t pos, unsigned len, unsigned copied, + loff_t pos, unsigned copied, struct page *page) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - BUG_ON(pos + len > gfs2_max_stuffed_size(ip)); + BUG_ON(pos + copied > gfs2_max_stuffed_size(ip)); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); @@ -850,20 +849,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, i_size_write(inode, to); mark_inode_dirty(inode); } - - if (inode == sdp->sd_rindex) { - adjust_fs_space(inode); - sdp->sd_rindex_uptodate = 0; - } - - brelse(dibh); - gfs2_trans_end(sdp); - if (inode == sdp->sd_rindex) { - gfs2_glock_dq(&m_ip->i_gh); - gfs2_holder_uninit(&m_ip->i_gh); - } - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); return copied; } @@ -877,9 +862,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, * @page: The page that has been written * @fsdata: The fsdata (unused in GFS2) * - * The main write_end function for GFS2. We have a separate one for - * stuffed files as they are slightly different, otherwise we just - * put our locking around the VFS provided functions. + * The main write_end function for GFS2. We just put our locking around the VFS + * provided functions. 
* * Returns: errno */ @@ -900,32 +884,39 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); ret = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(ret)) { - unlock_page(page); - put_page(page); - goto failed; - } + if (unlikely(ret)) + goto out; - if (gfs2_is_stuffed(ip)) - return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + if (gfs2_is_stuffed(ip)) { + ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page); + page = NULL; + goto out2; + } - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len); + else + gfs2_ordered_add_inode(ip); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + page = NULL; if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); else gfs2_trans_add_meta(ip->i_gl, dibh); - +out2: if (inode == sdp->sd_rindex) { adjust_fs_space(inode); sdp->sd_rindex_uptodate = 0; } brelse(dibh); -failed: +out: + if (page) { + unlock_page(page); + put_page(page); + } gfs2_trans_end(sdp); gfs2_inplace_release(ip); if (ip->i_qadata && ip->i_qadata->qa_qd_num) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 278ed0869c3c..ed6699705c13 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -89,10 +89,12 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); - if (!gfs2_is_jdata(ip)) - mark_buffer_dirty(bh); - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); + else { + mark_buffer_dirty(bh); + gfs2_ordered_add_inode(ip); + } if (release) { unlock_page(page); @@ -176,8 +178,8 @@ out: /** * find_metapath - Find path through the metadata tree * @sdp: The superblock - * @mp: The metapath to return the result in * @block: The disk block to look up + * @mp: The metapath to return the result in * @height: The pre-calculated height of the metadata tree * * This 
routine returns a struct metapath structure that defines a path @@ -188,8 +190,7 @@ out: * filesystem with a blocksize of 4096. * * find_metapath() would return a struct metapath structure set to: - * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, - * and mp_list[2] = 165. + * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165. * * That means that in order to get to the block containing the byte at * offset 101342453, we would load the indirect block pointed to by pointer @@ -279,6 +280,21 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp return p + mp->mp_list[height]; } +static inline const __be64 *metaend(unsigned int height, const struct metapath *mp) +{ + const struct buffer_head *bh = mp->mp_bh[height]; + return (const __be64 *)(bh->b_data + bh->b_size); +} + +static void clone_metapath(struct metapath *clone, struct metapath *mp) +{ + unsigned int hgt; + + *clone = *mp; + for (hgt = 0; hgt < mp->mp_aheight; hgt++) + get_bh(clone->mp_bh[hgt]); +} + static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) { const __be64 *t; @@ -420,20 +436,140 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b return (ptr - first); } -static inline void bmap_lock(struct gfs2_inode *ip, int create) +typedef const __be64 *(*gfs2_metadata_walker)( + struct metapath *mp, + const __be64 *start, const __be64 *end, + u64 factor, void *data); + +#define WALK_STOP ((__be64 *)0) +#define WALK_NEXT ((__be64 *)1) + +static int gfs2_walk_metadata(struct inode *inode, sector_t lblock, + u64 len, struct metapath *mp, gfs2_metadata_walker walker, + void *data) { - if (create) - down_write(&ip->i_rw_mutex); - else - down_read(&ip->i_rw_mutex); + struct metapath clone; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const __be64 *start, *end, *ptr; + u64 factor = 1; + unsigned int hgt; + int ret = 0; + + for (hgt = ip->i_height 
- 1; hgt >= mp->mp_aheight; hgt--) + factor *= sdp->sd_inptrs; + + for (;;) { + u64 step; + + /* Walk indirect block. */ + start = metapointer(hgt, mp); + end = metaend(hgt, mp); + + step = (end - start) * factor; + if (step > len) + end = start + DIV_ROUND_UP_ULL(len, factor); + + ptr = walker(mp, start, end, factor, data); + if (ptr == WALK_STOP) + break; + if (step >= len) + break; + len -= step; + if (ptr != WALK_NEXT) { + BUG_ON(!*ptr); + mp->mp_list[hgt] += ptr - start; + goto fill_up_metapath; + } + +lower_metapath: + /* Decrease height of metapath. */ + if (mp != &clone) { + clone_metapath(&clone, mp); + mp = &clone; + } + brelse(mp->mp_bh[hgt]); + mp->mp_bh[hgt] = NULL; + if (!hgt) + break; + hgt--; + factor *= sdp->sd_inptrs; + + /* Advance in metadata tree. */ + (mp->mp_list[hgt])++; + start = metapointer(hgt, mp); + end = metaend(hgt, mp); + if (start >= end) { + mp->mp_list[hgt] = 0; + if (!hgt) + break; + goto lower_metapath; + } + +fill_up_metapath: + /* Increase height of metapath. 
*/ + if (mp != &clone) { + clone_metapath(&clone, mp); + mp = &clone; + } + ret = fillup_metapath(ip, mp, ip->i_height - 1); + if (ret < 0) + break; + hgt += ret; + for (; ret; ret--) + do_div(factor, sdp->sd_inptrs); + mp->mp_aheight = hgt + 1; + } + if (mp == &clone) + release_metapath(mp); + return ret; } -static inline void bmap_unlock(struct gfs2_inode *ip, int create) +struct gfs2_hole_walker_args { + u64 blocks; +}; + +static const __be64 *gfs2_hole_walker(struct metapath *mp, + const __be64 *start, const __be64 *end, + u64 factor, void *data) { - if (create) - up_write(&ip->i_rw_mutex); - else - up_read(&ip->i_rw_mutex); + struct gfs2_hole_walker_args *args = data; + const __be64 *ptr; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr) { + args->blocks += (ptr - start) * factor; + if (mp->mp_aheight == mp->mp_fheight) + return WALK_STOP; + return ptr; /* increase height */ + } + } + args->blocks += (end - start) * factor; + return WALK_NEXT; +} + +/** + * gfs2_hole_size - figure out the size of a hole + * @inode: The inode + * @lblock: The logical starting block number + * @len: How far to look (in blocks) + * @mp: The metapath at lblock + * @iomap: The iomap to store the hole size in + * + * This function modifies @mp. 
+ * + * Returns: errno on error + */ +static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, + struct metapath *mp, struct iomap *iomap) +{ + struct gfs2_hole_walker_args args = { }; + int ret = 0; + + ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args); + if (!ret) + iomap->length = args.blocks << inode->i_blkbits; + return ret; } static inline __be64 *gfs2_indirect_init(struct metapath *mp, @@ -462,15 +598,11 @@ enum alloc_state { }; /** - * gfs2_bmap_alloc - Build a metadata tree of the requested height + * gfs2_iomap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode - * @lblock: The logical starting block of the extent - * @bh_map: This is used to return the mapping details - * @zero_new: True if newly allocated blocks should be zeroed + * @iomap: The iomap structure + * @flags: iomap flags * @mp: The metapath, with proper height information calculated - * @maxlen: The max number of data blocks to alloc - * @dblock: Pointer to return the resulting new block - * @dblks: Pointer to return the number of blocks allocated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height @@ -483,6 +615,13 @@ enum alloc_state { * blocks are available, there will only be one request per bmap call) * and uses the state machine to initialise the blocks in order. * + * Right now, this function will allocate at most one indirect block + * worth of data -- with a default block size of 4K, that's slightly + * less than 2M. If this limitation is ever removed to allow huge + * allocations, we would probably still want to limit the iomap size we + * return to avoid stalling other tasks during huge writes; the next + * iomap iteration would then find the blocks already allocated. 
+ * * Returns: errno on error */ @@ -497,6 +636,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = mp->mp_fheight - 1; + int ret; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; @@ -507,6 +647,8 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, gfs2_trans_add_meta(ip->i_gl, dibh); + down_write(&ip->i_rw_mutex); + if (mp->mp_fheight == mp->mp_aheight) { struct buffer_head *bh; int eob; @@ -542,11 +684,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, blks = dblks + iblks; i = mp->mp_aheight; do { - int error; n = blks - alloced; - error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); - if (error) - return error; + ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (ret) + goto out; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_add_unrevoke(sdp, bn, n); @@ -602,7 +743,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, dblks = n; ptr = metapointer(end_of_metadata, mp); iomap->addr = bn << inode->i_blkbits; - iomap->flags |= IOMAP_F_NEW; + iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW; while (n-- > 0) *ptr++ = cpu_to_be64(bn++); break; @@ -612,64 +753,10 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, iomap->length = (u64)dblks << inode->i_blkbits; ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); - gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); - return 0; -} - -/** - * hole_size - figure out the size of a hole - * @inode: The inode - * @lblock: The logical starting block number - * @mp: The metapath - * - * Returns: The hole size in bytes - * - */ -static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct metapath mp_eof; - u64 factor = 1; - int hgt; - u64 holesz = 0; - const __be64 *first, *end, *ptr; - 
const struct buffer_head *bh; - u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits; - int zeroptrs; - bool done = false; - - /* Get another metapath, to the very last byte */ - find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height); - for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) { - bh = mp->mp_bh[hgt]; - if (bh) { - zeroptrs = 0; - first = metapointer(hgt, mp); - end = (const __be64 *)(bh->b_data + bh->b_size); - - for (ptr = first; ptr < end; ptr++) { - if (*ptr) { - done = true; - break; - } else { - zeroptrs++; - } - } - } else { - zeroptrs = sdp->sd_inptrs; - } - if (factor * zeroptrs >= lblock_stop - lblock + 1) { - holesz = lblock_stop - lblock + 1; - break; - } - holesz += factor * zeroptrs; - - factor *= sdp->sd_inptrs; - if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1])) - (mp->mp_list[hgt - 1])++; - } - return holesz << inode->i_blkbits; + gfs2_dinode_out(ip, dibh->b_data); +out: + up_write(&ip->i_rw_mutex); + return ret; } static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) @@ -680,126 +767,136 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) sizeof(struct gfs2_dinode); iomap->offset = 0; iomap->length = i_size_read(inode); - iomap->type = IOMAP_MAPPED; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; } +#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE + /** - * gfs2_iomap_begin - Map blocks from an inode to disk blocks + * gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode * @pos: Starting position in bytes * @length: Length to map, in bytes * @flags: iomap flags * @iomap: The iomap structure + * @mp: The metapath * * Returns: errno */ -int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap) +static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct metapath *mp) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd 
*sdp = GFS2_SB(inode); - struct metapath mp = { .mp_aheight = 1, }; - unsigned int factor = sdp->sd_sb.sb_bsize; - const u64 *arr = sdp->sd_heightsize; __be64 *ptr; sector_t lblock; - sector_t lend; - int ret = 0; + sector_t lblock_stop; + int ret; int eob; - unsigned int len; + u64 len; struct buffer_head *bh; u8 height; - trace_gfs2_iomap_start(ip, pos, length, flags); - if (!length) { - ret = -EINVAL; - goto out; - } + if (!length) + return -EINVAL; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_REPORT) { + if (pos >= i_size_read(inode)) + return -ENOENT; gfs2_stuffed_iomap(inode, iomap); - if (pos >= iomap->length) - ret = -ENOENT; - goto out; + return 0; } BUG_ON(!(flags & IOMAP_WRITE)); } - lblock = pos >> inode->i_blkbits; - lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits; - iomap->offset = lblock << inode->i_blkbits; - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - iomap->length = (u64)(lend - lblock) << inode->i_blkbits; - iomap->flags = IOMAP_F_MERGED; - bmap_lock(ip, flags & IOMAP_WRITE); + lblock_stop = (pos + length - 1) >> inode->i_blkbits; + len = lblock_stop - lblock + 1; - /* - * Directory data blocks have a struct gfs2_meta_header header, so the - * remaining size is smaller than the filesystem block size. Logical - * block numbers for directories are in units of this remaining size! 
- */ - if (gfs2_is_dir(ip)) { - factor = sdp->sd_jbsize; - arr = sdp->sd_jheightsize; - } + down_read(&ip->i_rw_mutex); - ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); + ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]); if (ret) - goto out_release; + goto unlock; height = ip->i_height; - while ((lblock + 1) * factor > arr[height]) + while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) height++; - find_metapath(sdp, lblock, &mp, height); + find_metapath(sdp, lblock, mp, height); if (height > ip->i_height || gfs2_is_stuffed(ip)) goto do_alloc; - ret = lookup_metapath(ip, &mp); + ret = lookup_metapath(ip, mp); if (ret) - goto out_release; + goto unlock; - if (mp.mp_aheight != ip->i_height) + if (mp->mp_aheight != ip->i_height) goto do_alloc; - ptr = metapointer(ip->i_height - 1, &mp); + ptr = metapointer(ip->i_height - 1, mp); if (*ptr == 0) goto do_alloc; - iomap->type = IOMAP_MAPPED; - iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + bh = mp->mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob); - bh = mp.mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob); + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + iomap->length = len << inode->i_blkbits; + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_MERGED; if (eob) - iomap->flags |= IOMAP_F_BOUNDARY; - iomap->length = (u64)len << inode->i_blkbits; + iomap->flags |= IOMAP_F_GFS2_BOUNDARY; -out_release: - release_metapath(&mp); - bmap_unlock(ip, flags & IOMAP_WRITE); out: - trace_gfs2_iomap_end(ip, iomap, ret); + iomap->bdev = inode->i_sb->s_bdev; +unlock: + up_read(&ip->i_rw_mutex); return ret; do_alloc: - if (flags & IOMAP_WRITE) { - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); - } else if (flags & IOMAP_REPORT) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = len << inode->i_blkbits; + iomap->type = IOMAP_HOLE; + iomap->flags = 0; + if (flags & IOMAP_REPORT) { loff_t size = 
i_size_read(inode); if (pos >= size) ret = -ENOENT; - else if (height <= ip->i_height) - iomap->length = hole_size(inode, lblock, &mp); + else if (height == ip->i_height) + ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else iomap->length = size - pos; } - goto out_release; + goto out; +} + +static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + trace_gfs2_iomap_start(ip, pos, length, flags); + if (flags & IOMAP_WRITE) { + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + if (!ret && iomap->type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + release_metapath(&mp); + } else { + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + release_metapath(&mp); + } + trace_gfs2_iomap_end(ip, iomap, ret); + return ret; } +const struct iomap_ops gfs2_iomap_ops = { + .iomap_begin = gfs2_iomap_begin, +}; + /** * gfs2_block_map - Map one or more blocks of an inode to a disk block * @inode: The inode @@ -825,34 +922,43 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh_map, int create) { struct gfs2_inode *ip = GFS2_I(inode); - struct iomap iomap; - int ret, flags = 0; + loff_t pos = (loff_t)lblock << inode->i_blkbits; + loff_t length = bh_map->b_size; + struct metapath mp = { .mp_aheight = 1, }; + struct iomap iomap = { }; + int ret; clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - if (create) - flags |= IOMAP_WRITE; - ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits, - bh_map->b_size, flags, &iomap); - if (ret) { - if (!create && ret == -ENOENT) { - /* Return unmapped buffer beyond the end of file. 
*/ + if (create) { + ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp); + if (!ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, &iomap, IOMAP_WRITE, &mp); + release_metapath(&mp); + } else { + ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); + release_metapath(&mp); + + /* Return unmapped buffer beyond the end of file. */ + if (ret == -ENOENT) { ret = 0; + goto out; } - goto out; } + if (ret) + goto out; if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; - iomap.flags &= ~IOMAP_F_BOUNDARY; + iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; } if (iomap.addr != IOMAP_NULL_ADDR) map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); bh_map->b_size = iomap.length; - if (iomap.flags & IOMAP_F_BOUNDARY) + if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) set_buffer_boundary(bh_map); if (iomap.flags & IOMAP_F_NEW) set_buffer_new(bh_map); @@ -945,8 +1051,10 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, err = 0; } - if (!gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); + else + gfs2_ordered_add_inode(ip); zero_user(page, offset, length); mark_buffer_dirty(bh); @@ -1056,6 +1164,19 @@ out: return error; } +int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); + if (!ret && iomap->type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, iomap, IOMAP_WRITE, &mp); + release_metapath(&mp); + return ret; +} + /** * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index c3402fe00653..6b18fb323f0a 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,11 +46,13 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } } +extern const struct iomap_ops gfs2_iomap_ops; + extern int gfs2_unstuff_dinode(struct gfs2_inode 
*ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -extern int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap); +extern int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); extern int gfs2_setattr_size(struct inode *inode, u64 size); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index d9fb0ad6cc30..d97ad89955d1 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -871,7 +871,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; - struct timespec tv = current_time(inode); + struct timespec64 tv = current_time(inode); error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -1055,7 +1055,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) /* Change the pointers. Don't bother distinguishing stuffed from non-stuffed. This code is complicated enough already. 
*/ - lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + lp = kmalloc_array(half_len, sizeof(__be64), GFP_NOFS); if (!lp) { error = -ENOMEM; goto fail_brelse; @@ -1169,7 +1169,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); @@ -1596,7 +1596,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, error = -ENOMEM; /* 96 is max number of dirents which can be stuffed into an inode */ - darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); + darr = kmalloc_array(96, sizeof(struct gfs2_dirent *), GFP_NOFS); if (darr) { g.pdent = (const struct gfs2_dirent **)darr; g.offset = 0; @@ -1802,7 +1802,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh = da->bh; struct gfs2_dirent *dent = da->dent; - struct timespec tv; + struct timespec64 tv; struct gfs2_leaf *leaf; int error; @@ -1880,7 +1880,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec tv = current_time(&dip->i_inode); + struct timespec64 tv = current_time(&dip->i_inode); /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4b71f021a9e2..7137db7b0119 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -733,7 +733,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct gfs2_inode *ip = GFS2_I(inode); loff_t end = offset + len; struct buffer_head *dibh; - struct iomap iomap; + struct iomap iomap = { }; int error; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -749,8 +749,8 @@ static int fallocate_chunk(struct inode *inode, 
loff_t offset, loff_t len, } while (offset < end) { - error = gfs2_iomap_begin(inode, offset, end - offset, - IOMAP_WRITE, &iomap); + error = gfs2_iomap_get_alloc(inode, offset, end - offset, + &iomap); if (error) goto out; offset = iomap.offset + iomap.length; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 097bd3c0f270..4614ee25f621 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1303,7 +1303,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) default: if (num_gh <= 4) break; - pph = kmalloc(num_gh * sizeof(struct gfs2_holder *), GFP_NOFS); + pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *), + GFP_NOFS); if (!pph) return -ENOMEM; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d8782a7a1e7d..c63bee9adb6a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -338,7 +338,7 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; - struct timespec atime; + struct timespec64 atime; u16 height, depth; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) @@ -361,7 +361,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0) ip->i_inode.i_atime = atime; ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 1b6b1e3f5caf..d2ad817e089f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -116,6 +116,7 @@ static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) { + BUG_ON(rbm->offset >= rbm->rgd->rd_data); return 
rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + rbm->offset; } @@ -696,8 +697,6 @@ struct gfs2_sbd { u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; - u32 sd_max_jheight; /* Max height of journaled file's meta tree */ - u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ struct gfs2_args sd_args; /* Mount arguments */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8700eb815638..feda55f67050 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2006,10 +2006,6 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, return 0; } -const struct iomap_ops gfs2_iomap_ops = { - .iomap_begin = gfs2_iomap_begin, -}; - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 1862e310a067..20241436126d 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/writeback.h> #include "incore.h" +#include "inode.h" /** * gfs2_log_lock - acquire the right to mess with the log manager @@ -50,8 +51,12 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_sbd *sdp; + if (!gfs2_is_ordered(ip)) + return; + + sdp = GFS2_SB(&ip->i_inode); if (!test_bit(GIF_ORDERED, &ip->i_flags)) { spin_lock(&sdp->sd_ordered_lock); if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3ba3f167641c..c2469833b4fb 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -335,25 +335,6 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_heightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); - 
sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_jheightsize[x - 1] || m) - break; - sdp->sd_jheightsize[x] = space; - } - sdp->sd_max_jheight = x; - sdp->sd_jheightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); - sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_leaf)) / GFS2_MIN_DIRENT_SIZE; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7a98abd340ee..0efae7a0ee80 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -735,7 +735,10 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (!buffer_uptodate(bh)) goto unlock_out; } - gfs2_trans_add_data(ip->i_gl, bh); + if (gfs2_is_jdata(ip)) + gfs2_trans_add_data(ip->i_gl, bh); + else + gfs2_ordered_add_inode(ip); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { @@ -883,7 +886,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - ghs = kmalloc(num_qd * sizeof(struct gfs2_holder), GFP_NOFS); + ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8b683917a27e..33abcf29bc05 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -372,8 +372,8 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) start = bi->bi_bh->b_data; if (bi->bi_clone) start = bi->bi_clone; - end = start + bi->bi_bh->b_size; start += bi->bi_offset; + end = start + bi->bi_len; BUG_ON(rbm.offset & 3); start += (rbm.offset / GFS2_NBBY); bytes = min_t(u32, len / GFS2_NBBY, (end - start)); @@ -2605,8 +2605,9 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int 
state) { unsigned int x; - rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder), - GFP_NOFS | __GFP_NOFAIL); + rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps, + sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, 0, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cf5c7f3080d2..af0d5b01cf0b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1097,7 +1097,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host int error = 0, err; memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kmalloc(slots * sizeof(struct gfs2_holder), GFP_KERNEL); + gha = kmalloc_array(slots, sizeof(struct gfs2_holder), GFP_KERNEL); if (!gha) return -ENOMEM; for (x = 0; x < slots; x++) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index c75cacaa349b..064c9a0ef046 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -143,32 +143,21 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, * @gl: The inode glock associated with the buffer * @bh: The buffer to add * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that it's - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. + * This is used in journaled data mode. + * We need to journal the data block in the same way as metadata in + * the functions above. 
The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_trans *tr = current->journal_info; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *mapping = bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_bufdata *bd; - if (!gfs2_is_jdata(ip)) { - gfs2_ordered_add_inode(ip); - return; - } - lock_buffer(bh); if (buffer_pinned(bh)) { set_bit(TR_TOUCHED, &tr->tr_flags); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 75b254280ff6..3bf2ae0e467c 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -31,21 +31,15 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { - hfs_find_exit(&fd); - if (res == -ENOENT) { - /* No such entry */ - inode = NULL; - goto done; - } - return ERR_PTR(res); + if (res != -ENOENT) + inode = ERR_PTR(res); + } else { + inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); + if (!inode) + inode = ERR_PTR(-EACCES); } - inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); - if (!inode) - return ERR_PTR(-EACCES); -done: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } /* diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 2538b49cc349..2a16111d312f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -351,7 +351,7 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->file.MdDat); + 
timespec_to_timespec64(hfs_m_to_utime(rec->file.MdDat)); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -362,7 +362,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->dir.MdDat); + timespec_to_timespec64(hfs_m_to_utime(rec->dir.MdDat)); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; @@ -543,9 +543,9 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, igrab(dir); hlist_add_fake(&inode->i_hash); mark_inode_dirty(inode); + dont_mount(dentry); out: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } void hfs_evict_inode(struct inode *inode) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 15e06fb552da..b5254378f011 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -122,8 +122,7 @@ again: if (S_ISREG(inode->i_mode)) HFSPLUS_I(inode)->linkid = linkid; out: - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); fail: hfs_find_exit(&fd); return ERR_PTR(err); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c0c8d433864f..c824f702feec 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -493,9 +493,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = hfsp_mt2ut(folder->access_date); - inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); - inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(folder->access_date)); + inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(folder->content_mod_date)); + inode->i_ctime = 
timespec_to_timespec64(hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -531,9 +531,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = hfsp_mt2ut(file->access_date); - inode->i_mtime = hfsp_mt2ut(file->content_mod_date); - inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(file->access_date)); + inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(file->content_mod_date)); + inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3cd85eb5bbb1..2597b290c2a5 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -555,9 +555,9 @@ static int read_name(struct inode *ino, char *name) set_nlink(ino, st.nlink); i_uid_write(ino, st.uid); i_gid_write(ino, st.gid); - ino->i_atime = st.atime; - ino->i_mtime = st.mtime; - ino->i_ctime = st.ctime; + ino->i_atime = timespec_to_timespec64(st.atime); + ino->i_mtime = timespec_to_timespec64(st.mtime); + ino->i_ctime = timespec_to_timespec64(st.ctime); ino->i_size = st.size; ino->i_blocks = st.blocks; return 0; @@ -838,15 +838,15 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_ATIME) { attrs.ia_valid |= HOSTFS_ATTR_ATIME; - attrs.ia_atime = attr->ia_atime; + attrs.ia_atime = timespec64_to_timespec(attr->ia_atime); } if (attr->ia_valid & ATTR_MTIME) { attrs.ia_valid |= HOSTFS_ATTR_MTIME; - attrs.ia_mtime = attr->ia_mtime; + attrs.ia_mtime = timespec64_to_timespec(attr->ia_mtime); } if (attr->ia_valid & ATTR_CTIME) { attrs.ia_valid |= 
HOSTFS_ATTR_CTIME; - attrs.ia_ctime = attr->ia_ctime; + attrs.ia_ctime = timespec64_to_timespec(attr->ia_ctime); } if (attr->ia_valid & ATTR_ATIME_SET) { attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index a4ad18afbdec..4ada525c5c43 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -33,7 +33,8 @@ int hpfs_add_pos(struct inode *inode, loff_t *pos) if (hpfs_inode->i_rddir_off[i] == pos) return 0; if (!(i&0x0f)) { - if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { + ppos = kmalloc_array(i + 0x11, sizeof(loff_t *), GFP_NOFS); + if (!ppos) { pr_err("out of memory for position list\n"); return -ENOMEM; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2577ef1034ef..2a153aed4c19 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -26,8 +26,7 @@ #include "hpfs.h" #define EIOERROR EIO -#define EFSERROR EPERM -#define EMEMERROR ENOMEM +#define EFSERROR EUCLEAN #define ANODE_ALLOC_FWD 512 #define FNODE_ALLOC_FWD 0 diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 7c49f1ef0c85..ecd9fccd1663 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -115,7 +115,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; int i; __le32 *b; - if (!(b = kmalloc(n * 512, GFP_KERNEL))) { + if (!(b = kmalloc_array(n, 512, GFP_KERNEL))) { pr_err("can't allocate memory for bitmap directory\n"); return NULL; } diff --git a/fs/inode.c b/fs/inode.c index 13ceb98c3bd3..8c86c809ca17 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -178,6 +178,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; @@ -1003,6 +1004,70 @@ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 
EXPORT_SYMBOL(unlock_two_nondirectories); /** + * inode_insert5 - obtain an inode from a mounted file system + * @inode: pre-allocated inode to use for insert to cache + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a variant of iget5_locked() for callers that don't want to fail on memory + * allocation of inode. + * + * If the inode is not in cache, insert the pre-allocated inode to cache and + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + struct inode *old; + +again: + spin_lock(&inode_hash_lock); + old = find_inode(inode->i_sb, head, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. 
+ */ + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(inode_unhashed(old))) { + iput(old); + goto again; + } + return old; + } + + if (set && unlikely(set(inode, data))) { + inode = NULL; + goto unlock; + } + + /* + * Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); +unlock: + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(inode_insert5); + +/** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get @@ -1026,66 +1091,18 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); - struct inode *inode; -again: - spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); - spin_unlock(&inode_hash_lock); + struct inode *inode = ilookup5(sb, hashval, test, data); - if (inode) { - wait_on_inode(inode); - if (unlikely(inode_unhashed(inode))) { - iput(inode); - goto again; - } - return inode; - } - - inode = alloc_inode(sb); - if (inode) { - struct inode *old; - - spin_lock(&inode_hash_lock); - /* We released the lock, so.. */ - old = find_inode(sb, head, test, data); - if (!old) { - if (set(inode, data)) - goto set_failed; + if (!inode) { + struct inode *new = new_inode(sb); - spin_lock(&inode->i_lock); - inode->i_state = I_NEW; - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); - spin_unlock(&inode_hash_lock); - - /* Return the locked inode with I_NEW set, the - * caller is responsible for filling in the contents - */ - return inode; - } - - /* - * Uhhuh, somebody else created the same inode under - * us. 
Use the old inode instead of the one we just - * allocated. - */ - spin_unlock(&inode_hash_lock); - destroy_inode(inode); - inode = old; - wait_on_inode(inode); - if (unlikely(inode_unhashed(inode))) { - iput(inode); - goto again; + if (new) { + inode = inode_insert5(new, hashval, test, set, data); + if (unlikely(inode != new)) + iput(new); } } return inode; - -set_failed: - spin_unlock(&inode_hash_lock); - destroy_inode(inode); - return NULL; } EXPORT_SYMBOL(iget5_locked); @@ -1426,43 +1443,13 @@ EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct super_block *sb = inode->i_sb; - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *old = inode_insert5(inode, hashval, test, NULL, data); - while (1) { - struct inode *old = NULL; - - spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, head, i_hash) { - if (old->i_sb != sb) - continue; - if (!test(old, data)) - continue; - spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { - spin_unlock(&old->i_lock); - continue; - } - break; - } - if (likely(!old)) { - spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); - return 0; - } - __iget(old); - spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); - wait_on_inode(old); - if (unlikely(!inode_unhashed(old))) { - iput(old); - return -EBUSY; - } + if (old != inode) { iput(old); + return -EBUSY; } + return 0; } EXPORT_SYMBOL(insert_inode_locked4); @@ -1590,8 +1577,8 @@ static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, if (upperdentry) { struct inode *realinode = d_inode(upperdentry); - if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || - !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + if ((!timespec64_equal(&inode->i_mtime, &realinode->i_mtime) || + 
!timespec64_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; } @@ -1614,12 +1601,12 @@ static int relatime_need_update(const struct path *path, struct inode *inode, /* * Is mtime younger than atime? If yes, update atime: */ - if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) return 1; /* * Is ctime younger than atime? If yes, update atime: */ - if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) return 1; /* @@ -1634,7 +1621,7 @@ static int relatime_need_update(const struct path *path, struct inode *inode, return 0; } -int generic_update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) { int iflags = I_DIRTY_TIME; bool dirty = false; @@ -1662,9 +1649,9 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +static int update_time(struct inode *inode, struct timespec64 *time, int flags) { - int (*update_time)(struct inode *, struct timespec *, int); + int (*update_time)(struct inode *, struct timespec64 *, int); update_time = inode->i_op->update_time ? 
inode->i_op->update_time : generic_update_time; @@ -1685,7 +1672,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, bool rcu) { struct vfsmount *mnt = path->mnt; - struct timespec now; + struct timespec64 now; if (inode->i_flags & S_NOATIME) return false; @@ -1708,10 +1695,10 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, now = current_time(inode); - if (!relatime_need_update(path, inode, now, rcu)) + if (!relatime_need_update(path, inode, timespec64_to_timespec(now), rcu)) return false; - if (timespec_equal(&inode->i_atime, &now)) + if (timespec64_equal(&inode->i_atime, &now)) return false; return true; @@ -1721,7 +1708,7 @@ void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); - struct timespec now; + struct timespec64 now; if (!__atime_needs_update(path, inode, false)) return; @@ -1855,7 +1842,7 @@ EXPORT_SYMBOL(file_remove_privs); int file_update_time(struct file *file) { struct inode *inode = file_inode(file); - struct timespec now; + struct timespec64 now; int sync_it = 0; int ret; @@ -1864,10 +1851,10 @@ int file_update_time(struct file *file) return 0; now = current_time(inode); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, &now)) sync_it = S_MTIME; - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2012,8 +1999,14 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, inode->i_uid = current_fsuid(); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; + + /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; + else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(dir, CAP_FSETID)) + mode &= ~S_ISGID; } else inode->i_gid = current_fsgid(); 
inode->i_mode = mode; @@ -2111,6 +2104,30 @@ void inode_nohighmem(struct inode *inode) EXPORT_SYMBOL(inode_nohighmem); /** + * timespec64_trunc - Truncate timespec64 to a granularity + * @t: Timespec64 + * @gran: Granularity in ns. + * + * Truncate a timespec64 to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). + */ +struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran) +{ + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) { + /* nothing */ + } else if (gran == NSEC_PER_SEC) { + t.tv_nsec = 0; + } else if (gran > 1 && gran < NSEC_PER_SEC) { + t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); + } + return t; +} +EXPORT_SYMBOL(timespec64_trunc); + +/** * current_time - Return FS time * @inode: inode. * @@ -2120,15 +2137,15 @@ EXPORT_SYMBOL(inode_nohighmem); * Note that inode and inode->sb cannot be NULL. * Otherwise, the function warns and returns time without truncation. 
*/ -struct timespec current_time(struct inode *inode) +struct timespec64 current_time(struct inode *inode) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); if (unlikely(!inode->i_sb)) { WARN(1, "current_time() called with uninitialized super_block in the inode"); return now; } - return timespec_trunc(now, inode->i_sb->s_time_gran); + return timespec64_trunc(now, inode->i_sb->s_time_gran); } EXPORT_SYMBOL(current_time); diff --git a/fs/internal.h b/fs/internal.h index e08972db0303..5645b4ebf494 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,8 +125,8 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); +extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); -extern struct file *filp_clone_open(struct file *); /* * inode.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 4823431d1c9d..b445b13fc59b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -549,7 +549,7 @@ static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. 
*/ @@ -566,7 +566,7 @@ static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ diff --git a/fs/iomap.c b/fs/iomap.c index afd163586aa0..77397b5a96ef 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/backing-dev.h> @@ -27,6 +28,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include <linux/sched/signal.h> +#include <linux/swap.h> #include "internal.h" @@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, return written ? written : ret; } +static sector_t +iomap_sector(struct iomap *iomap, loff_t pos) +{ + return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; +} + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = (iomap->addr + - (pos & PAGE_MASK) - iomap->offset) >> 9; - - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, - offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); } static loff_t @@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; + case IOMAP_MAPPED: + break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; - case IOMAP_MAPPED: + case IOMAP_INLINE: + flags |= FIEMAP_EXTENT_DATA_INLINE; break; } @@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & 
IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; - if (iomap->flags & IOMAP_F_DATA_INLINE) - flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, @@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * Returns true if found and updates @lastoff to the offset in file. + */ +static bool +page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, + int whence) +{ + const struct address_space_operations *ops = inode->i_mapping->a_ops; + unsigned int bsize = i_blocksize(inode), off; + bool seek_data = whence == SEEK_DATA; + loff_t poff = page_offset(page); + + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) + return false; + + if (*lastoff < poff) { + /* + * Last offset smaller than the start of the page means we found + * a hole: + */ + if (whence == SEEK_HOLE) + return true; + *lastoff = poff; + } + + /* + * Just check the page unless we can and should check block ranges: + */ + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) + return PageUptodate(page) == seek_data; + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping)) + goto out_unlock_not_found; + + for (off = 0; off < PAGE_SIZE; off += bsize) { + if ((*lastoff & ~PAGE_MASK) >= off + bsize) + continue; + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { + unlock_page(page); + return true; + } + *lastoff = poff + off + bsize; + } + +out_unlock_not_found: + unlock_page(page); + return false; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: uptodate buffer heads count as data; everything else + * counts as a hole. + * + * Returns the resulting offset on successs, and -ENOENT otherwise. 
+ */ +static loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i; + + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page_seek_hole_data(inode, page, &lastoff, whence)) + goto check_range; + lastoff = page_offset(page) + PAGE_SIZE; + } + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + + static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, void *data, struct iomap *iomap) @@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_WRITE_FUA (1 << 28) +#define IOMAP_DIO_NEED_SYNC (1 << 29) #define IOMAP_DIO_WRITE (1 << 30) #define IOMAP_DIO_DIRTY (1 << 31) @@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + /* + * If this is a DSYNC write, make sure we push it to stable storage now + * that we've written data. 
+ */ + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + ret = generic_write_sync(iocb, ret); + inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); @@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - bool is_write = (dio->flags & IOMAP_DIO_WRITE); - ssize_t ret; - ret = iomap_dio_complete(dio); - if (is_write && ret > 0) - ret = generic_write_sync(iocb, ret); - iocb->ki_complete(iocb, ret, 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); } /* @@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); - if (bio_add_page(bio, page, len, 0) != len) - BUG(); + __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); atomic_inc(&dio->ref); @@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter iter; struct bio *bio; bool need_zeroout = false; + bool use_fua = false; int nr_pages, ret; size_t copied = 0; @@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, case IOMAP_MAPPED: if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; + } else { + /* + * Use a FUA write if we need datasync semantics, this + * is a pure data IO that doesn't require any metadata + * updates and the underlying device supports FUA. This + * allows us to avoid cache flushes on IO completion. 
+ */ + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && + (dio->flags & IOMAP_DIO_WRITE_FUA) && + blk_queue_fua(bdev_get_queue(iomap->bdev))) + use_fua = true; + } break; default: WARN_ON_ONCE(1); @@ -916,9 +1044,9 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; + bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -930,10 +1058,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (use_fua) + bio->bi_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } @@ -961,6 +1093,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return copied; } +/* + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO + * is being issued as AIO or not. This allows us to optimise pure data writes + * to use REQ_FUA rather than requiring generic_write_sync() to issue a + * REQ_FLUSH post write. This is slightly tricky because a single request here + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued + * may be pure data writes. In that case, we still need to do a full data sync + * completion. 
+ */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io) @@ -1005,8 +1146,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iter->type == ITER_IOVEC) dio->flags |= IOMAP_DIO_DIRTY; } else { - dio->flags |= IOMAP_DIO_WRITE; flags |= IOMAP_WRITE; + dio->flags |= IOMAP_DIO_WRITE; + + /* for data sync or sync, we need sync completion processing */ + if (iocb->ki_flags & IOCB_DSYNC) + dio->flags |= IOMAP_DIO_NEED_SYNC; + + /* + * For datasync only writes, we optimistically try using FUA for + * this IO. Any non-FUA write that occurs will clear this flag, + * hence we know before completion whether a cache flush is + * necessary. + */ + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) + dio->flags |= IOMAP_DIO_WRITE_FUA; } if (iocb->ki_flags & IOCB_NOWAIT) { @@ -1062,6 +1216,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); + /* + * If all the writes we issued were FUA, we don't need to flush the + * cache on IO completion. Clear the sync flag for this case. + */ + if (dio->flags & IOMAP_DIO_WRITE_FUA) + dio->flags &= ~IOMAP_DIO_NEED_SYNC; + if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; @@ -1089,3 +1250,207 @@ out_free_dio: return ret; } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +/* Swapfile activation */ + +#ifdef CONFIG_SWAP +struct iomap_swapfile_info { + struct iomap iomap; /* accumulated iomap */ + struct swap_info_struct *sis; + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ + uint64_t highest_ppage; /* highest physical addr seen (pages) */ + unsigned long nr_pages; /* number of pages collected */ + int nr_extents; /* extent count */ +}; + +/* + * Collect physical extents for this swap file. Physical extents reported to + * the swap code must be trimmed to align to a page boundary. 
The logical + * offset within the file is irrelevant since the swapfile code maps logical + * page numbers of the swap device to the physical page-aligned extents. + */ +static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) +{ + struct iomap *iomap = &isi->iomap; + unsigned long nr_pages; + uint64_t first_ppage; + uint64_t first_ppage_reported; + uint64_t next_ppage; + int error; + + /* + * Round the start up and the end down so that the physical + * extent aligns to a page boundary. + */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + + /* Skip too-short physical extents. */ + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. The mm still wants that first + * page fed to add_swap_extent, however. + */ + first_ppage_reported = first_ppage; + if (iomap->offset == 0) + first_ppage_reported++; + if (isi->lowest_ppage > first_ppage_reported) + isi->lowest_ppage = first_ppage_reported; + if (isi->highest_ppage < (next_ppage - 1)) + isi->highest_ppage = next_ppage - 1; + + /* Add extent, set up for the next call. */ + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); + if (error < 0) + return error; + isi->nr_extents += error; + isi->nr_pages += nr_pages; + return 0; +} + +/* + * Accumulate iomaps for this swap file. We have to accumulate iomaps because + * swap only cares about contiguous page-aligned physical extents and makes no + * distinction between written and unwritten extents. + */ +static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, + loff_t count, void *data, struct iomap *iomap) +{ + struct iomap_swapfile_info *isi = data; + int error; + + switch (iomap->type) { + case IOMAP_MAPPED: + case IOMAP_UNWRITTEN: + /* Only real or unwritten extents. 
*/ + break; + case IOMAP_INLINE: + /* No inline data. */ + pr_err("swapon: file is inline\n"); + return -EINVAL; + default: + pr_err("swapon: file has unallocated extents\n"); + return -EINVAL; + } + + /* No uncommitted metadata or shared blocks. */ + if (iomap->flags & IOMAP_F_DIRTY) { + pr_err("swapon: file is not committed\n"); + return -EINVAL; + } + if (iomap->flags & IOMAP_F_SHARED) { + pr_err("swapon: file has shared extents\n"); + return -EINVAL; + } + + /* Only one bdev per swap file. */ + if (iomap->bdev != isi->sis->bdev) { + pr_err("swapon: file is on multiple devices\n"); + return -EINVAL; + } + + if (isi->iomap.length == 0) { + /* No accumulated extent, so just store it. */ + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { + /* Append this to the accumulated extent. */ + isi->iomap.length += iomap->length; + } else { + /* Otherwise, add the retained iomap and store this one. */ + error = iomap_swapfile_add_extent(isi); + if (error) + return error; + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } + return count; +} + +/* + * Iterate a swap file's iomaps to construct physical extents that can be + * passed to the swapfile subsystem. + */ +int iomap_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *pagespan, + const struct iomap_ops *ops) +{ + struct iomap_swapfile_info isi = { + .sis = sis, + .lowest_ppage = (sector_t)-1ULL, + }; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = 0; + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); + loff_t ret; + + /* + * Persist all file mapping metadata so that we won't have any + * IOMAP_F_DIRTY iomaps. 
+ */ + ret = vfs_fsync(swap_file, 1); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, + ops, &isi, iomap_swapfile_activate_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + if (isi.iomap.length) { + ret = iomap_swapfile_add_extent(&isi); + if (ret) + return ret; + } + + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; + sis->max = isi.nr_pages; + sis->pages = isi.nr_pages - 1; + sis->highest_bit = isi.nr_pages - 1; + return isi.nr_extents; +} +EXPORT_SYMBOL_GPL(iomap_swapfile_activate); +#endif /* CONFIG_SWAP */ + +static loff_t +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + sector_t *bno = data, addr; + + if (iomap->type == IOMAP_MAPPED) { + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; + if (addr > INT_MAX) + WARN(1, "would truncate bmap result\n"); + else + *bno = addr; + } + return 0; +} + +/* legacy ->bmap interface. 0 is the error return (!) 
*/ +sector_t +iomap_bmap(struct address_space *mapping, sector_t bno, + const struct iomap_ops *ops) +{ + struct inode *inode = mapping->host; + loff_t pos = bno >> inode->i_blkbits; + unsigned blocksize = i_blocksize(inode); + + if (filemap_write_and_wait(mapping)) + return 0; + + bno = 0; + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + return bno; +} +EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dfb057900e79..8ef6b6daaa7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } EXPORT_SYMBOL(__jbd2_debug); @@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void) int i; for (i = 0; i < JBD2_MAX_SLABS; i++) { - if (jbd2_slab[i]) - kmem_cache_destroy(jbd2_slab[i]); + kmem_cache_destroy(jbd2_slab[i]); jbd2_slab[i] = NULL; } } @@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void) static void jbd2_journal_destroy_journal_head_cache(void) { - if (jbd2_journal_head_cache) { - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; - } + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; } /* @@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void) static void jbd2_journal_destroy_handle_cache(void) { - if (jbd2_handle_cache) - kmem_cache_destroy(jbd2_handle_cache); - if (jbd2_inode_cache) - kmem_cache_destroy(jbd2_inode_cache); - + kmem_cache_destroy(jbd2_handle_cache); + jbd2_handle_cache = NULL; + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 696ef15ec942..a1143e57a718 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -180,14 +180,10 @@ 
static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, void jbd2_journal_destroy_revoke_caches(void) { - if (jbd2_revoke_record_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - } - if (jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; - } + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_caches(void) @@ -227,7 +223,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); + kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8aa453784402..c0b66a7a795b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void) void jbd2_journal_destroy_transaction_cache(void) { - if (transaction_cache) { - kmem_cache_destroy(transaction_cache); - transaction_cache = NULL; - } + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) @@ -1363,6 +1361,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { jbd_lock_bh_state(bh); + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) + pr_err("JBD2: assertion failure: h_type=%u " + "h_line_no=%u block_no=%llu jlist=%u\n", + handle->h_type, handle->h_line_no, + (unsigned long long) bh->b_blocknr, + jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == 
BJ_Metadata); jbd_unlock_bh_state(bh); @@ -1382,11 +1387,11 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the transaction. This needs to be done * once a transaction -bzzz */ - jh->b_modified = 1; if (handle->h_buffer_credits <= 0) { ret = -ENOSPC; goto out_unlock_bh; } + jh->b_modified = 1; handle->h_buffer_credits--; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7ebacf14837f..093ffbd82395 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -133,7 +133,8 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) size_t i; *size = jffs2_acl_size(acl->a_count); - header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL); + header = kmalloc(struct_size(header, a_entries, acl->a_count), + GFP_KERNEL); if (!header) return ERR_PTR(-ENOMEM); header->a_version = cpu_to_je32(JFFS2_ACL_VERSION); diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 2e2b5745c3b7..12d0271bdde3 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -22,6 +22,7 @@ struct jffs2_acl_entry_short { struct jffs2_acl_header { jint32_t a_version; + struct jffs2_acl_entry a_entries[]; }; #ifdef CONFIG_JFFS2_FS_POSIX_ACL diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 0a754f38462e..b2944f9218f7 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -201,7 +201,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, if (ret) goto fail; - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); jffs2_free_raw_inode(ri); @@ -209,8 +209,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -235,7 +234,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if 
(dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return ret; } /***********************************************************************/ @@ -269,7 +268,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); ihold(d_inode(old_dentry)); } return ret; @@ -419,7 +418,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -430,8 +429,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -563,7 +561,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -575,8 +573,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -601,7 +598,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, 
dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -736,7 +733,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -747,8 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -857,14 +853,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = new_dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index bd0428bebe9b..481afd4c2e1a 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -308,7 +308,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_ctime = inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index eab04eca95a3..0ecfb8ea38cd 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c 
@@ -146,9 +146,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return PTR_ERR(new_metadata); } /* It worked. Update the inode */ - inode->i_atime = ITIME(je32_to_cpu(ri->atime)); - inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); - inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode->i_atime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->atime))); + inode->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); + inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->mtime))); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); i_gid_write(inode, je16_to_cpu(ri->gid)); @@ -280,9 +280,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); inode->i_size = je32_to_cpu(latest_node.isize); - inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); - inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); - inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + inode->i_atime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.atime))); + inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.mtime))); + inode->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 2cfe487708e0..c6821a509481 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1208,7 +1208,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c) if (!c->wbuf) return -ENOMEM; - c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->oobavail, GFP_KERNEL); + c->oobbuf = kmalloc_array(NR_OOB_SCAN_PAGES, c->oobavail, GFP_KERNEL); if (!c->oobbuf) { kfree(c->wbuf); return -ENOMEM; diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index a70907606025..35a5b2a81ae0 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -29,7 +29,6 @@ #ifdef PROC_FS_JFS /* see jfs_debug.h */ -static struct 
proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG static int jfs_loglevel_proc_show(struct seq_file *m, void *v) { @@ -66,43 +65,29 @@ static const struct file_operations jfs_loglevel_proc_fops = { }; #endif -static struct { - const char *name; - const struct file_operations *proc_fops; -} Entries[] = { -#ifdef CONFIG_JFS_STATISTICS - { "lmstats", &jfs_lmstats_proc_fops, }, - { "txstats", &jfs_txstats_proc_fops, }, - { "xtstat", &jfs_xtstat_proc_fops, }, - { "mpstat", &jfs_mpstat_proc_fops, }, -#endif -#ifdef CONFIG_JFS_DEBUG - { "TxAnchor", &jfs_txanchor_proc_fops, }, - { "loglevel", &jfs_loglevel_proc_fops } -#endif -}; -#define NPROCENT ARRAY_SIZE(Entries) - void jfs_proc_init(void) { - int i; + struct proc_dir_entry *base; - if (!(base = proc_mkdir("fs/jfs", NULL))) + base = proc_mkdir("fs/jfs", NULL); + if (!base) return; - for (i = 0; i < NPROCENT; i++) - proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +#ifdef CONFIG_JFS_STATISTICS + proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show); + proc_create_single("txstats", 0, base, jfs_txstats_proc_show); + proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show); + proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show); +#endif +#ifdef CONFIG_JFS_DEBUG + proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); +#endif } void jfs_proc_clean(void) { - int i; - - if (base) { - for (i = 0; i < NPROCENT; i++) - remove_proc_entry(Entries[i].name, base); - remove_proc_entry("fs/jfs", NULL); - } + remove_proc_subtree("fs/jfs", NULL); } #endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index eafd1300a00b..0d9e35da8462 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern const struct file_operations jfs_txanchor_proc_fops; +int jfs_txanchor_proc_show(struct seq_file *m, void *v); /* information message: e.g., 
configuration, major event */ #define jfs_info(fmt, arg...) do { \ @@ -105,10 +105,10 @@ extern const struct file_operations jfs_txanchor_proc_fops; * ---------- */ #ifdef CONFIG_JFS_STATISTICS -extern const struct file_operations jfs_lmstats_proc_fops; -extern const struct file_operations jfs_txstats_proc_fops; -extern const struct file_operations jfs_mpstat_proc_fops; -extern const struct file_operations jfs_xtstat_proc_fops; +int jfs_lmstats_proc_show(struct seq_file *m, void *v); +int jfs_txstats_proc_show(struct seq_file *m, void *v); +int jfs_mpstat_proc_show(struct seq_file *m, void *v); +int jfs_xtstat_proc_show(struct seq_file *m, void *v); #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 2d514c7affc2..49263e220dbc 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1641,7 +1641,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) max_ranges = nblocks; do_div(max_ranges, minlen); range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); - totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); + totrim = kmalloc_array(range_cnt, sizeof(struct range2trim), GFP_NOFS); if (totrim == NULL) { jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n"); IWRITE_UNLOCK(ipbmap); diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index de2bcb36e079..52bae3f5c914 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -594,7 +594,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct component_name ciKey; struct super_block *sb = ip->i_sb; - ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), + GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; @@ -957,7 +958,7 @@ static int dtSplitUp(tid_t tid, smp = split->mp; sp = DT_PAGE(ip, smp); - key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS); + key.name = kmalloc_array(JFS_NAME_MAX + 2, 
sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; @@ -3779,12 +3780,12 @@ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, struct component_name lkey; struct component_name rkey; - lkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; - rkey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0e5d412c0b01..6b68df395892 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2493,7 +2493,7 @@ exit: } #ifdef CONFIG_JFS_STATISTICS -static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +int jfs_lmstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Logmgr stats\n" @@ -2510,16 +2510,4 @@ static int jfs_lmstats_proc_show(struct seq_file *m, void *v) lmStat.partial_page); return 0; } - -static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_lmstats_proc_show, NULL); -} - -const struct file_operations jfs_lmstats_proc_fops = { - .open = jfs_lmstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1a3b0cc22ad3..fa2c6824c7f2 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -815,7 +815,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +int jfs_mpstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Metapage statistics\n" @@ -828,16 +828,4 @@ static int jfs_mpstat_proc_show(struct seq_file *m, void *v) mpStat.lockwait); return 0; } - -static int jfs_mpstat_proc_open(struct inode *inode, 
struct file *file) -{ - return single_open(file, jfs_mpstat_proc_show, NULL); -} - -const struct file_operations jfs_mpstat_proc_fops = { - .open = jfs_mpstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 4d973524c887..a5663cb621d8 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2998,7 +2998,7 @@ int jfs_sync(void *arg) } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) -static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +int jfs_txanchor_proc_show(struct seq_file *m, void *v) { char *freewait; char *freelockwait; @@ -3032,22 +3032,10 @@ static int jfs_txanchor_proc_show(struct seq_file *m, void *v) list_empty(&TxAnchor.unlock_queue) ? "" : "not "); return 0; } - -static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txanchor_proc_show, NULL); -} - -const struct file_operations jfs_txanchor_proc_fops = { - .open = jfs_txanchor_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) -static int jfs_txstats_proc_show(struct seq_file *m, void *v) +int jfs_txstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS TxStats\n" @@ -3072,16 +3060,4 @@ static int jfs_txstats_proc_show(struct seq_file *m, void *v) TxStat.txLockAlloc_freelock); return 0; } - -static int jfs_txstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txstats_proc_show, NULL); -} - -const struct file_operations jfs_txstats_proc_fops = { - .open = jfs_txstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c index c7de6f5bbefc..0148e2e4d97a 100644 --- a/fs/jfs/jfs_unicode.c +++ b/fs/jfs/jfs_unicode.c @@ -121,7 +121,7 @@ int get_UCSname(struct component_name * 
uniName, struct dentry *dentry) return -ENAMETOOLONG; uniName->name = - kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); + kmalloc_array(length + 1, sizeof(wchar_t), GFP_NOFS); if (uniName->name == NULL) return -ENOMEM; diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5cde6d2fcfca..2c200b5256a6 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3874,7 +3874,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +int jfs_xtstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Xtree statistics\n" @@ -3887,16 +3887,4 @@ static int jfs_xtstat_proc_show(struct seq_file *m, void *v) xtStat.split); return 0; } - -static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_xtstat_proc_show, NULL); -} - -const struct file_operations jfs_xtstat_proc_fops = { - .open = jfs_xtstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b41596d71858..56c3fcbfe80e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1059,8 +1057,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1447,8 +1444,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } 
else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out1: diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index c60f3d32ee91..a6797986b625 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -491,15 +491,17 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (size > PSIZE) { /* * To keep the rest of the code simple. Allocate a - * contiguous buffer to work with + * contiguous buffer to work with. Make the buffer large + * enough to make use of the whole extent. */ - ea_buf->xattr = kmalloc(size, GFP_KERNEL); + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + ea_buf->xattr = kmalloc(ea_buf->max_size, GFP_KERNEL); if (ea_buf->xattr == NULL) return -ENOMEM; ea_buf->flag = EA_MALLOC; - ea_buf->max_size = (size + sb->s_blocksize - 1) & - ~(sb->s_blocksize - 1); if (ea_size == 0) return 0; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 89d1dc19340b..d66cc0777303 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -779,7 +779,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr = parent->iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ktime_get_real_ts(&ps_iattrs->ia_ctime); + ktime_get_real_ts64(&ps_iattrs->ia_ctime); ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; } @@ -1306,7 +1306,7 @@ static void __kernfs_remove(struct kernfs_node *kn) /* update timestamps on the parent */ if (ps_iattr) { - ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime); + ktime_get_real_ts64(&ps_iattr->ia_iattr.ia_ctime); ps_iattr->ia_iattr.ia_mtime = ps_iattr->ia_iattr.ia_ctime; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index fd5ce883072e..2015d8c45e4a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -348,11 +348,11 @@ static void kernfs_vma_open(struct vm_area_struct *vma) kernfs_put_active(of->kn); } -static int kernfs_vma_fault(struct vm_fault *vmf) +static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; 
struct kernfs_open_file *of = kernfs_of(file); - int ret; + vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; @@ -368,11 +368,11 @@ static int kernfs_vma_fault(struct vm_fault *vmf) return ret; } -static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); - int ret; + vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index a34303981deb..3d73fe9d56e2 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -52,7 +52,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; - ktime_get_real_ts(&iattrs->ia_atime); + ktime_get_real_ts64(&iattrs->ia_atime); iattrs->ia_mtime = iattrs->ia_atime; iattrs->ia_ctime = iattrs->ia_atime; @@ -176,9 +176,9 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) struct super_block *sb = inode->i_sb; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran); - inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran); - inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran); + inode->i_atime = timespec64_trunc(iattr->ia_atime, sb->s_time_gran); + inode->i_mtime = timespec64_trunc(iattr->ia_mtime, sb->s_time_gran); + inode->i_ctime = timespec64_trunc(iattr->ia_ctime, sb->s_time_gran); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 26dd9a50f383..ff2716f9322e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -316,6 +316,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; + INIT_LIST_HEAD(&info->node); sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, 
&init_user_ns, info); diff --git a/fs/locks.c b/fs/locks.c index 62bbe8b31f26..db7b6917d9c5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1562,7 +1562,7 @@ EXPORT_SYMBOL(__break_lease); * exclusive leases. The justification is that if someone has an * exclusive lease, then they could be modifying it. */ -void lease_get_mtime(struct inode *inode, struct timespec *time) +void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; @@ -2788,22 +2788,10 @@ static const struct seq_operations locks_seq_operations = { .show = locks_show, }; -static int locks_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &locks_seq_operations, - sizeof(struct locks_iterator)); -} - -static const struct file_operations proc_locks_operations = { - .open = locks_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init proc_locks_init(void) { - proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, + sizeof(struct locks_iterator), NULL); return 0; } fs_initcall(proc_locks_init); diff --git a/fs/mbcache.c b/fs/mbcache.c index bf41e2e72c18..081ccf0caee3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -353,8 +353,9 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_list); spin_lock_init(&cache->c_list_lock); - cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), - GFP_KERNEL); + cache->c_hash = kmalloc_array(bucket_count, + sizeof(struct hlist_bl_head), + GFP_KERNEL); if (!cache->c_hash) { kfree(cache); goto err_out; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index ccf0f00030bf..1a6084d2b02e 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -28,13 +28,9 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return ERR_PTR(-ENAMETOOLONG); ino = 
minix_inode_by_name(dentry); - if (ino) { + if (ino) inode = minix_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) diff --git a/fs/namei.c b/fs/namei.c index 186bd2464fd5..734cef54fdf8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -537,12 +537,12 @@ static int __nd_alloc_stack(struct nameidata *nd) struct saved *p; if (nd->flags & LOOKUP_RCU) { - p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), GFP_ATOMIC); if (unlikely(!p)) return -ECHILD; } else { - p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), GFP_KERNEL); if (unlikely(!p)) return -ENOMEM; @@ -984,13 +984,15 @@ static bool safe_hardlink_source(struct inode *inode) */ static int may_linkat(struct path *link) { - struct inode *inode; + struct inode *inode = link->dentry->d_inode; + + /* Inode writeback is not safe when the uid or gid are invalid. */ + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; - inode = link->dentry->d_inode; - /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. 
*/ @@ -1438,10 +1440,8 @@ static int path_parent_directory(struct path *path) static int follow_dotdot(struct nameidata *nd) { while(1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { int ret = path_parent_directory(&nd->path); if (ret) @@ -2464,6 +2464,35 @@ static int lookup_one_len_common(const char *name, struct dentry *base, } /** + * try_lookup_one_len - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Look up a dentry by name in the dcache, returning NULL if it does not + * currently exist. The function does not try to create a dentry. + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The caller must hold base->i_mutex. + */ +struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) +{ + struct qstr this; + int err; + + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); + + err = lookup_one_len_common(name, base, len, &this); + if (err) + return ERR_PTR(err); + + return lookup_dcache(&this, base, 0); +} +EXPORT_SYMBOL(try_lookup_one_len); + +/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2749,6 +2778,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); + + /* Inode writeback is not safe when the uid or gid are invalid. 
*/ + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EOVERFLOW; + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -3367,7 +3401,9 @@ finish_open_created: goto out; *opened |= FILE_OPENED; opened: - error = ima_file_check(file, op->acc_mode, *opened); + error = open_check_o_direct(file); + if (!error) + error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: @@ -3447,6 +3483,9 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = finish_open(file, child, NULL, opened); if (error) goto out2; + error = open_check_o_direct(file); + if (error) + fput(file); out2: mnt_drop_write(path.mnt); out: @@ -3672,7 +3711,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && + !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -3847,11 +3887,11 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out; - shrink_dcache_parent(dentry); error = dir->i_op->rmdir(dir, dentry); if (error) goto out; + shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); @@ -4434,8 +4474,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_nlink >= max_links) goto out; } - if (is_dir && !(flags & RENAME_EXCHANGE) && target) - shrink_dcache_parent(new_dentry); if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -4452,8 +4490,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; if (!(flags & RENAME_EXCHANGE) && target) { - if (is_dir) + if (is_dir) { + shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; + } dont_mount(new_dentry); detach_mounts(new_dentry); } diff --git 
a/fs/namespace.c b/fs/namespace.c index 5f75969adff1..8ddd14806799 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1590,7 +1590,7 @@ static int do_umount(struct mount *mnt, int flags) * Special case for "unmounting" root ... * we just try to remount it readonly. */ - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; down_write(&sb->s_umount); if (!sb_rdonly(sb)) @@ -2333,7 +2333,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, down_write(&sb->s_umount); if (ms_flags & MS_BIND) err = change_mount_flags(path->mnt, ms_flags); - else if (!capable(CAP_SYS_ADMIN)) + else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) err = -EPERM; else err = do_remount_sb(sb, sb_flags, data, 0); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a50d7813e3ea..64c214fb9da6 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -40,7 +40,9 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) { + if (IS_ERR(inode)) { + if (inode == ERR_PTR(-EAGAIN)) + res->status = htonl(NFS4ERR_DELAY); trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, -ntohl(res->status)); goto out; @@ -54,8 +56,8 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; - res->mtime = inode->i_mtime; + res->ctime = timespec64_to_timespec(inode->i_ctime); + res->mtime = timespec64_to_timespec(inode->i_mtime); res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & @@ -86,7 +88,9 @@ __be32 nfs4_callback_recall(void *argp, void *resp, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) { + if (IS_ERR(inode)) { + if 
(inode == ERR_PTR(-EAGAIN)) + res = htonl(NFS4ERR_DELAY); trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, &args->stateid, -ntohl(res)); goto out; @@ -124,7 +128,6 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, struct inode *inode; struct pnfs_layout_hdr *lo; -restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { if (stateid != NULL && @@ -132,20 +135,20 @@ restart: continue; inode = igrab(lo->plh_inode); if (!inode) - continue; + return ERR_PTR(-EAGAIN); if (!nfs_sb_active(inode->i_sb)) { rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); rcu_read_lock(); - goto restart; + return ERR_PTR(-EAGAIN); } return inode; } } - return NULL; + return ERR_PTR(-ENOENT); } /* @@ -162,7 +165,6 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, struct inode *inode; struct pnfs_layout_hdr *lo; -restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { nfsi = NFS_I(lo->plh_inode); @@ -172,20 +174,20 @@ restart: continue; inode = igrab(lo->plh_inode); if (!inode) - continue; + return ERR_PTR(-EAGAIN); if (!nfs_sb_active(inode->i_sb)) { rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); rcu_read_lock(); - goto restart; + return ERR_PTR(-EAGAIN); } return inode; } } - return NULL; + return ERR_PTR(-ENOENT); } static struct inode *nfs_layout_find_inode(struct nfs_client *clp, @@ -197,7 +199,7 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, spin_lock(&clp->cl_lock); rcu_read_lock(); inode = nfs_layout_find_inode_by_stateid(clp, stateid); - if (!inode) + if (inode == ERR_PTR(-ENOENT)) inode = nfs_layout_find_inode_by_fh(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -252,8 +254,11 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); ino = 
nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); - if (!ino) - goto out; + if (IS_ERR(ino)) { + if (ino == ERR_PTR(-EAGAIN)) + rv = NFS4ERR_DELAY; + goto out_noput; + } pnfs_layoutcommit_inode(ino, false); @@ -299,9 +304,10 @@ unlock: nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); out: + nfs_iput_and_deactive(ino); +out_noput: trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); - nfs_iput_and_deactive(ino); return rv; } @@ -322,6 +328,8 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + write_seqcount_begin(&clp->cl_callback_count); + write_seqcount_end(&clp->cl_callback_count); if (args->cbl_recall_type == RETURN_FILE) return initiate_file_draining(clp, args); return initiate_bulk_draining(clp, args); @@ -420,11 +428,8 @@ validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, return htonl(NFS4ERR_SEQ_FALSE_RETRY); } - /* Wraparound */ - if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) { - if (args->csa_sequenceid == 1) - return htonl(NFS4_OK); - } else if (likely(args->csa_sequenceid == slot->seq_nr + 1)) + /* Note: wraparound relies on seq_nr being of type u32 */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) return htonl(NFS4_OK); /* Misordered request */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b9129e2befea..377a61654a88 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -969,7 +969,8 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, } if (!(fattr->valid & NFS_ATTR_FATTR)) { - error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, + fattr, NULL, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; @@ -1067,7 +1068,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static int 
nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); @@ -1080,14 +1080,6 @@ static const struct seq_operations nfs_server_list_ops = { .show = nfs_server_list_show, }; -static const struct file_operations nfs_server_list_fops = { - .open = nfs_server_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int nfs_volume_list_open(struct inode *inode, struct file *file); static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); @@ -1100,23 +1092,6 @@ static const struct seq_operations nfs_volume_list_ops = { .show = nfs_volume_list_show, }; -static const struct file_operations nfs_volume_list_fops = { - .open = nfs_volume_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* - * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which - * we're dealing - */ -static int nfs_server_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_server_list_ops, - sizeof(struct seq_net_private)); -} - /* * set up the iterator to start reading from the server list and return the first item */ @@ -1185,15 +1160,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes - */ -static int nfs_volume_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_volume_list_ops, - sizeof(struct seq_net_private)); -} - -/* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, 
loff_t *_pos) @@ -1278,14 +1244,14 @@ int nfs_fs_proc_net_init(struct net *net) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_server_list_fops); + p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_volume_list_fops); + p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 1819d0d0ba4b..f033f3a69a3b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -404,6 +404,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, trace_nfs4_set_delegation(inode, type); + spin_lock(&inode->i_lock); + if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) + NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); out: spin_unlock(&clp->cl_lock); if (delegation != NULL) @@ -483,38 +487,88 @@ out: int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_delegation *prev; struct nfs_server *server; struct inode *inode; + struct inode *place_holder = NULL; + struct nfs_delegation *place_holder_deleg = NULL; int err = 0; restart: + /* + * To avoid quadratic looping we hold a reference + * to an inode place_holder. Each time we restart, we + * list nfs_servers from the server of that inode, and + * delegation in the server from the delegations of that + * inode. + * prev is an RCU-protected pointer to a delegation which + * wasn't marked for return and might be a good choice for + * the next place_holder. 
+ */ rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (!nfs_delegation_need_return(delegation)) + prev = NULL; + if (place_holder) + server = NFS_SERVER(place_holder); + else + server = list_entry_rcu(clp->cl_superblocks.next, + struct nfs_server, client_link); + list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) { + delegation = NULL; + if (place_holder && server == NFS_SERVER(place_holder)) + delegation = rcu_dereference(NFS_I(place_holder)->delegation); + if (!delegation || delegation != place_holder_deleg) + delegation = list_entry_rcu(server->delegations.next, + struct nfs_delegation, super_list); + list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { + struct inode *to_put = NULL; + + if (!nfs_delegation_need_return(delegation)) { + prev = delegation; continue; + } if (!nfs_sb_active(server->super)) - continue; + break; /* continue in outer loop */ + + if (prev) { + struct inode *tmp; + + tmp = nfs_delegation_grab_inode(prev); + if (tmp) { + to_put = place_holder; + place_holder = tmp; + place_holder_deleg = prev; + } + } + inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) { rcu_read_unlock(); + if (to_put) + iput(to_put); nfs_sb_deactive(server->super); goto restart; } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); + if (to_put) + iput(to_put); + err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); nfs_sb_deactive(server->super); + cond_resched(); if (!err) goto restart; set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + if (place_holder) + iput(place_holder); return err; } } rcu_read_unlock(); + if (place_holder) + iput(place_holder); return 0; } @@ -802,12 +856,14 @@ nfs_delegation_find_inode_server(struct nfs_server *server, if (delegation->inode != NULL && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { res = 
igrab(delegation->inode); + spin_unlock(&delegation->lock); + if (res != NULL) + return res; + return ERR_PTR(-EAGAIN); } spin_unlock(&delegation->lock); - if (res != NULL) - break; } - return res; + return ERR_PTR(-ENOENT); } /** @@ -822,16 +878,18 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) { struct nfs_server *server; - struct inode *res = NULL; + struct inode *res; rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { res = nfs_delegation_find_inode_server(server, fhandle); - if (res != NULL) - break; + if (res != ERR_PTR(-ENOENT)) { + rcu_read_unlock(); + return res; + } } rcu_read_unlock(); - return res; + return ERR_PTR(-ENOENT); } static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) @@ -887,7 +945,7 @@ restart: &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) - continue; + break; /* continue in outer loop */ inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) { rcu_read_unlock(); @@ -904,6 +962,7 @@ restart: } iput(inode); nfs_sb_deactive(server->super); + cond_resched(); goto restart; } } @@ -995,7 +1054,7 @@ restart: &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) - continue; + break; /* continue in outer loop */ inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) { rcu_read_unlock(); @@ -1020,6 +1079,7 @@ restart: } iput(inode); nfs_sb_deactive(server->super); + cond_resched(); goto restart; } } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 73f8b43d988c..7a9c14426855 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1012,13 +1012,25 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (IS_AUTOMOUNT(inode)) return 0; + + if (flags & LOOKUP_OPEN) { + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + /* A NFSv4 OPEN will revalidate later */ + if (server->caps & NFS_CAP_ATOMIC_OPEN) + goto out; + /* Fallthrough */ + case S_IFDIR: + if (server->flags & 
NFS_MOUNT_NOCTO) + break; + /* NFS close-to-open cache consistency validation */ + goto out_force; + } + } + /* VFS wants an on-the-wire revalidation */ if (flags & LOOKUP_REVAL) goto out_force; - /* This is an open(2) */ - if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - goto out_force; out: return (inode->i_nlink == 0) ? -ENOENT : 0; out_force: @@ -1039,13 +1051,15 @@ out_force: * * If LOOKUP_RCU prevents us from performing a full check, return 1 * suggesting a reval is needed. + * + * Note that when creating a new file, or looking up a rename target, + * then it shouldn't be necessary to revalidate a negative dentry. */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, unsigned int flags) { - /* Don't revalidate a negative dentry if we're creating a new file */ - if (flags & LOOKUP_CREATE) + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; @@ -1106,7 +1120,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto out_set_verifier; /* Force a full look up iff the parent directory has changed */ - if (!nfs_is_exclusive_create(dir, flags) && + if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { @@ -1270,11 +1284,13 @@ static void nfs_drop_nlink(struct inode *inode) { spin_lock(&inode->i_lock); /* drop the inode if we're reasonably sure this is the last link */ - if (inode->i_nlink == 1) - clear_nlink(inode); + if (inode->i_nlink > 0) + drop_nlink(inode); + NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_OTHER + | NFS_INO_REVAL_FORCED; spin_unlock(&inode->i_lock); } @@ -1335,7 +1351,7 @@ struct dentry 
*nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, flags)) + if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET) return NULL; res = ERR_PTR(-ENOMEM); @@ -1640,7 +1656,8 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, + fattr, NULL, NULL); if (error < 0) goto out_error; } @@ -2036,7 +2053,15 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else error = task->tk_status; rpc_put_task(task); - nfs_mark_for_revalidate(old_inode); + /* Ensure the inode attributes are revalidated */ + if (error == 0) { + spin_lock(&old_inode->i_lock); + NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); + NFS_I(old_inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_REVAL_FORCED; + spin_unlock(&old_inode->i_lock); + } out: if (rehash) d_rehash(rehash); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index ab5de3246c5c..deecb67638aa 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -102,7 +102,7 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, } rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops; - ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label); + ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL); if (ret) { dprintk("%s: getattr failed %d\n", __func__, ret); dentry = ERR_PTR(ret); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index c75ad982bcfc..8f003792ccde 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -461,7 
+461,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fh_count = be32_to_cpup(p); fls->mirror_array[i]->fh_versions = - kzalloc(fh_count * sizeof(struct nfs_fh), + kcalloc(fh_count, sizeof(struct nfs_fh), gfp_flags); if (fls->mirror_array[i]->fh_versions == NULL) { rc = -ENOMEM; @@ -1243,17 +1243,18 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - ff_layout_read_record_layoutstats_done(task, hdr); - pnfs_read_resend_pnfs(hdr); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_read(hdr); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: goto out_eagain; @@ -1403,6 +1404,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + pnfs_read_resend_pnfs(hdr); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1423,12 +1428,14 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - ff_layout_reset_write(hdr, true); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_write(hdr, false); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: return -EAGAIN; @@ -1575,6 +1582,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; 
ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_reset_write(hdr, true); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } @@ -2347,6 +2358,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index d62279d3fc5d..59aa04976331 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -99,7 +99,8 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, version_count = be32_to_cpup(p); dprintk("%s: version count %d\n", __func__, version_count); - ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), + ds_versions = kcalloc(version_count, + sizeof(struct nfs4_ff_ds_version), gfp_flags); if (!ds_versions) goto out_scratch; diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 1c5d8d31fc0a..666415d13d52 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -88,8 +88,8 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; + auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); + auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index b55fc7920c3b..4dc887813c71 
100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -237,8 +237,8 @@ void nfs_fscache_init_inode(struct inode *inode) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; + auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); + auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); @@ -262,8 +262,8 @@ void nfs_fscache_clear_inode(struct inode *inode) dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; + auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); + auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -304,8 +304,8 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) return; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; + auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); + auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd15d0b57626..b65aee481d13 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,10 +195,16 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); - bool have_delegation = nfs_have_delegated_attributes(inode); + bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); + + if (have_delegation) { + if (!(flags & NFS_INO_REVAL_FORCED)) + flags 
&= ~NFS_INO_INVALID_OTHER; + flags &= ~(NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_SIZE + | NFS_INO_REVAL_PAGECACHE); + } - if (have_delegation) - flags &= ~(NFS_INO_INVALID_CHANGE|NFS_INO_REVAL_PAGECACHE); if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; nfsi->cache_validity |= flags; @@ -448,6 +454,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); @@ -494,15 +501,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode->i_atime = timespec_to_timespec64(fattr->atime); else if (nfs_server_capable(inode, NFS_CAP_ATIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode->i_mtime = timespec_to_timespec64(fattr->mtime); else if (nfs_server_capable(inode, NFS_CAP_MTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode->i_ctime = timespec_to_timespec64(fattr->ctime); else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -534,6 +541,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } + if (nfsi->cache_validity != 0) + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + nfs_setsecurity(inode, fattr, label); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); @@ -667,9 +677,13 @@ 
void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; - nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME); + if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -679,13 +693,45 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = timespec_to_timespec64(fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); } - if ((attr->ia_valid & ATTR_SIZE) != 0) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); - nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - nfs_vmtruncate(inode, attr->ia_size); + if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) { + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME + | NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = timespec_to_timespec64(fattr->atime); + else if (attr->ia_valid & ATTR_ATIME_SET) + inode->i_atime = attr->ia_atime; + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = timespec_to_timespec64(fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); + } + if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) { + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME + | 
NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = timespec_to_timespec64(fattr->mtime); + else if (attr->ia_valid & ATTR_MTIME_SET) + inode->i_mtime = attr->ia_mtime; + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = timespec_to_timespec64(fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); } if (fattr->valid) nfs_update_inode(inode, fattr); @@ -1097,7 +1143,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, + label, inode); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, @@ -1304,6 +1351,8 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { + struct timespec ts; + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { @@ -1312,16 +1361,18 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } /* If we have atomic WCC data, we may update some attributes */ + ts = timespec64_to_timespec(inode->i_ctime); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + && timespec_equal(&ts, &fattr->pre_ctime)) { + inode->i_ctime = timespec_to_timespec64(fattr->ctime); } + ts = timespec64_to_timespec(inode->i_mtime); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&inode->i_mtime, 
&fattr->pre_mtime)) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + && timespec_equal(&ts, &fattr->pre_mtime)) { + inode->i_mtime = timespec_to_timespec64(fattr->mtime); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } @@ -1347,10 +1398,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; + struct timespec ts; - - if (nfs_have_delegated_attributes(inode)) + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; + /* Has the inode gone and changed behind our back? */ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) return -ESTALE; @@ -1363,10 +1415,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE; - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + ts = timespec64_to_timespec(inode->i_mtime); + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + ts = timespec64_to_timespec(inode->i_ctime); + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1396,11 +1450,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_OTHER; - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + ts = timespec64_to_timespec(inode->i_atime); + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - 
nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED); + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1629,7 +1684,8 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME); + | NFS_INO_INVALID_CTIME + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); return status; @@ -1667,12 +1723,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->pre_ctime = timespec64_to_timespec(inode->i_ctime); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->pre_mtime = timespec64_to_timespec(inode->i_mtime); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -1746,6 +1802,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; + bool attr_changed = false; + bool have_delegation; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1780,6 +1838,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; + /* Save the delegation state before clearing cache_validity */ + have_delegation = nfs_have_delegated_attributes(inode); + /* * Update the read time so we don't revalidate too often. 
*/ @@ -1802,12 +1863,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ - if (!have_writers) { - invalid |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_DATA + if (!(have_writers || have_delegation)) { + invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Force revalidate of all attributes */ @@ -1817,8 +1875,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, + inode->i_ino); } inode_set_iversion_raw(inode, fattr->change_attr); + attr_changed = true; } } else { nfsi->cache_validity |= save_cache_validity & @@ -1829,7 +1891,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + inode->i_mtime = timespec_to_timespec64(fattr->mtime); } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_MTIME @@ -1838,7 +1900,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + inode->i_ctime = timespec_to_timespec64(fattr->ctime); } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CTIME @@ -1850,13 +1912,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); cur_isize 
= i_size_read(inode); - if (new_isize != cur_isize) { + if (new_isize != cur_isize && !have_delegation) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; + attr_changed = true; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1875,7 +1938,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + inode->i_atime = timespec_to_timespec64(fattr->atime); else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME @@ -1889,14 +1952,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_ACL; + attr_changed = true; } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1904,15 +1965,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; + attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1920,25 +1979,23 @@ static int nfs_update_inode(struct 
inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; + attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); + attr_changed = true; } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & @@ -1958,7 +2015,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) cache_revalidated = false; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & NFS_INO_INVALID_ATTR) { + if (attr_changed) { invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); @@ -1984,9 +2041,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || - (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfs_set_cache_invalid(inode, invalid); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 85e4b4a233f9..350675e3ed47 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -354,6 +354,7 @@ static __be32 *xdr_time_not_set(__be32 *p) static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) { + struct timespec ts; __be32 *p; p = xdr_reserve_space(xdr, 
NFS_sattr_sz << 2); @@ -375,17 +376,21 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); - if (attr->ia_valid & ATTR_ATIME_SET) - p = xdr_encode_time(p, &attr->ia_atime); - else if (attr->ia_valid & ATTR_ATIME) - p = xdr_encode_current_server_time(p, &attr->ia_atime); - else + if (attr->ia_valid & ATTR_ATIME_SET) { + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_time(p, &ts); + } else if (attr->ia_valid & ATTR_ATIME) { + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_current_server_time(p, &ts); + } else p = xdr_time_not_set(p); - if (attr->ia_valid & ATTR_MTIME_SET) - xdr_encode_time(p, &attr->ia_mtime); - else if (attr->ia_valid & ATTR_MTIME) - xdr_encode_current_server_time(p, &attr->ia_mtime); - else + if (attr->ia_valid & ATTR_MTIME_SET) { + ts = timespec64_to_timespec(attr->ia_atime); + xdr_encode_time(p, &ts); + } else if (attr->ia_valid & ATTR_MTIME) { + ts = timespec64_to_timespec(attr->ia_mtime); + xdr_encode_current_server_time(p, &ts); + } else xdr_time_not_set(p); } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index eadf1ab31d16..ec8a9efa268f 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -101,7 +101,8 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct nfs4_label *label, + struct inode *inode) { struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], @@ -414,7 +415,9 @@ out: } static void -nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) +nfs3_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } @@ -823,7 +826,8 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } static void 
nfs3_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } @@ -844,7 +848,8 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) return 0; } -static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 09ee36dd8426..64e4fa33d89f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -561,6 +561,7 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) */ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) { + struct timespec ts; u32 nbytes; __be32 *p; @@ -610,8 +611,10 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) *p++ = xdr_zero; if (attr->ia_valid & ATTR_ATIME_SET) { + struct timespec ts; *p++ = xdr_two; - p = xdr_encode_nfstime3(p, &attr->ia_atime); + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_nfstime3(p, &ts); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; } else @@ -619,7 +622,8 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - xdr_encode_nfstime3(p, &attr->ia_mtime); + ts = timespec64_to_timespec(attr->ia_mtime); + xdr_encode_nfstime3(p, &ts); } else if (attr->ia_valid & ATTR_MTIME) { *p = xdr_one; } else diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 9c374441f660..5f59b6f65a42 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -370,6 +370,10 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: break; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); + break; case 
-NFS4ERR_EXPIRED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: @@ -462,7 +466,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, nfs42_layoutstat_release(data); return -EAGAIN; } - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b374f680830c..137e18abb7e7 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -212,6 +212,31 @@ struct nfs4_state_recovery_ops { struct rpc_cred *); }; +struct nfs4_opendata { + struct kref kref; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; + struct nfs4_label *a_label; + struct nfs_fattr f_attr; + struct nfs4_label *f_label; + struct dentry *dir; + struct dentry *dentry; + struct nfs4_state_owner *owner; + struct nfs4_state *state; + struct iattr attrs; + struct nfs4_layoutget *lgp; + unsigned long timestamp; + bool rpc_done; + bool file_created; + bool is_recover; + bool cancelled; + int rpc_status; +}; + struct nfs4_add_xprt_data { struct nfs_client *clp; struct rpc_cred *cred; @@ -251,7 +276,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); -extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct 
rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 22dc30a679a0..b6f9d84ba19b 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -343,7 +343,7 @@ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, int id_len; ssize_t ret; - id_len = snprintf(id_str, sizeof(id_str), "%u", id); + id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str)); ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; @@ -627,7 +627,8 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, if (strcmp(upcall->im_name, im->im_name) != 0) break; /* Note: here we store the NUL terminator too */ - len = sprintf(id_str, "%d", im->im_id) + 1; + len = 1 + nfs_map_numeric_to_string(im->im_id, id_str, + sizeof(id_str)); ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b71757e85066..6dd146885da9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,6 +71,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC +#define NFS4_BITMASK_SZ 3 + #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) @@ -86,12 +88,11 @@ | ATTR_MTIME_SET) struct nfs4_opendata; -static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label, struct inode 
*inode); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, struct nfs_open_context *ctx, struct nfs4_label *ilabel, @@ -274,6 +275,33 @@ const u32 nfs4_fs_locations_bitmap[3] = { | FATTR4_WORD1_MOUNTED_ON_FILEID, }; +static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, + struct inode *inode) +{ + unsigned long cache_validity; + + memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst)); + if (!inode || !nfs4_have_delegation(inode, FMODE_READ)) + return; + + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + if (!(cache_validity & NFS_INO_REVAL_FORCED)) + cache_validity &= ~(NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_SIZE); + + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + dst[0] &= ~FATTR4_WORD0_SIZE; + + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + dst[0] &= ~FATTR4_WORD0_CHANGE; +} + +static void nfs4_bitmap_copy_adjust_setattr(__u32 *dst, + const __u32 *src, struct inode *inode) +{ + nfs4_bitmap_copy_adjust(dst, src, inode); +} + static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, struct nfs4_readdir_arg *readdir) { @@ -407,6 +435,11 @@ static int nfs4_do_handle_exception(struct nfs_server *server, switch(errorcode) { case 0: return 0; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + if (inode != NULL && S_ISREG(inode->i_mode)) + pnfs_destroy_layout(NFS_I(inode)); + break; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: @@ -608,20 +641,16 @@ struct nfs4_call_sync_data { }; void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) + struct nfs4_sequence_res *res, int cache_reply, + int privileged) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; - args->sa_privileged = 0; + args->sa_privileged = privileged; 
res->sr_slot = NULL; } -static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) -{ - args->sa_privileged = 1; -} - static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; @@ -746,12 +775,19 @@ static int nfs41_sequence_process(struct rpc_task *task, slot->slot_nr, slot->seq_nr); goto out_retry; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_SEQ_FALSE_RETRY: + /* + * The server thinks we tried to replay a request. + * Retry the call after bumping the sequence ID. + */ + goto retry_new_seq; case -NFS4ERR_BADSLOT: /* * The slot id we used was probably retired. Try again * using a different slot id. */ - if (slot->seq_nr < slot->table->target_highest_slotid) + if (slot->slot_nr < slot->table->target_highest_slotid) goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: @@ -770,10 +806,6 @@ static int nfs41_sequence_process(struct rpc_task *task, goto retry_nowait; } goto session_recover; - case -NFS4ERR_SEQ_FALSE_RETRY: - if (interrupted) - goto retry_new_seq; - goto session_recover; default: /* Just update the slot sequence no. 
*/ slot->seq_done = 1; @@ -1035,7 +1067,7 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { - nfs4_init_sequence(args, res, cache_reply); + nfs4_init_sequence(args, res, cache_reply, 0); return nfs4_call_sync_sequence(clnt, server, msg, args, res); } @@ -1064,30 +1096,6 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, spin_unlock(&dir->i_lock); } -struct nfs4_opendata { - struct kref kref; - struct nfs_openargs o_arg; - struct nfs_openres o_res; - struct nfs_open_confirmargs c_arg; - struct nfs_open_confirmres c_res; - struct nfs4_string owner_name; - struct nfs4_string group_name; - struct nfs4_label *a_label; - struct nfs_fattr f_attr; - struct nfs4_label *f_label; - struct dentry *dir; - struct dentry *dentry; - struct nfs4_state_owner *owner; - struct nfs4_state *state; - struct iattr attrs; - unsigned long timestamp; - bool rpc_done; - bool file_created; - bool is_recover; - bool cancelled; - int rpc_status; -}; - struct nfs4_open_createattrs { struct nfs4_label *label; struct iattr *sattr; @@ -1268,6 +1276,7 @@ static void nfs4_opendata_free(struct kref *kref) struct nfs4_opendata, kref); struct super_block *sb = p->dentry->d_sb; + nfs4_lgopen_release(p->lgp); nfs_free_seqid(p->o_arg.seqid); nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) @@ -2187,13 +2196,12 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1, + data->is_recover); kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; data->timestamp = jiffies; - if (data->is_recover) - nfs4_set_sequence_privileged(&data->c_arg.seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -2327,7 +2335,8 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -static 
int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) +static int nfs4_run_open_task(struct nfs4_opendata *data, + struct nfs_open_context *ctx) { struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); @@ -2350,15 +2359,17 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) }; int status; - nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; data->cancelled = false; data->is_recover = false; - if (isrecover) { - nfs4_set_sequence_privileged(&o_arg->seq_args); + if (!ctx) { + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1); data->is_recover = true; + } else { + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0); + pnfs_lgopen_prepare(data, ctx); } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -2380,7 +2391,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) struct nfs_openres *o_res = &data->o_res; int status; - status = nfs4_run_open_task(data, 1); + status = nfs4_run_open_task(data, NULL); if (status != 0 || !data->rpc_done) return status; @@ -2441,7 +2452,8 @@ static int nfs4_opendata_access(struct rpc_cred *cred, /* * Note: On error, nfs4_proc_open will free the struct nfs4_opendata */ -static int _nfs4_proc_open(struct nfs4_opendata *data) +static int _nfs4_proc_open(struct nfs4_opendata *data, + struct nfs_open_context *ctx) { struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); @@ -2449,7 +2461,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) struct nfs_openres *o_res = &data->o_res; int status; - status = nfs4_run_open_task(data, 0); + status = nfs4_run_open_task(data, ctx); if (!data->rpc_done) return status; if (status != 0) { @@ -2480,7 +2492,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, 
&o_res->fh, o_res->f_attr, o_res->f_label); + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, + o_res->f_label, NULL); } return 0; } @@ -2800,11 +2813,11 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - ret = _nfs4_proc_open(opendata); + ret = _nfs4_proc_open(opendata, ctx); if (ret != 0) goto out; - state = nfs4_opendata_to_nfs4_state(opendata); + state = _nfs4_opendata_to_nfs4_state(opendata); ret = PTR_ERR(state); if (IS_ERR(state)) goto out; @@ -2838,8 +2851,12 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); + else + pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); } + out: + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } @@ -3039,7 +3056,6 @@ static int _nfs4_do_setattr(struct inode *inode, }; struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; - fmode_t fmode; bool truncate; int status; @@ -3047,11 +3063,12 @@ static int _nfs4_do_setattr(struct inode *inode, /* Servers should only apply open mode checks for file size changes */ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; - fmode = truncate ? 
FMODE_WRITE : FMODE_READ; + if (!truncate) + goto zero_stateid; - if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { + if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (truncate && ctx != NULL) { + } else if (ctx != NULL) { struct nfs_lock_context *l_ctx; if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; @@ -3063,8 +3080,10 @@ static int _nfs4_do_setattr(struct inode *inode, nfs_put_lock_context(l_ctx); if (status == -EIO) return -EBADF; - } else + } else { +zero_stateid: nfs4_stateid_copy(&arg->stateid, &zero_stateid); + } if (delegation_cred) msg.rpc_cred = delegation_cred; @@ -3083,12 +3102,13 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_state *state = ctx ? ctx->state : NULL; struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = sattr, .server = server, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, .label = ilabel, }; struct nfs_setattrres res = { @@ -3103,11 +3123,11 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; int err; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); - do { + nfs4_bitmap_copy_adjust_setattr(bitmask, + nfs4_bitmask(server, olabel), + inode); + err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { case -NFS4ERR_OPENMODE: @@ -3274,6 +3294,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + struct pnfs_layout_hdr *lo; bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; @@ -3317,6 +3338,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } + lo = calldata->arg.lr_args ? 
calldata->arg.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; @@ -3393,7 +3420,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; - nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1, 0); calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); @@ -3742,7 +3769,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, if (IS_ERR(label)) return PTR_ERR(label); - error = nfs4_proc_getattr(server, mntfh, fattr, label); + error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); goto err_free_label; @@ -3807,11 +3834,13 @@ out: } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct nfs4_label *label, + struct inode *inode) { + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_getattr_arg args = { .fh = fhandle, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, }; struct nfs4_getattr_res res = { .fattr = fattr, @@ -3824,19 +3853,20 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - args.bitmask = nfs4_bitmask(server, label); + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode); nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct nfs4_label *label, + 
struct inode *inode) { struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_getattr(server, fhandle, fattr, label); + err = _nfs4_proc_getattr(server, fhandle, fattr, label, inode); trace_nfs4_getattr(server, fhandle, fattr, err); err = nfs4_handle_exception(server, err, &exception); @@ -4089,7 +4119,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry }; int status = 0; - if (!nfs_have_delegated_attributes(inode)) { + if (!nfs4_have_delegation(inode, FMODE_READ)) { res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4265,15 +4295,16 @@ static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) return err; } -static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) +static void nfs4_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - struct inode *inode = d_inode(dentry); res->server = NFS_SB(dentry->d_sb); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; - nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); + nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0); nfs_fattr_init(res->dir_attr); @@ -4319,7 +4350,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, nfs4_inode_return_delegation(new_inode); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; res->server = NFS_SB(old_dentry->d_sb); - nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); + nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0); } static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) @@ -4352,11 +4383,12 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_link_arg arg = 
{ .fh = NFS_FH(inode), .dir_fh = NFS_FH(dir), .name = name, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, }; struct nfs4_link_res res = { .server = server, @@ -4378,9 +4410,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct status = PTR_ERR(res.label); goto out; } - arg.bitmask = nfs4_bitmask(server, res.label); nfs4_inode_make_writeable(inode); + nfs4_bitmap_copy_adjust_setattr(bitmask, nfs4_bitmask(server, res.label), inode); status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { @@ -4895,7 +4927,7 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, @@ -4979,7 +5011,8 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) } static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { struct nfs_server *server = NFS_SERVER(hdr->inode); @@ -4995,7 +5028,8 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0); + nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -5026,7 +5060,8 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) return data->commit_done_cb(task, data); } -static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +static void 
nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { struct nfs_server *server = NFS_SERVER(data->inode); @@ -5034,7 +5069,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess data->commit_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg); } struct nfs4_renewdata { @@ -5391,7 +5427,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl */ spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME; + | NFS_INO_INVALID_CTIME + | NFS_INO_REVAL_FORCED; spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -5591,13 +5628,14 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return 0; rcu_read_lock(); - len = 14 + strlen(clp->cl_ipaddr) + 1 + - strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + len = 14 + + strlen(clp->cl_rpcclient->cl_nodename) + 1 + - strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 1; rcu_read_unlock(); - + if (nfs4_client_id_uniquifier[0] != '\0') + len += strlen(nfs4_client_id_uniquifier) + 1; if (len > NFS4_OPAQUE_LIMIT + 1) return -EINVAL; @@ -5611,10 +5649,17 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); + if (nfs4_client_id_uniquifier[0] != '\0') + scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s", + clp->cl_rpcclient->cl_nodename, + 
nfs4_client_id_uniquifier, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + else + scnprintf(str, len, "Linux NFSv4.0 %s/%s", + clp->cl_rpcclient->cl_nodename, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); rcu_read_unlock(); clp->cl_owner_id = str; @@ -5934,12 +5979,19 @@ static void nfs4_delegreturn_release(void *calldata) static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) { struct nfs4_delegreturndata *d_data; + struct pnfs_layout_hdr *lo; d_data = (struct nfs4_delegreturndata *)data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; + lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + d_data->args.lr_args = NULL; + d_data->res.lr_res = NULL; + } + nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, &d_data->res.seq_res, @@ -5972,7 +6024,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -6247,7 +6299,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, 0); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -6411,32 +6463,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) case 0: renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); - if (data->arg.new_lock) { + if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { - rpc_restart_call_prepare(task); 
+ if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) break; - } } + if (data->arg.new_lock_owner != 0) { nfs_confirm_seqid(&lsp->ls_seqid, 0); nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) - rpc_restart_call_prepare(task); + goto out_done; + } else if (nfs4_update_lock_stateid(lsp, &data->res.stateid)) + goto out_done; + break; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: if (data->arg.new_lock_owner != 0) { - if (!nfs4_stateid_match(&data->arg.open_stateid, + if (nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) - rpc_restart_call_prepare(task); - } else if (!nfs4_stateid_match(&data->arg.lock_stateid, + goto out_done; + } else if (nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) - rpc_restart_call_prepare(task); + goto out_done; } + if (!data->cancelled) + rpc_restart_call_prepare(task); +out_done: dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); } @@ -6509,14 +6565,14 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, + recovery_type > NFS_LOCK_NEW); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; if (recovery_type > NFS_LOCK_NEW) { if (recovery_type == NFS_LOCK_RECLAIM) data->arg.reclaim = NFS_LOCK_RECLAIM; - nfs4_set_sequence_privileged(&data->arg.seq_args); } else data->arg.new_lock = 1; task = rpc_run_task(&task_setup_data); @@ -6911,7 +6967,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + 
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); } @@ -7107,8 +7163,7 @@ static int _nfs40_proc_get_locations(struct inode *inode, locations->server = server; locations->nlocations = 0; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); if (status) @@ -7161,8 +7216,7 @@ static int _nfs41_proc_get_locations(struct inode *inode, locations->server = server; locations->nlocations = 0; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); if (status == NFS4_OK && @@ -7249,8 +7303,7 @@ static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) if (res.fh == NULL) return -ENOMEM; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); nfs_free_fhandle(res.fh); @@ -7291,8 +7344,7 @@ static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) if (res.fh == NULL) return -ENOMEM; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); nfs_free_fhandle(res.fh); @@ -8070,8 +8122,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) }; int status; - nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); - nfs4_set_sequence_privileged(&args.la_seq_args); + 
nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); task = rpc_run_task(&task_setup); if (IS_ERR(task)) @@ -8408,10 +8459,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, calldata = kzalloc(sizeof(*calldata), GFP_NOFS); if (calldata == NULL) goto out_put_clp; - nfs4_init_sequence(&calldata->args, &calldata->res, 0); + nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged); nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot); - if (is_privileged) - nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; @@ -8563,8 +8612,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, calldata->clp = clp; calldata->arg.one_fs = 0; - nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); - nfs4_set_sequence_privileged(&calldata->arg.seq_args); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0, 1); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -8616,6 +8664,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + nfs4_sequence_free_slot(&lgp->res.seq_res); + switch (nfs4err) { case 0: goto out; @@ -8680,7 +8730,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } - nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, nfs4err, exception); if (!status) { if (exception->retry) @@ -8693,63 +8742,19 @@ out: return status; } -static size_t max_response_pages(struct nfs_server *server) +size_t max_response_pages(struct nfs_server *server) { u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; return nfs_page_array_len(0, max_resp_sz); } -static void nfs4_free_pages(struct page **pages, size_t size) -{ - int i; - - if (!pages) - return; - - for (i = 0; i < size; i++) { - if (!pages[i]) - break; - 
__free_page(pages[i]); - } - kfree(pages); -} - -static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) -{ - struct page **pages; - int i; - - pages = kcalloc(size, sizeof(struct page *), gfp_flags); - if (!pages) { - dprintk("%s: can't alloc array of %zu pages\n", __func__, size); - return NULL; - } - - for (i = 0; i < size; i++) { - pages[i] = alloc_page(gfp_flags); - if (!pages[i]) { - dprintk("%s: failed to allocate page\n", __func__); - nfs4_free_pages(pages, size); - return NULL; - } - } - - return pages; -} - static void nfs4_layoutget_release(void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct inode *inode = lgp->args.inode; - struct nfs_server *server = NFS_SERVER(inode); - size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); nfs4_sequence_free_slot(&lgp->res.seq_res); - nfs4_free_pages(lgp->args.layout.pages, max_pages); - pnfs_put_layout_hdr(NFS_I(inode)->layout); - put_nfs_open_context(lgp->args.ctx); - kfree(calldata); + pnfs_layoutget_free(lgp); dprintk("<-- %s\n", __func__); } @@ -8760,11 +8765,10 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - size_t max_pages = max_response_pages(server); struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], @@ -8791,35 +8795,28 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ pnfs_get_layout_hdr(NFS_I(inode)->layout); - lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); - if (!lgp->args.layout.pages) { - nfs4_layoutget_release(lgp); - return ERR_PTR(-ENOMEM); - } - lgp->args.layout.pglen = max_pages * PAGE_SIZE; - - lgp->res.layoutp = 
&lgp->args.layout; - lgp->res.seq_res.sr_slot = NULL; - nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); status = rpc_wait_for_completion_task(task); - if (status == 0) { + if (status != 0) + goto out; + + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; - } - + } else + lseg = pnfs_layout_process(lgp); +out: trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (status == 0 && lgp->res.layoutp->len) - lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8837,6 +8834,8 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) &lrp->args.seq_args, &lrp->res.seq_res, task); + if (!pnfs_layout_is_valid(lrp->args.layout)) + rpc_exit(task, 0); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) @@ -8927,7 +8926,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) } task_setup_data.flags |= RPC_TASK_ASYNC; } - nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -9074,7 +9073,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) } task_setup_data.flags = RPC_TASK_ASYNC; } - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -9254,8 +9253,7 @@ static int 
_nfs41_test_stateid(struct nfs_server *server, &rpc_client, &msg); dprintk("NFS call test_stateid %p\n", stateid); - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(rpc_client, server, &msg, &args.seq_args, &res.seq_res); if (status != NFS_OK) { @@ -9347,7 +9345,17 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { .rpc_release = nfs41_free_stateid_release, }; -static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * @cred: credential + * @privileged: set to true if this call needs to be privileged + * + * Note: this function is always asynchronous. + */ +static int nfs41_free_stateid(struct nfs_server *server, const nfs4_stateid *stateid, struct rpc_cred *cred, bool privileged) @@ -9363,6 +9371,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, .flags = RPC_TASK_ASYNC, }; struct nfs_free_stateid_data *data; + struct rpc_task *task; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, &task_setup.rpc_client, &msg); @@ -9370,7 +9379,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, dprintk("NFS call free_stateid %p\n", stateid); data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) - return ERR_PTR(-ENOMEM); + return -ENOMEM; data->server = server; nfs4_stateid_copy(&data->args.stateid, stateid); @@ -9378,31 +9387,8 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); - if (privileged) - nfs4_set_sequence_privileged(&data->args.seq_args); - - return rpc_run_task(&task_setup); -} - -/** - * nfs41_free_stateid - perform a 
FREE_STATEID operation - * - * @server: server / transport on which to perform the operation - * @stateid: state ID to release - * @cred: credential - * @is_recovery: set to true if this call needs to be privileged - * - * Note: this function is always asynchronous. - */ -static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - struct rpc_cred *cred, - bool is_recovery) -{ - struct rpc_task *task; - - task = _nfs41_free_stateid(server, stateid, cred, is_recovery); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, privileged); + task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); @@ -9539,7 +9525,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_ATOMIC_OPEN | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 - | NFS_CAP_ATOMIC_OPEN_V1, + | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_LGOPEN, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -9564,6 +9551,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_LGOPEN | NFS_CAP_ALLOCATE | NFS_CAP_COPY | NFS_CAP_DEALLOCATE diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c10a422efe6f..2bf2eaa08ca7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -77,6 +77,14 @@ const nfs4_stateid invalid_stateid = { .type = NFS4_INVALID_STATEID_TYPE, }; +const nfs4_stateid current_stateid = { + { + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0x0, 0x0, 0x0, 0x1, 0 }, + }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; + static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9b7392032321..cd41d2577a04 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -65,7 +65,13 @@ /* Mapping from NFS error code to "errno" error code. 
*/ #define errno_NFSERR_IO EIO +struct compound_hdr; static int nfs4_stat_to_errno(int); +static void encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr); +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res); /* NFSv4 COMPOUND tags are only wanted for debugging purposes */ #ifdef DEBUG @@ -424,6 +430,8 @@ static int nfs4_stat_to_errno(int); #define decode_sequence_maxsz 0 #define encode_layoutreturn_maxsz 0 #define decode_layoutreturn_maxsz 0 +#define encode_layoutget_maxsz 0 +#define decode_layoutget_maxsz 0 #endif /* CONFIG_NFS_V4_1 */ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ @@ -476,14 +484,16 @@ static int nfs4_stat_to_errno(int); encode_open_maxsz + \ encode_access_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_layoutget_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ decode_access_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_layoutget_maxsz) #define NFS4_enc_open_confirm_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -497,13 +507,15 @@ static int nfs4_stat_to_errno(int); encode_putfh_maxsz + \ encode_open_maxsz + \ encode_access_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_layoutget_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ decode_access_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_layoutget_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ @@ -1057,6 +1069,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server, const uint32_t attrmask[]) { + struct timespec ts; char 
owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; int owner_namelen = 0; @@ -1145,14 +1158,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_nfstime4(p, &iap->ia_atime); + ts = timespec64_to_timespec(iap->ia_atime); + p = xdr_encode_nfstime4(p, &ts); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_nfstime4(p, &iap->ia_mtime); + ts = timespec64_to_timespec(iap->ia_mtime); + p = xdr_encode_nfstime4(p, &ts); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -2070,6 +2085,13 @@ encode_layoutreturn(struct xdr_stream *xdr, struct compound_hdr *hdr) { } + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2316,6 +2338,12 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, if (args->access) encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + if (args->lg_args) { + encode_layoutget(xdr, args->lg_args, &hdr); + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->lg_args->layout.pages, + 0, args->lg_args->layout.pglen); + } encode_nops(&hdr); } @@ -2356,6 +2384,12 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, if (args->access) encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + if (args->lg_args) { + encode_layoutget(xdr, args->lg_args, &hdr); + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->lg_args->layout.pages, + 0, args->lg_args->layout.pglen); + } encode_nops(&hdr); } @@ -6024,7 +6058,7 @@ static int decode_layoutget(struct xdr_stream *xdr, 
struct rpc_rqst *req, status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) - return status; + goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -6037,7 +6071,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (!layout_count) { dprintk("%s: server responded with empty layout array\n", __func__); - return -EINVAL; + status = -EINVAL; + goto out; } p = xdr_inline_decode(xdr, 28); @@ -6062,7 +6097,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, dprintk("NFS: server cheating in layoutget reply: " "layout len %u > recvd %u\n", res->layoutp->len, recvd); - return -EINVAL; + status = -EINVAL; + goto out; } if (layout_count > 1) { @@ -6075,10 +6111,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __func__, layout_count); } - return 0; +out: + res->status = status; + return status; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + status = -EIO; + goto out; } static int decode_layoutreturn(struct xdr_stream *xdr, @@ -6177,6 +6216,13 @@ int decode_layoutreturn(struct xdr_stream *xdr, { return 0; } + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ /* @@ -6623,6 +6669,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); + if (res->lg_res) + decode_layoutget(xdr, rqstp, res->lg_res); out: return status; } @@ -6675,6 +6723,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); + if (res->lg_res) + decode_layoutget(xdr, rqstp, res->lg_res); out: return status; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c 
index ee723aa153a3..bcc3addec3c5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -37,6 +37,7 @@ #include "nfs4trace.h" #include "delegation.h" #include "nfs42.h" +#include "nfs4_fs.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -915,45 +916,99 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -/* - * Get layout from server. - * for now, assume that whole file layouts are requested. - * arg->offset: 0 - * arg->length: all ones - */ -static struct pnfs_layout_segment * -send_layoutget(struct pnfs_layout_hdr *lo, +static struct nfs_server * +pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx) +{ + struct nfs_server *server; + + if (inode) { + server = NFS_SERVER(inode); + } else { + struct dentry *parent_dir = dget_parent(ctx->dentry); + server = NFS_SERVER(parent_dir->d_inode); + dput(parent_dir); + } + return server; +} + +static void nfs4_free_pages(struct page **pages, size_t size) +{ + int i; + + if (!pages) + return; + + for (i = 0; i < size; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); +} + +static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) +{ + struct page **pages; + int i; + + pages = kcalloc(size, sizeof(struct page *), gfp_flags); + if (!pages) { + dprintk("%s: can't alloc array of %zu pages\n", __func__, size); + return NULL; + } + + for (i = 0; i < size; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) { + dprintk("%s: failed to allocate page\n", __func__); + nfs4_free_pages(pages, size); + return NULL; + } + } + + return pages; +} + +static struct nfs4_layoutget * +pnfs_alloc_init_layoutget_args(struct inode *ino, struct nfs_open_context *ctx, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, const struct pnfs_layout_range *range, - long *timeout, gfp_t gfp_flags) + gfp_t gfp_flags) { - struct inode *ino = lo->plh_inode; - struct nfs_server *server = NFS_SERVER(ino); 
+ struct nfs_server *server = pnfs_find_server(ino, ctx); + size_t max_pages = max_response_pages(server); struct nfs4_layoutget *lgp; - loff_t i_size; dprintk("--> %s\n", __func__); - /* - * Synchronously retrieve layout information from server and - * store in lseg. If we race with a concurrent seqid morphing - * op, then re-send the LAYOUTGET. - */ lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) - return ERR_PTR(-ENOMEM); + return NULL; + + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); + if (!lgp->args.layout.pages) { + kfree(lgp); + return NULL; + } + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->res.layoutp = &lgp->args.layout; - i_size = i_size_read(ino); + /* Don't confuse uninitialised result and success */ + lgp->res.status = -NFS4ERR_DELAY; lgp->args.minlength = PAGE_SIZE; if (lgp->args.minlength > range->length) lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; + if (ino) { + loff_t i_size = i_size_read(ino); + + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; pnfs_copy_range(&lgp->args.range, range); @@ -962,9 +1017,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.ctx = get_nfs_open_context(ctx); nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; + lgp->cred = get_rpccred(ctx->cred); + lgp->callback_count = raw_seqcount_begin(&server->nfs_client->cl_callback_count); + return lgp; +} - return nfs4_proc_layoutget(lgp, timeout, gfp_flags); +void pnfs_layoutget_free(struct nfs4_layoutget *lgp) +{ + size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; + + 
nfs4_free_pages(lgp->args.layout.pages, max_pages); + if (lgp->args.inode) + pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); + put_rpccred(lgp->cred); + put_nfs_open_context(lgp->args.ctx); + kfree(lgp); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -1144,7 +1211,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0; - bool send; + bool send, valid_layout; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1165,6 +1232,7 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; spin_lock(&ino->i_lock); } + valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); @@ -1178,7 +1246,8 @@ _pnfs_return_layout(struct inode *ino) } /* Don't send a LAYOUTRETURN if list was initially empty */ - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || + !valid_layout) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); goto out_put_layout_hdr; @@ -1671,6 +1740,22 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); } +static void _add_to_server_list(struct pnfs_layout_hdr *lo, + struct nfs_server *server) +{ + if (list_empty(&lo->plh_layouts)) { + struct nfs_client *clp = server->nfs_client; + + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. + */ + spin_lock(&clp->cl_lock); + if (list_empty(&lo->plh_layouts)) + list_add_tail(&lo->plh_layouts, &server->layouts); + spin_unlock(&clp->cl_lock); + } +} + /* * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. 
@@ -1694,6 +1779,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_layoutget *lgp; nfs4_stateid stateid; long timeout = 0; unsigned long giveup = jiffies + (clp->cl_lease_time << 1); @@ -1820,15 +1906,7 @@ lookup_again: atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); - if (list_empty(&lo->plh_layouts)) { - /* The lo must be on the clp list if there is any - * chance of a CB_LAYOUTRECALL(FILE) coming in. - */ - spin_lock(&clp->cl_lock); - if (list_empty(&lo->plh_layouts)) - list_add_tail(&lo->plh_layouts, &server->layouts); - spin_unlock(&clp->cl_lock); - } + _add_to_server_list(lo, server); pg_offset = arg.offset & ~PAGE_MASK; if (pg_offset) { @@ -1838,7 +1916,15 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); + if (!lgp) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); + atomic_dec(&lo->plh_outstanding); + goto out_put_layout_hdr; + } + + lseg = nfs4_proc_layoutget(lgp, &timeout); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); atomic_dec(&lo->plh_outstanding); @@ -1919,6 +2005,171 @@ pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) return true; } +static struct pnfs_layout_hdr * +_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL); + if (!lo) + goto out_unlock; + if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + goto out_unlock; + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + goto out_unlock; + if (pnfs_layoutgets_blocked(lo)) + goto out_unlock; + if 
(test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + spin_unlock(&ino->i_lock); + _add_to_server_list(lo, NFS_SERVER(ino)); + return lo; + +out_unlock: + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + return NULL; +} + +extern const nfs4_stateid current_stateid; + +static void _lgopen_prepare_attached(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct inode *ino = data->dentry->d_inode; + struct pnfs_layout_range rng = { + .iomode = (data->o_arg.fmode & FMODE_WRITE) ? + IOMODE_RW: IOMODE_READ, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + struct nfs4_layoutget *lgp; + struct pnfs_layout_hdr *lo; + + /* Heuristic: don't send layoutget if we have cached data */ + if (rng.iomode == IOMODE_READ && + (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0)) + return; + + lo = _pnfs_grab_empty_layout(ino, ctx); + if (!lo) + return; + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, + &rng, GFP_KERNEL); + if (!lgp) { + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); + return; + } + data->lgp = lgp; + data->o_arg.lg_args = &lgp->args; + data->o_res.lg_res = &lgp->res; +} + +static void _lgopen_prepare_floating(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct pnfs_layout_range rng = { + .iomode = (data->o_arg.fmode & FMODE_WRITE) ? 
+ IOMODE_RW: IOMODE_READ, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + struct nfs4_layoutget *lgp; + + lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid, + &rng, GFP_KERNEL); + if (!lgp) + return; + data->lgp = lgp; + data->o_arg.lg_args = &lgp->args; + data->o_res.lg_res = &lgp->res; +} + +void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + + if (!(pnfs_enabled_sb(server) && + server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN)) + return; + /* Could check on max_ops, but currently hardcoded high enough */ + if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) + return; + if (data->state) + _lgopen_prepare_attached(data, ctx); + else + _lgopen_prepare_floating(data, ctx); +} + +void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg; + struct nfs_server *srv = NFS_SERVER(ino); + u32 iomode; + + if (!lgp) + return; + dprintk("%s: entered with status %i\n", __func__, lgp->res.status); + if (lgp->res.status) { + switch (lgp->res.status) { + default: + break; + /* + * Halt lgopen attempts if the server doesn't recognise + * the "current stateid" value, the layout type, or the + * layoutget operation as being valid. + * Also if it complains about too many ops in the compound + * or of the request/reply being too big. 
+ */ + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_NOTSUPP: + case -NFS4ERR_REP_TOO_BIG: + case -NFS4ERR_REP_TOO_BIG_TO_CACHE: + case -NFS4ERR_REQ_TOO_BIG: + case -NFS4ERR_TOO_MANY_OPS: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + srv->caps &= ~NFS_CAP_LGOPEN; + } + return; + } + if (!lgp->args.inode) { + lo = _pnfs_grab_empty_layout(ino, ctx); + if (!lo) + return; + lgp->args.inode = ino; + } else + lo = NFS_I(lgp->args.inode)->layout; + + if (read_seqcount_retry(&srv->nfs_client->cl_callback_count, + lgp->callback_count)) + return; + lseg = pnfs_layout_process(lgp); + if (!IS_ERR(lseg)) { + iomode = lgp->args.range.iomode; + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_put_lseg(lseg); + } +} + +void nfs4_lgopen_release(struct nfs4_layoutget *lgp) +{ + if (lgp != NULL) { + struct inode *inode = lgp->args.inode; + if (inode) { + struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; + atomic_dec(&lo->plh_outstanding); + pnfs_clear_first_layoutget(lo); + } + pnfs_layoutget_free(lgp); + } +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { @@ -1984,8 +2235,6 @@ out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - if (!pnfs_layout_is_valid(lo)) - nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index daf6cbf5c15f..3fe81424337d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -35,6 +35,8 @@ #include <linux/nfs_page.h> #include <linux/workqueue.h> +struct nfs4_opendata; + enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ @@ -110,6 +112,7 @@ enum layoutdriver_policy_flags { PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, PNFS_LAYOUTRET_ON_ERROR = 1 << 1, PNFS_READ_WHOLE_PAGE = 1 << 2, + PNFS_LAYOUTGET_ON_OPEN = 1 << 3, }; struct nfs4_deviceid_node; @@ -223,10 +226,11 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void 
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ +extern size_t max_response_pages(struct nfs_server *server); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -246,6 +250,7 @@ size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); @@ -375,6 +380,11 @@ void pnfs_layout_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); +void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx); +void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx); +void nfs4_lgopen_release(struct nfs4_layoutget *lgp); static inline bool nfs_have_layout(struct inode *inode) { @@ -775,6 +785,27 @@ static inline bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, { return false; } + +static inline void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ +} + +static inline void pnfs_parse_lgopen(struct inode *ino, + struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx) +{ +} + +static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp) +{ +} + +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + 
return false; +} + #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4e93d6308733..e0c257bd62b9 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -99,7 +99,8 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct nfs4_label *label, + struct inode *inode) { struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], @@ -321,7 +322,9 @@ nfs_proc_remove(struct inode *dir, struct dentry *dentry) } static void -nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) +nfs_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } @@ -618,7 +621,8 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } static void nfs_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ hdr->args.stable = NFS_FILE_SYNC; @@ -631,7 +635,8 @@ static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit } static void -nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { BUG(); } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bf54fc9ae135..fd61bf0fce63 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -85,7 +85,7 @@ static const struct rpc_call_ops nfs_unlink_ops = { .rpc_call_prepare = nfs_unlink_prepare, }; -static void nfs_do_call_unlink(struct nfs_unlinkdata *data) +static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) { struct rpc_message msg = { .rpc_argp = &data->args, @@ -105,7 +105,7 @@ 
static void nfs_do_call_unlink(struct nfs_unlinkdata *data) data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); - NFS_PROTO(dir)->unlink_setup(&msg, data->dentry); + NFS_PROTO(dir)->unlink_setup(&msg, data->dentry, inode); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); @@ -113,7 +113,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) rpc_put_task_async(task); } -static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nfs_unlinkdata *data) { struct inode *dir = d_inode(dentry->d_parent); struct dentry *alias; @@ -153,7 +153,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) return ret; } data->dentry = alias; - nfs_do_call_unlink(data); + nfs_do_call_unlink(inode, data); return 1; } @@ -231,7 +231,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); - if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)) + if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data)) nfs_free_unlinkdata(data); } @@ -448,6 +448,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) unsigned char silly[SILLYNAME_LEN + 1]; unsigned long long fileid; struct dentry *sdentry; + struct inode *inode = d_inode(dentry); struct rpc_task *task; int error = -EBUSY; @@ -485,6 +486,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) goto out; } while (d_inode(sdentry) != NULL); /* need negative lookup */ + ihold(inode); + /* queue unlink first. 
Can't do this from rpc_release as it * has to allocate memory */ @@ -509,6 +512,12 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) case 0: /* The rename succeeded */ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); d_move(dentry, sdentry); break; case -ERESTARTSYS: @@ -519,6 +528,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) } rpc_put_task(task); out_dput: + iput(inode); dput(sdentry); out: return error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0193053bc139..a057b4f45a46 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1375,12 +1375,9 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, int priority = flush_task_priority(how); task_setup_data->priority = priority; - rpc_ops->write_setup(hdr, msg); + rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client); trace_nfs_initiate_write(hdr->inode, hdr->io_start, hdr->good_bytes, hdr->args.stable); - - nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, - &task_setup_data->rpc_client, msg, hdr); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1669,14 +1666,11 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .priority = priority, }; /* Set up the initial task struct. 
*/ - nfs_ops->commit_setup(data, &msg); + nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); trace_nfs_initiate_commit(data); dprintk("NFS: initiated commit call\n"); - nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, - NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); - task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 70b8bf781fce..4fb1f72a25fb 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -121,13 +121,15 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, { loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; + struct timespec ts; int error; + ts = timespec64_to_timespec(inode->i_mtime); if (lcp->lc_mtime.tv_nsec == UTIME_NOW || - timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) - lcp->lc_mtime = current_time(inode); + timespec_compare(&lcp->lc_mtime, &ts) < 0) + lcp->lc_mtime = timespec64_to_timespec(current_time(inode)); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; - iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = timespec_to_timespec64(lcp->lc_mtime); if (new_size > i_size_read(inode)) { iattr.ia_valid |= ATTR_SIZE; @@ -216,18 +218,26 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, struct request_queue *q = bdev->bd_disk->queue; struct request *rq; struct scsi_request *req; - size_t bufflen = 252, len, id_len; + /* + * The allocation length (passed in bytes 3 and 4 of the INQUIRY + * command descriptor block) specifies the number of bytes that have + * been allocated for the data-in buffer. + * 252 is the highest one-byte value that is a multiple of 4. + * 65532 is the highest two-byte value that is a multiple of 4. 
+ */ + size_t bufflen = 252, maxlen = 65532, len, id_len; u8 *buf, *d, type, assoc; - int error; + int retries = 1, error; if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) return -EINVAL; +again: buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; - rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; @@ -255,6 +265,12 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, len = (buf[2] << 8) + buf[3] + 4; if (len > bufflen) { + if (len <= maxlen && retries--) { + blk_put_request(rq); + kfree(buf); + bufflen = len; + goto again; + } pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", len); goto out_put_request; diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 046b3f048757..b7559c6f2b97 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -67,11 +67,6 @@ enum { RC_REPLBUFF, }; -/* - * If requests are retransmitted within this interval, they're dropped. - */ -#define RC_DELAY (HZ/5) - /* Cache entries expire after this time period */ #define RC_EXPIRE (120 * HZ) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8ceb25a10ea0..a1143f7c2201 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -404,8 +404,9 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) if (fsloc->locations_count == 0) return 0; - fsloc->locations = kzalloc(fsloc->locations_count - * sizeof(struct nfsd4_fs_location), GFP_KERNEL); + fsloc->locations = kcalloc(fsloc->locations_count, + sizeof(struct nfsd4_fs_location), + GFP_KERNEL); if (!fsloc->locations) return -ENOMEM; for (i=0; i < fsloc->locations_count; i++) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 3192b544a441..9b973f4f7d01 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -165,6 +165,7 @@ static __be32 * encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { + struct timespec ts; *p++ = htonl(nfs3_ftypes[(stat->mode & 
S_IFMT) >> 12]); *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); @@ -180,9 +181,12 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl((u32) MINOR(stat->rdev)); p = encode_fsid(p, fhp); p = xdr_encode_hyper(p, stat->ino); - p = encode_time3(p, &stat->atime); - p = encode_time3(p, &stat->mtime); - p = encode_time3(p, &stat->ctime); + ts = timespec64_to_timespec(stat->atime); + p = encode_time3(p, &ts); + ts = timespec64_to_timespec(stat->mtime); + p = encode_time3(p, &ts); + ts = timespec64_to_timespec(stat->ctime); + p = encode_time3(p, &ts); return p; } @@ -271,8 +275,8 @@ void fill_pre_wcc(struct svc_fh *fhp) stat.size = inode->i_size; } - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_mtime = timespec64_to_timespec(stat.mtime); + fhp->fh_pre_ctime = timespec64_to_timespec(stat.ctime); fhp->fh_pre_size = stat.size; fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); fhp->fh_pre_saved = true; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 66eaeb1e8c2c..9c247fa1e959 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -510,8 +510,9 @@ nfs4_legacy_state_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; - nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) * - CLIENT_HASH_SIZE, GFP_KERNEL); + nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); if (!nn->reclaim_str_hashtbl) return -ENOMEM; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fc74d6f46bd5..857141446d6b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1807,8 +1807,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); if (clp->cl_name.data == NULL) goto err_no_name; - clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) * - OWNER_HASH_SIZE, GFP_KERNEL); + clp->cl_ownerstr_hashtbl = 
kmalloc_array(OWNER_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); if (!clp->cl_ownerstr_hashtbl) goto err_no_hashtbl; for (i = 0; i < OWNER_HASH_SIZE; i++) @@ -4378,8 +4379,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, spin_unlock(&state_lock); if (status) - destroy_unhashed_deleg(dp); + goto out_unlock; + return dp; +out_unlock: + vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); out_stid: @@ -7093,16 +7097,19 @@ static int nfs4_state_create_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; - nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) * - CLIENT_HASH_SIZE, GFP_KERNEL); + nn->conf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); if (!nn->conf_id_hashtbl) goto err; - nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) * - CLIENT_HASH_SIZE, GFP_KERNEL); + nn->unconf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); if (!nn->unconf_id_hashtbl) goto err_unconf_id; - nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * - SESSION_HASH_SIZE, GFP_KERNEL); + nn->sessionid_hashtbl = kmalloc_array(SESSION_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); if (!nn->sessionid_hashtbl) goto err_sessionid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1d048dd95464..a96843c59fc1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -320,6 +320,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, struct xdr_netobj *label, int *umask) { + struct timespec ts; int expected_len, len = 0; u32 dummy32; char *buf; @@ -421,7 +422,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_atime); + status = nfsd4_decode_time(argp, &ts); + iattr->ia_atime = 
timespec_to_timespec64(ts); if (status) return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -440,7 +442,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_mtime); + status = nfsd4_decode_time(argp, &ts); + iattr->ia_mtime = timespec_to_timespec64(ts); if (status) return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -1585,6 +1588,8 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, gdev->gd_maxcount = be32_to_cpup(p++); num = be32_to_cpup(p++); if (num) { + if (num > 1000) + goto xdr_error; READ_BUF(4 * num); gdev->gd_notify_types = be32_to_cpup(p++); for (i = 1; i < num; i++) { @@ -3651,7 +3656,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 nfserr = nfserr_resource; goto err_no_verf; } - maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX); + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(u32, readdir->rd_maxcount, maxcount); /* * Note the rfc defines rd_maxcount as the size of the * READDIR4resok structure, which includes the verifier above @@ -3665,7 +3671,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ if (!readdir->rd_dircount) - readdir->rd_dircount = INT_MAX; + readdir->rd_dircount = svc_max_payload(resp->rqstp); readdir->xdr = xdr; readdir->rd_maxcount = maxcount; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 334f2ad60704..dbdeb9d6af03 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -177,7 +177,8 @@ int nfsd_reply_cache_init(void) drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); if (!drc_hashtbl) { - drc_hashtbl = vzalloc(hashsize * sizeof(*drc_hashtbl)); + drc_hashtbl = vzalloc(array_size(hashsize, + sizeof(*drc_hashtbl))); if (!drc_hashtbl) goto out_nomem; } @@ -394,7 +395,6 @@ 
nfsd_cache_lookup(struct svc_rqst *rqstp) __wsum csum; u32 hash = nfsd_cache_hash(xid); struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; - unsigned long age; int type = rqstp->rq_cachetype; int rtn = RC_DOIT; @@ -461,12 +461,11 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) found_entry: nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ - age = jiffies - rp->c_timestamp; lru_put_end(b, rp); rtn = RC_DROPIT; - /* Request being processed or excessive rexmits */ - if (rp->c_state == RC_INPROG || age < RC_DELAY) + /* Request being processed */ + if (rp->c_state == RC_INPROG) goto out; /* From the hall of fame of impractical attacks: diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index a43e8260520a..6b2e8b73d36e 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -131,7 +131,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, { struct dentry *dentry = fhp->fh_dentry; int type; - struct timespec time; + struct timespec64 time; u32 f; type = (stat->mode & S_IFMT); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2410b093a2e6..b0555d7d8200 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1201,6 +1201,28 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + if (!host_err && unlikely(d_unhashed(dchild))) { + struct dentry *d; + d = lookup_one_len(dchild->d_name.name, + dchild->d_parent, + dchild->d_name.len); + if (IS_ERR(d)) { + host_err = PTR_ERR(d); + break; + } + if (unlikely(d_is_negative(d))) { + dput(d); + err = nfserr_serverfault; + goto out; + } + dput(resfhp->fh_dentry); + resfhp->fh_dentry = dget(d); + err = fh_update(resfhp); + dput(dchild); + dchild = d; + if (err) + goto out; + } break; case S_IFCHR: case S_IFBLK: diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1a2894aa0194..dd52d3f82e8d 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -46,8 +46,7 @@ static inline int nilfs_add_nondir(struct 
dentry *dentry, struct inode *inode) int err = nilfs_add_link(dentry, inode); if (!err) { - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -243,8 +242,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; nilfs_mark_inode_dirty(inode); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 63a1ca4b9dee..e2bea2ac5dfb 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -79,12 +79,11 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) */ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; @@ -95,7 +94,8 @@ static int dnotify_handle_event(struct fsnotify_group *group, if (!S_ISDIR(inode->i_mode)) return 0; - BUG_ON(vfsmount_mark); + if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) + return 0; dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -319,7 +319,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - error = fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); + error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { mutex_unlock(&dnotify_group->mark_mutex); goto out_err; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 
d94e8031fe5f..f90842efea13 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -87,17 +87,17 @@ static int fanotify_get_response(struct fsnotify_group *group, return ret; } -static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmnt_mark, - u32 event_mask, - const void *data, int data_type) +static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, + u32 event_mask, const void *data, + int data_type) { __u32 marks_mask = 0, marks_ignored_mask = 0; const struct path *path = data; + struct fsnotify_mark *mark; + int type; - pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" - " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, - event_mask, data, data_type); + pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", + __func__, iter_info->report_mask, event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -108,20 +108,21 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !d_can_lookup(path->dentry)) return false; - /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! - */ - if (inode_mark && - (!(event_mask & FS_EVENT_ON_CHILD) || - (inode_mark->mask & FS_EVENT_ON_CHILD))) { - marks_mask |= inode_mark->mask; - marks_ignored_mask |= inode_mark->ignored_mask; - } + fsnotify_foreach_obj_type(type) { + if (!fsnotify_iter_should_report_type(iter_info, type)) + continue; + mark = iter_info->marks[type]; + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! 
+ */ + if (type == FSNOTIFY_OBJ_TYPE_INODE && + (event_mask & FS_EVENT_ON_CHILD) && + !(mark->mask & FS_EVENT_ON_CHILD)) + continue; - if (vfsmnt_mark) { - marks_mask |= vfsmnt_mark->mask; - marks_ignored_mask |= vfsmnt_mark->ignored_mask; + marks_mask |= mark->mask; + marks_ignored_mask |= mark->ignored_mask; } if (d_is_dir(path->dentry) && @@ -178,8 +179,6 @@ init: __maybe_unused static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) @@ -199,8 +198,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, - data_type)) + if (!fanotify_should_send_event(iter_info, mask, data, data_type)) return 0; pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index d478629c728b..10aac1942c9f 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -77,7 +77,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) + if (mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); @@ -116,7 +116,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = igrab(mark->connector->inode); if (!inode) return; @@ -126,7 +126,7 @@ static void fanotify_fdinfo(struct seq_file *m, 
struct fsnotify_mark *mark) show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); - } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { struct mount *mnt = real_mount(mark->connector->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 613ec7e5a465..f174397b63a0 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -184,8 +184,6 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask EXPORT_SYMBOL_GPL(__fsnotify_parent); static int send_to_group(struct inode *to_tell, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, __u32 mask, const void *data, int data_is, u32 cookie, const unsigned char *file_name, @@ -195,48 +193,45 @@ static int send_to_group(struct inode *to_tell, __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); __u32 marks_mask = 0; __u32 marks_ignored_mask = 0; + struct fsnotify_mark *mark; + int type; - if (unlikely(!inode_mark && !vfsmount_mark)) { - BUG(); + if (WARN_ON(!iter_info->report_mask)) return 0; - } /* clear ignored on inode modification */ if (mask & FS_MODIFY) { - if (inode_mark && - !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - inode_mark->ignored_mask = 0; - if (vfsmount_mark && - !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - vfsmount_mark->ignored_mask = 0; - } - - /* does the inode mark tell us to do something? */ - if (inode_mark) { - group = inode_mark->group; - marks_mask |= inode_mark->mask; - marks_ignored_mask |= inode_mark->ignored_mask; + fsnotify_foreach_obj_type(type) { + if (!fsnotify_iter_should_report_type(iter_info, type)) + continue; + mark = iter_info->marks[type]; + if (mark && + !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) + mark->ignored_mask = 0; + } } - /* does the vfsmount_mark tell us to do something? 
*/ - if (vfsmount_mark) { - group = vfsmount_mark->group; - marks_mask |= vfsmount_mark->mask; - marks_ignored_mask |= vfsmount_mark->ignored_mask; + fsnotify_foreach_obj_type(type) { + if (!fsnotify_iter_should_report_type(iter_info, type)) + continue; + mark = iter_info->marks[type]; + /* does the object mark tell us to do something? */ + if (mark) { + group = mark->group; + marks_mask |= mark->mask; + marks_ignored_mask |= mark->ignored_mask; + } } - pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" - " vfsmount_mark=%p marks_mask=%x marks_ignored_mask=%x" + pr_debug("%s: group=%p to_tell=%p mask=%x marks_mask=%x marks_ignored_mask=%x" " data=%p data_is=%d cookie=%d\n", - __func__, group, to_tell, mask, inode_mark, vfsmount_mark, - marks_mask, marks_ignored_mask, data, - data_is, cookie); + __func__, group, to_tell, mask, marks_mask, marks_ignored_mask, + data, data_is, cookie); if (!(test_mask & marks_mask & ~marks_ignored_mask)) return 0; - return group->ops->handle_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, data_is, + return group->ops->handle_event(group, to_tell, mask, data, data_is, file_name, cookie, iter_info); } @@ -264,6 +259,57 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) } /* + * iter_info is a multi head priority queue of marks. + * Pick a subset of marks from queue heads, all with the + * same group and set the report_mask for selected subset. + * Returns the report_mask of the selected subset. 
+ */ +static unsigned int fsnotify_iter_select_report_types( + struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *max_prio_group = NULL; + struct fsnotify_mark *mark; + int type; + + /* Choose max prio group among groups of all queue heads */ + fsnotify_foreach_obj_type(type) { + mark = iter_info->marks[type]; + if (mark && + fsnotify_compare_groups(max_prio_group, mark->group) > 0) + max_prio_group = mark->group; + } + + if (!max_prio_group) + return 0; + + /* Set the report mask for marks from same group as max prio group */ + iter_info->report_mask = 0; + fsnotify_foreach_obj_type(type) { + mark = iter_info->marks[type]; + if (mark && + fsnotify_compare_groups(max_prio_group, mark->group) == 0) + fsnotify_iter_set_report_type(iter_info, type); + } + + return iter_info->report_mask; +} + +/* + * Pop from iter_info multi head queue, the marks that were iterated in the + * current iteration step. + */ +static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) +{ + int type; + + fsnotify_foreach_obj_type(type) { + if (fsnotify_iter_should_report_type(iter_info, type)) + iter_info->marks[type] = + fsnotify_next_mark(iter_info->marks[type]); + } +} + +/* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call * out to all of the registered fsnotify_group. 
Those groups can then use the @@ -307,15 +353,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if ((mask & FS_MODIFY) || (test_mask & to_tell->i_fsnotify_mask)) { - iter_info.inode_mark = + iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&to_tell->i_fsnotify_marks); } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - iter_info.inode_mark = + iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&to_tell->i_fsnotify_marks); - iter_info.vfsmount_mark = + iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } @@ -324,32 +370,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * ignore masks are properly reflected for mount mark notifications. * That's why this traversal is so complicated... */ - while (iter_info.inode_mark || iter_info.vfsmount_mark) { - struct fsnotify_mark *inode_mark = iter_info.inode_mark; - struct fsnotify_mark *vfsmount_mark = iter_info.vfsmount_mark; - - if (inode_mark && vfsmount_mark) { - int cmp = fsnotify_compare_groups(inode_mark->group, - vfsmount_mark->group); - if (cmp > 0) - inode_mark = NULL; - else if (cmp < 0) - vfsmount_mark = NULL; - } - - ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, - data, data_is, cookie, file_name, - &iter_info); + while (fsnotify_iter_select_report_types(&iter_info)) { + ret = send_to_group(to_tell, mask, data, data_is, cookie, + file_name, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; - if (inode_mark) - iter_info.inode_mark = - fsnotify_next_mark(iter_info.inode_mark); - if (vfsmount_mark) - iter_info.vfsmount_mark = - fsnotify_next_mark(iter_info.vfsmount_mark); + fsnotify_iter_next(&iter_info); } ret = 0; out: diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 60f365dc1408..34515d2c4ba3 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -9,12 +9,6 @@ #include "../mount.h" -struct 
fsnotify_iter_info { - struct fsnotify_mark *inode_mark; - struct fsnotify_mark *vfsmount_mark; - int srcu_idx; -}; - /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); diff --git a/fs/notify/group.c b/fs/notify/group.c index b7a4b6a69efa..aa5468f23e45 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -67,7 +67,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) fsnotify_group_stop_queueing(group); /* Clear all marks for this group and queue them for destruction */ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES_MASK); /* * Some marks can still be pinned when waiting for response from diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index c00d2caca894..7e4578d35b61 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -25,8 +25,6 @@ extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 40dedb37a1f3..9ab6dde38a14 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -65,12 +65,11 @@ static int inotify_merge(struct list_head *list, int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct 
inotify_inode_mark *i_mark; struct inotify_event_info *event; struct fsnotify_event *fsn_event; @@ -78,7 +77,8 @@ int inotify_handle_event(struct fsnotify_group *group, int len = 0; int alloc_len = sizeof(struct inotify_event_info); - BUG_ON(vfsmount_mark); + if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) + return 0; if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ef32f3657958..1cf5b779d862 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -485,10 +485,14 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; + struct fsnotify_iter_info iter_info = { }; + + fsnotify_iter_set_report_type_mark(&iter_info, FSNOTIFY_OBJ_TYPE_INODE, + fsn_mark); /* Queue ignore event for the watch */ - inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL); + inotify_handle_event(group, NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, &iter_info); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -578,7 +582,7 @@ static int inotify_new_watch(struct fsnotify_group *group, } /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0); + ret = fsnotify_add_inode_mark_locked(&tmp_i_mark->fsn_mark, inode, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index e9191b416434..61f4c5fa34c7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -119,9 +119,9 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) new_mask |= mark->mask; } - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + if (conn->type 
== FSNOTIFY_OBJ_TYPE_INODE) conn->inode->i_fsnotify_mask = new_mask; - else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; } @@ -139,7 +139,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) spin_lock(&conn->lock); __fsnotify_recalc_mask(conn); spin_unlock(&conn->lock); - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) __fsnotify_update_child_dentry_flags(conn->inode); } @@ -166,18 +166,18 @@ static struct inode *fsnotify_detach_connector_from_object( { struct inode *inode = NULL; - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = conn->inode; rcu_assign_pointer(inode->i_fsnotify_marks, NULL); inode->i_fsnotify_mask = 0; conn->inode = NULL; - conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; - } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, NULL); real_mount(conn->mnt)->mnt_fsnotify_mask = 0; conn->mnt = NULL; - conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; } return inode; @@ -294,12 +294,12 @@ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) { - /* This can fail if mark is being removed */ - if (!fsnotify_get_mark_safe(iter_info->inode_mark)) - return false; - if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) { - fsnotify_put_mark_wake(iter_info->inode_mark); - return false; + int type; + + fsnotify_foreach_obj_type(type) { + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->marks[type])) + goto fail; } /* @@ -310,13 +310,20 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) srcu_read_unlock(&fsnotify_mark_srcu, 
iter_info->srcu_idx); return true; + +fail: + for (type--; type >= 0; type--) + fsnotify_put_mark_wake(iter_info->marks[type]); + return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) { + int type; + iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); - fsnotify_put_mark_wake(iter_info->inode_mark); - fsnotify_put_mark_wake(iter_info->vfsmount_mark); + fsnotify_foreach_obj_type(type) + fsnotify_put_mark_wake(iter_info->marks[type]); } /* @@ -442,10 +449,10 @@ static int fsnotify_attach_connector_to_object( spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); if (inode) { - conn->flags = FSNOTIFY_OBJ_TYPE_INODE; + conn->type = FSNOTIFY_OBJ_TYPE_INODE; conn->inode = igrab(inode); } else { - conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; conn->mnt = mnt; } /* @@ -479,8 +486,7 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector( if (!conn) goto out; spin_lock(&conn->lock); - if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | - FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { + if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; @@ -646,16 +652,16 @@ struct fsnotify_mark *fsnotify_find_mark( return NULL; } -/* Clear any marks in a group with given type */ +/* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, - unsigned int type) + unsigned int type_mask) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. 
*/ - if (type == FSNOTIFY_OBJ_ALL_TYPES) { + if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) { head = &group->marks_list; goto clear; } @@ -670,7 +676,7 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->connector->flags & type) + if ((1U << mark->connector->type) & type_mask) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f8eb04387ca4..fbd0090d7d0c 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -527,7 +527,7 @@ int ntfs_read_compressed_block(struct page *page) BUG_ON(ni->type != AT_DATA); BUG_ON(ni->name_len); - pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS); + pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); /* Allocate memory to store the buffer heads we need. */ bhs_size = cb_size / block_size * sizeof(struct buffer_head *); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1c1ee489284b..decaf75d1cd5 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = ntfs2utc(si->last_data_change_time); + vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time)); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = ntfs2utc(si->last_mft_change_time); + vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time)); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. 
*/ - vi->i_atime = ntfs2utc(si->last_access_time); + vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time)); /* Find the attribute list attribute if present. */ ntfs_attr_reinit_search_ctx(ctx); @@ -2804,11 +2804,11 @@ done: * for real. */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec now = current_time(VFS_I(base_ni)); + struct timespec64 now = current_time(VFS_I(base_ni)); int sync_it = 0; - if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || - !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) sync_it = 1; VFS_I(base_ni)->i_mtime = now; VFS_I(base_ni)->i_ctime = now; @@ -2923,14 +2923,14 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) } } if (ia_valid & ATTR_ATIME) - vi->i_atime = timespec_trunc(attr->ia_atime, - vi->i_sb->s_time_gran); + vi->i_atime = timespec64_trunc(attr->ia_atime, + vi->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - vi->i_mtime = timespec_trunc(attr->ia_mtime, - vi->i_sb->s_time_gran); + vi->i_mtime = timespec64_trunc(attr->ia_mtime, + vi->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - vi->i_ctime = timespec_trunc(attr->ia_ctime, - vi->i_sb->s_time_gran); + vi->i_ctime = timespec64_trunc(attr->ia_ctime, + vi->i_sb->s_time_gran); mark_inode_dirty(vi); out: return err; @@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. 
*/ - nt = utc2ntfs(vi->i_mtime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime)); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_ctime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime)); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_atime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_atime)); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 91a8889abf9b..ea8c551bcd7e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,16 +570,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { - mlog(ML_ERROR, "Adding page[%d] to bio failed, " - "page %p, len %d, vec_len %u, vec_start %u, " - "bi_sector %llu\n", current_page, page, len, - vec_len, vec_start, - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - bio = ERR_PTR(-EIO); - return bio; - } + if (len != vec_len) break; cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index e5076185cc1e..1296f78ae966 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1078,7 +1078,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_container(&nst, sc); veclen = caller_veclen + 1; - vec = kmalloc(sizeof(struct kvec) * veclen, 
GFP_ATOMIC); + vec = kmalloc_array(veclen, sizeof(struct kvec), GFP_ATOMIC); if (vec == NULL) { mlog(0, "failed to %zu element kvec!\n", veclen); ret = -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 425081be6161..2acd58ba9b7b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -86,7 +86,7 @@ static void dlm_free_pagevec(void **vec, int pages) static void **dlm_alloc_pagevec(int pages) { - void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + void **vec = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); int i; if (!vec) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972efab83..0ff424c6d17c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -788,35 +788,34 @@ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, spin_unlock(&lockres->l_lock); } -static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, - struct ocfs2_lock_holder *oh) -{ - spin_lock(&lockres->l_lock); - list_del(&oh->oh_list); - spin_unlock(&lockres->l_lock); - - put_pid(oh->oh_owner_pid); -} - -static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +static struct ocfs2_lock_holder * +ocfs2_pid_holder(struct ocfs2_lock_res *lockres, + struct pid *pid) { struct ocfs2_lock_holder *oh; - struct pid *pid; - /* look in the list of holders for one with the current task as owner */ spin_lock(&lockres->l_lock); - pid = task_pid(current); list_for_each_entry(oh, &lockres->l_holders, oh_list) { if (oh->oh_owner_pid == pid) { spin_unlock(&lockres->l_lock); - return 1; + return oh; } } spin_unlock(&lockres->l_lock); + return NULL; +} - return 0; +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); } + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2141,6 +2140,7 @@ static void 
__ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2161,12 +2161,15 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + ts = timespec64_to_timespec(inode->i_atime); lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = timespec64_to_timespec(inode->i_ctime); lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = timespec64_to_timespec(inode->i_mtime); lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2184,6 +2187,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec, static void ocfs2_refresh_inode_from_lvb(struct inode *inode) { + struct timespec ts; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; @@ -2211,12 +2215,15 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, + inode->i_atime = timespec_to_timespec64(ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, + inode->i_mtime = timespec_to_timespec64(ts); + 
ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode->i_ctime = timespec_to_timespec64(ts); spin_unlock(&oi->ip_lock); } @@ -2610,34 +2617,93 @@ void ocfs2_inode_unlock(struct inode *inode, * * return < 0 on error, return == 0 if there's no lock holder on the stack * before this call, return == 1 if this call would be a recursive locking. + * return == -1 if this lock attempt will cause an upgrade which is forbidden. + * + * When taking lock levels into account,we face some different situations. + * + * 1. no lock is held + * In this case, just lock the inode as requested and return 0 + * + * 2. We are holding a lock + * For this situation, things diverges into several cases + * + * wanted holding what to do + * ex ex see 2.1 below + * ex pr see 2.2 below + * pr ex see 2.1 below + * pr pr see 2.1 below + * + * 2.1 lock level that is been held is compatible + * with the wanted level, so no lock action will be tacken. + * + * 2.2 Otherwise, an upgrade is needed, but it is forbidden. + * + * Reason why upgrade within a process is forbidden is that + * lock upgrade may cause dead lock. The following illustrates + * how it happens. 
+ * + * thread on node1 thread on node2 + * ocfs2_inode_lock_tracker(ex=0) + * + * <====== ocfs2_inode_lock_tracker(ex=1) + * + * ocfs2_inode_lock_tracker(ex=1) */ int ocfs2_inode_lock_tracker(struct inode *inode, struct buffer_head **ret_bh, int ex, struct ocfs2_lock_holder *oh) { - int status; - int arg_flags = 0, has_locked; + int status = 0; struct ocfs2_lock_res *lockres; + struct ocfs2_lock_holder *tmp_oh; + struct pid *pid = task_pid(current); + lockres = &OCFS2_I(inode)->ip_inode_lockres; - has_locked = ocfs2_is_locked_by_me(lockres); - /* Just get buffer head if the cluster lock has been taken */ - if (has_locked) - arg_flags = OCFS2_META_LOCK_GETBH; + tmp_oh = ocfs2_pid_holder(lockres, pid); - if (likely(!has_locked || ret_bh)) { - status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (!tmp_oh) { + /* + * This corresponds to the case 1. + * We haven't got any lock before. + */ + status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); return status; } - } - if (!has_locked) + + oh->oh_ex = ex; ocfs2_add_holder(lockres, oh); + return 0; + } - return has_locked; + if (unlikely(ex && !tmp_oh->oh_ex)) { + /* + * case 2.2 upgrade may cause dead lock, forbid it. + */ + mlog(ML_ERROR, "Recursive locking is not permitted to " + "upgrade to EX level from PR level.\n"); + dump_stack(); + return -EINVAL; + } + + /* + * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full. + * ignore the lock level and just update it. + */ + if (ret_bh) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, + OCFS2_META_LOCK_GETBH); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + return tmp_oh ? 
1 : 0; } void ocfs2_inode_unlock_tracker(struct inode *inode, @@ -2649,12 +2715,13 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, lockres = &OCFS2_I(inode)->ip_inode_lockres; /* had_lock means that the currect process already takes the cluster - * lock previously. If had_lock is 1, we have nothing to do here, and - * it will get unlocked where we got the lock. + * lock previously. + * If had_lock is 1, we have nothing to do here. + * If had_lock is 0, we will release the lock. */ if (!had_lock) { + ocfs2_inode_unlock(inode, oh->oh_ex); ocfs2_remove_holder(lockres, oh); - ocfs2_inode_unlock(inode, ex); } } diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 256e0a9067b8..4ec1c828f6e0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -96,6 +96,7 @@ struct ocfs2_trim_fs_info { struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; + int oh_ex; }; /* ocfs2_inode_lock_full() 'arg_flags' flags */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6ee94bc23f5b..255f758af03a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -222,7 +222,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt) { - struct timespec now; + struct timespec64 now; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) @@ -248,8 +248,8 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { - if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) return 1; return 0; @@ -563,8 +563,8 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, return ret; } -static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 
clusters_to_add, int mark_unwritten) +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; @@ -1035,8 +1035,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, clusters_to_add -= oi->ip_clusters; if (clusters_to_add) { - ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, - clusters_to_add, 0); + ret = ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); if (ret) { mlog_errno(ret); goto out; @@ -1493,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, goto next; } - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1fdc9839cd93..7eb7f03531f6 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -65,8 +65,6 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index ab30c005cc4b..994726ada857 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -402,7 +402,7 @@ out_err: static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { - int index; + u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5dcea6cee5f..bd3475694e83 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1383,7 +1383,7 @@ static int 
__ocfs2_recovery_thread(void *arg) goto bail; } - rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + rm_quota = kcalloc(osb->max_slots, sizeof(int), GFP_NOFS); if (!rm_quota) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index fb9a20e3d608..05220b365fb9 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,11 +44,11 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_fault *vmf) +static vm_fault_t ocfs2_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; sigset_t oldset; - int ret; + vm_fault_t ret; ocfs2_block_signals(&oldset); ret = filemap_fault(vmf); @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, - struct page *page) +static vm_fault_t __ocfs2_page_mkwrite(struct file *file, + struct buffer_head *di_bh, struct page *page) { - int ret = VM_FAULT_NOPAGE; + int err; + vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -105,15 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (page->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, &locked_page, &fsdata, di_bh, page); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + if (err) { + if (err != -ENOSPC) + mlog_errno(err); + ret = vmf_error(err); goto out; } @@ -121,20 +119,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); - BUG_ON(ret != len); + err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); + BUG_ON(err != len); ret = 
VM_FAULT_LOCKED; out: return ret; } -static int ocfs2_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; - int ret; + int err; + vm_fault_t ret; sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); @@ -144,13 +143,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + err = ocfs2_inode_lock(inode, &di_bh, 1); + if (err < 0) { + mlog_errno(err); + ret = vmf_error(err); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8dd6f703c819..b7ca84bc3df7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2332,8 +2332,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio) { - const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; - char name[namelen + 1]; + char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5bb4a89f9045..7071ad0dec90 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -807,11 +807,11 @@ struct ocfs2_dir_block_trailer { * in this block. 
(unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; - __le64 db_free_next; /* Next block in list (unused) */ -/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ - __le64 db_parent_dinode; /* dinode which owns me, in +/*20*/ __le64 db_free_next; /* Next block in list (unused) */ + __le64 db_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ -/*30*/ struct ocfs2_block_check db_check; /* Error checking */ + struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index af155c183123..5965f3878d49 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -69,10 +69,11 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); if (unlikely(!local_system_inodes)) { - local_system_inodes = kzalloc(sizeof(struct inode *) * - NUM_LOCAL_SYSTEM_INODES * - osb->max_slots, - GFP_NOFS); + local_system_inodes = + kzalloc(array3_size(sizeof(struct inode *), + NUM_LOCAL_SYSTEM_INODES, + osb->max_slots), + GFP_NOFS); if (!local_system_inodes) { mlog_errno(-ENOMEM); /* diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index b7146526afff..4bee3a72b9f3 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -305,11 +305,10 @@ static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, ino_t ino = be64_to_cpu(oi->i_head.h_self); brelse(bh); inode = omfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + } else if (bh != ERR_PTR(-ENOENT)) { + inode = ERR_CAST(bh); } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } /* sanity check block's self pointer */ diff --git a/fs/open.c b/fs/open.c index c5ee7cd60424..d0e955b558ad 100644 --- a/fs/open.c +++ b/fs/open.c @@ -724,6 +724,16 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +int open_check_o_direct(struct file *f) 
+{ + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + return 0; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *), @@ -745,7 +755,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; - goto done; + return 0; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { @@ -798,12 +808,7 @@ static int do_dentry_open(struct file *f, f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); -done: - /* NB: we're sure to have correct a_ops only after f_op->open */ - error = -EINVAL; - if ((f->f_flags & O_DIRECT) && - (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)) - goto out_fput; + return 0; cleanup_all: @@ -818,9 +823,6 @@ cleanup_file: f->f_path.dentry = NULL; f->f_inode = NULL; return error; -out_fput: - fput(f); - return error; } /** @@ -918,14 +920,20 @@ struct file *dentry_open(const struct path *path, int flags, BUG_ON(!path->mnt); f = get_empty_filp(); - if (IS_ERR(f)) - return f; - - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (error) { - put_filp(f); - return ERR_PTR(error); + if (!IS_ERR(f)) { + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } } return f; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 2200662a9bf1..607092f367ad 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -256,8 +256,7 @@ found: break; } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int openpromfs_readdir(struct file *file, struct dir_context 
*ctx) diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 66369ec90020..33ee8cb32f83 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -281,14 +281,17 @@ restart: ret = copy_to_user(buf, &proto_ver, sizeof(__s32)); if (ret != 0) goto error; - ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32)); + ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32)); if (ret != 0) goto error; - ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64)); + ret = copy_to_user(buf + 2 * sizeof(__s32), + &cur_op->tag, + sizeof(__u64)); if (ret != 0) goto error; - ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall, - sizeof(struct orangefs_upcall_s)); + ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64), + &cur_op->upcall, + sizeof(struct orangefs_upcall_s)); if (ret != 0) goto error; @@ -381,7 +384,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, (unsigned int) MAX_DEV_REQ_DOWNSIZE); return -EFAULT; } - + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; @@ -426,7 +429,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto wakeup; /* - * We've successfully peeled off the head and the downcall. + * We've successfully peeled off the head and the downcall. * Something has gone awry if total doesn't equal the * sum of head_size, downcall_size and trailer_size. */ @@ -477,7 +480,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, wakeup: /* * Return to vfs waitqueue, and back to service_operation - * through wait_for_matching_downcall. + * through wait_for_matching_downcall. 
*/ spin_lock(&op->lock); if (unlikely(op_is_cancel(op))) { @@ -716,37 +719,6 @@ struct ORANGEFS_dev_map_desc32 { __s32 count; }; -static unsigned long translate_dev_map26(unsigned long args, long *error) -{ - struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args; - /* - * Depending on the architecture, allocate some space on the - * user-call-stack based on our expected layout. - */ - struct ORANGEFS_dev_map_desc __user *p = - compat_alloc_user_space(sizeof(*p)); - compat_uptr_t addr; - - *error = 0; - /* get the ptr from the 32 bit user-space */ - if (get_user(addr, &p32->ptr)) - goto err; - /* try to put that into a 64-bit layout */ - if (put_user(compat_ptr(addr), &p->ptr)) - goto err; - /* copy the remaining fields */ - if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32))) - goto err; - if (copy_in_user(&p->size, &p32->size, sizeof(__s32))) - goto err; - if (copy_in_user(&p->count, &p32->count, sizeof(__s32))) - goto err; - return (unsigned long)p; -err: - *error = -EFAULT; - return 0; -} - /* * 32 bit user-space apps' ioctl handlers when kernel modules * is compiled as a 64 bit one @@ -755,25 +727,26 @@ static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long args) { long ret; - unsigned long arg = args; /* Check for properly constructed commands */ ret = check_ioctl_command(cmd); if (ret < 0) return ret; if (cmd == ORANGEFS_DEV_MAP) { - /* - * convert the arguments to what we expect internally - * in kernel space - */ - arg = translate_dev_map26(args, &ret); - if (ret < 0) { - gossip_err("Could not translate dev map\n"); - return ret; - } + struct ORANGEFS_dev_map_desc desc; + struct ORANGEFS_dev_map_desc32 d32; + + if (copy_from_user(&d32, (void __user *)args, sizeof(d32))) + return -EFAULT; + + desc.ptr = compat_ptr(d32.ptr); + desc.total_size = d32.total_size; + desc.size = d32.size; + desc.count = d32.count; + return orangefs_bufmap_initialize(&desc); } /* no other ioctl requires translation */ - 
return dispatch_ioctl_command(cmd, arg); + return dispatch_ioctl_command(cmd, args); } #endif /* CONFIG_COMPAT is in .config */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 26358efbf794..db0b52187cbc 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -162,7 +162,7 @@ populate_shared_memory: else ret = 0; break; - /* + /* * If the op was in progress when the interrupt * occurred, then the client-core was able to * trigger the write. @@ -544,7 +544,7 @@ static int orangefs_fault(struct vm_fault *vmf) return filemap_fault(vmf); } -const struct vm_operations_struct orangefs_file_vm_ops = { +static const struct vm_operations_struct orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 79c61da8b1bc..6e4d2af8f5bc 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -20,8 +20,8 @@ static int read_one_page(struct page *page) int max_block; ssize_t bytes_read = 0; struct inode *inode = page->mapping->host; - const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */ - const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */ + const __u32 blocksize = PAGE_SIZE; + const __u32 blockbits = PAGE_SHIFT; struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; @@ -181,16 +181,15 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) new_op->upcall.req.truncate.refn = orangefs_inode->refn; new_op->upcall.req.truncate.size = (__s64) iattr->ia_size; - ret = service_operation(new_op, __func__, - get_interruptible_flag(inode)); + ret = service_operation(new_op, + __func__, + get_interruptible_flag(inode)); /* * the truncate has no downcall members to retrieve, but * the status value tells us if it went through ok or not */ - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs: orangefs_truncate got return value of %d\n", - ret); + gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, 
ret); op_release(new_op); @@ -212,8 +211,9 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: called on %pd\n", - dentry); + "%s: called on %pd\n", + __func__, + dentry); ret = setattr_prepare(dentry, iattr); if (ret) @@ -230,15 +230,16 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) ret = orangefs_inode_setattr(inode, iattr); gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: inode_setattr returned %d\n", - ret); + "%s: orangefs_inode_setattr returned %d\n", + __func__, + ret); if (!ret && (iattr->ia_valid & ATTR_MODE)) /* change mod on a file that has ACLs */ ret = posix_acl_chmod(inode, inode->i_mode); out: - gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret); + gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); return ret; } @@ -262,13 +263,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); - stat->blksize = orangefs_inode->blksize; if (request_mask & STATX_SIZE) stat->result_mask = STATX_BASIC_STATS; else stat->result_mask = STATX_BASIC_STATS & ~STATX_SIZE; + + stat->attributes_mask = STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND; + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; } return ret; } @@ -290,7 +297,7 @@ int orangefs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } -int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) +int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) { struct iattr iattr; gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", @@ -306,7 +313,7 @@ int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) return orangefs_inode_setattr(inode, 
&iattr); } -/* ORANGEDS2 implementation of VFS inode operations for files */ +/* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, @@ -325,7 +332,6 @@ static int orangefs_init_iops(struct inode *inode) case S_IFREG: inode->i_op = &orangefs_file_inode_operations; inode->i_fop = &orangefs_file_operations; - inode->i_blkbits = PAGE_SHIFT; break; case S_IFLNK: inode->i_op = &orangefs_symlink_inode_operations; @@ -345,8 +351,8 @@ static int orangefs_init_iops(struct inode *inode) } /* - * Given a ORANGEFS object identifier (fsid, handle), convert it into a ino_t type - * that will be used as a hash-index from where the handle will + * Given an ORANGEFS object identifier (fsid, handle), convert it into + * a ino_t type that will be used as a hash-index from where the handle will * be searched for in the VFS hash table of inodes. */ static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref) @@ -376,8 +382,10 @@ static int orangefs_test_inode(struct inode *inode, void *data) struct orangefs_inode_s *orangefs_inode = NULL; orangefs_inode = ORANGEFS_I(inode); - return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle)) - && orangefs_inode->refn.fs_id == ref->fs_id); + /* test handles and fs_ids... */ + return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), + &(ref->khandle)) && + orangefs_inode->refn.fs_id == ref->fs_id); } /* @@ -385,16 +393,21 @@ static int orangefs_test_inode(struct inode *inode, void *data) * file handle. * * @sb: the file system super block instance. - * @ref: The ORANGEFS object for which we are trying to locate an inode structure. + * @ref: The ORANGEFS object for which we are trying to locate an inode. 
*/ -struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref) +struct inode *orangefs_iget(struct super_block *sb, + struct orangefs_object_kref *ref) { struct inode *inode = NULL; unsigned long hash; int error; hash = orangefs_handle_hash(ref); - inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref); + inode = iget5_locked(sb, + hash, + orangefs_test_inode, + orangefs_set_inode, + ref); if (!inode || !(inode->i_state & I_NEW)) return inode; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 6e3134e6d98a..625b0580f9be 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -75,8 +75,7 @@ static int orangefs_create(struct inode *dir, get_khandle_from_ino(inode), dentry); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -111,7 +110,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; struct inode *inode; - struct dentry *res; int ret = -EINVAL; /* @@ -159,65 +157,18 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, new_op->downcall.resp.lookup.refn.fs_id, ret); - if (ret < 0) { - if (ret == -ENOENT) { - /* - * if no inode was found, add a negative dentry to - * dcache anyway; if we don't, we don't hold expected - * lookup semantics and we most noticeably break - * during directory renames. - * - * however, if the operation failed or exited, do not - * add the dentry (e.g. 
in the case that a touch is - * issued on a file that already exists that was - * interrupted during this lookup -- no need to add - * another negative dentry for an existing file) - */ - - gossip_debug(GOSSIP_NAME_DEBUG, - "orangefs_lookup: Adding *negative* dentry " - "%p for %pd\n", - dentry, - dentry); - - d_add(dentry, NULL); - res = NULL; - goto out; - } - + if (ret >= 0) { + orangefs_set_timeout(dentry); + inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); + } else if (ret == -ENOENT) { + inode = NULL; + } else { /* must be a non-recoverable error */ - res = ERR_PTR(ret); - goto out; - } - - orangefs_set_timeout(dentry); - - inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); - if (IS_ERR(inode)) { - gossip_debug(GOSSIP_NAME_DEBUG, - "error %ld from iget\n", PTR_ERR(inode)); - res = ERR_CAST(inode); - goto out; + inode = ERR_PTR(ret); } - gossip_debug(GOSSIP_NAME_DEBUG, - "%s:%s:%d " - "Found good inode [%lu] with count [%d]\n", - __FILE__, - __func__, - __LINE__, - inode->i_ino, - (int)atomic_read(&inode->i_count)); - - /* update dentry/inode pair into dcache */ - res = d_splice_alias(inode, dentry); - - gossip_debug(GOSSIP_NAME_DEBUG, - "Lookup success (inode ct = %d)\n", - (int)atomic_read(&inode->i_count)); -out: op_release(new_op); - return res; + return d_splice_alias(inode, dentry); } /* return 0 on success; non-zero otherwise */ @@ -327,13 +278,19 @@ static int orangefs_symlink(struct inode *dir, ret = PTR_ERR(inode); goto out; } + /* + * This is necessary because orangefs_inode_getattr will not + * re-read symlink size as it is impossible for it to change. + * Invalidating the cache does not help. orangefs_new_inode + * does not set the correct size (it does not know symname). 
+ */ + inode->i_size = strlen(symname); gossip_debug(GOSSIP_NAME_DEBUG, "Assigned symlink inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -402,8 +359,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode "Assigned dir inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 4f927023d095..c4e98c9c1621 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -138,7 +138,7 @@ static int get(struct slot_map *m) /* used to describe mapped buffers */ struct orangefs_bufmap_desc { - void *uaddr; /* user space address pointer */ + void __user *uaddr; /* user space address pointer */ struct page **page_array; /* array of mapped pages */ int array_count; /* size of above arrays */ struct list_head list_link; @@ -184,7 +184,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap) } /* - * XXX: Can the size and shift change while the caller gives up the + * XXX: Can the size and shift change while the caller gives up the * XXX: lock between calling this and doing something useful? */ @@ -215,20 +215,6 @@ int orangefs_bufmap_shift_query(void) static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); -/* - * orangefs_get_bufmap_init - * - * If bufmap_init is 1, then the shared memory system, including the - * buffer_index_array, is available. Otherwise, it is not. 
- * - * returns the value of bufmap_init - */ -int orangefs_get_bufmap_init(void) -{ - return __orangefs_bufmap ? 1 : 0; -} - - static struct orangefs_bufmap * orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) { @@ -496,7 +482,7 @@ void orangefs_readdir_index_put(int buffer_index) } /* - * we've been handed an iovec, we need to copy it to + * we've been handed an iovec, we need to copy it to * the shared memory descriptor at "buffer_index". */ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 6e35f2f3c897..0732cb08173e 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -114,7 +114,7 @@ static const struct seq_operations help_debug_ops = { .show = help_show, }; -const struct file_operations debug_help_fops = { +static const struct file_operations debug_help_fops = { .owner = THIS_MODULE, .open = orangefs_debug_help_open, .read = seq_read, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index c29bb0ebc6bb..17b24ad6b264 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -182,7 +182,6 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op) struct orangefs_inode_s { struct orangefs_object_kref refn; char link_target[ORANGEFS_NAME_MAX]; - __s64 blksize; /* * Reading/Writing Extended attributes need to acquire the appropriate * reader/writer semaphore on the orangefs_inode_s structure. 
@@ -343,7 +342,7 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, int orangefs_permission(struct inode *inode, int mask); -int orangefs_update_time(struct inode *, struct timespec *, int); +int orangefs_update_time(struct inode *, struct timespec64 *, int); /* * defined in xattr.c diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 079a465796f3..dd28079f518c 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Documentation/ABI/stable/orangefs-sysfs: + * Documentation/ABI/stable/sysfs-fs-orangefs: * * What: /sys/fs/orangefs/perf_counter_reset * Date: June 2015 diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 00fadaf0da8f..804c8a261e4b 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -183,9 +183,9 @@ static inline int copy_attributes_from_inode(struct inode *inode, attrs->mask |= ORANGEFS_ATTR_SYS_CTIME; /* - * ORANGEFS cannot set size with a setattr operation. Probably not likely - * to be requested through the VFS, but just in case, don't worry about - * ATTR_SIZE + * ORANGEFS cannot set size with a setattr operation. 
Probably not + * likely to be requested through the VFS, but just in case, don't + * worry about ATTR_SIZE */ if (iattr->ia_valid & ATTR_MODE) { @@ -200,14 +200,16 @@ static inline int copy_attributes_from_inode(struct inode *inode, tmp_mode -= S_ISVTX; } else { gossip_debug(GOSSIP_UTILS_DEBUG, - "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); + "%s: setting sticky bit not supported.\n", + __func__); return -EINVAL; } } if (tmp_mode & (S_ISUID)) { gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting to set setuid bit (not supported); returning EINVAL.\n"); + "%s: setting setuid bit not supported.\n", + __func__); return -EINVAL; } @@ -275,7 +277,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; - loff_t inode_size, rounded_up_size; + loff_t inode_size; int ret, type; gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, @@ -330,22 +332,19 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, if (request_mask & STATX_SIZE || new) { inode_size = (loff_t)new_op-> downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; + inode->i_blkbits = ffs(new_op->downcall.resp.getattr. 
+ attributes.blksize); spin_lock(&inode->i_lock); inode->i_bytes = inode_size; inode->i_blocks = - (unsigned long)(rounded_up_size / 512); + (inode_size + 512 - inode_size % 512)/512; spin_unlock(&inode->i_lock); } break; case S_IFDIR: if (request_mask & STATX_SIZE || new) { inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); spin_unlock(&inode->i_lock); @@ -356,7 +355,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, if (new) { inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); - orangefs_inode->blksize = i_blocksize(inode); ret = strscpy(orangefs_inode->link_target, new_op->downcall.resp.getattr.link_target, ORANGEFS_NAME_MAX); @@ -525,7 +523,9 @@ int orangefs_normalize_to_errno(__s32 error_code) error_code = -ETIMEDOUT; } else { /* assume a default error code */ - gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code); + gossip_err("%s: bad error code :%d:.\n", + __func__, + error_code); error_code = -EINVAL; } @@ -542,7 +542,7 @@ int orangefs_normalize_to_errno(__s32 error_code) * there is a bug somewhere. */ } else { - gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n"); + gossip_err("%s: unknown error code.\n", __func__); error_code = -EINVAL; } return error_code; diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 61ee8d64c842..d403cf29a99b 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -342,7 +342,7 @@ enum { * that may be 32 bit! 
*/ struct ORANGEFS_dev_map_desc { - void *ptr; + void __user *ptr; __s32 total_size; __s32 size; __s32 count; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 10796d3fe27d..dfaee90d30bd 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -156,9 +156,10 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) sb = dentry->d_sb; gossip_debug(GOSSIP_SUPER_DEBUG, - "orangefs_statfs: called on sb %p (fs_id is %d)\n", - sb, - (int)(ORANGEFS_SB(sb)->fs_id)); + "%s: called on sb %p (fs_id is %d)\n", + __func__, + sb, + (int)(ORANGEFS_SB(sb)->fs_id)); new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); if (!new_op) @@ -198,7 +199,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) out_op_release: op_release(new_op); - gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret); + gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); return ret; } @@ -423,8 +424,8 @@ static int orangefs_fill_sb(struct super_block *sb, sb->s_op = &orangefs_s_ops; sb->s_d_op = &orangefs_dentry_operations; - sb->s_blocksize = orangefs_bufmap_size_query(); - sb->s_blocksize_bits = orangefs_bufmap_shift_query(); + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; root_object.khandle = ORANGEFS_SB(sb)->root_khandle; diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 0577d6dba8c8..0729d2645d6a 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -17,8 +17,12 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool); -static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *); +static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, + long timeout, + bool interruptible) + __acquires(op->lock); +static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock); /* * What 
we do in this function is to walk the list of operations that are @@ -246,6 +250,7 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op) */ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock) { /* * handle interrupted cases depending on what state we were in when @@ -313,8 +318,9 @@ static void * Returns with op->lock taken. */ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, - long timeout, - bool interruptible) + long timeout, + bool interruptible) + __acquires(op->lock) { long n; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 17032631c5cf..9384164253ac 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -11,7 +11,7 @@ config OVERLAY_FS For more information see Documentation/filesystems/overlayfs.txt config OVERLAY_FS_REDIRECT_DIR - bool "Overlayfs: turn on redirect dir feature by default" + bool "Overlayfs: turn on redirect directory feature by default" depends on OVERLAY_FS help If this config option is enabled then overlay filesystems will use @@ -46,7 +46,7 @@ config OVERLAY_FS_INDEX depends on OVERLAY_FS help If this config option is enabled then overlay filesystems will use - the inodes index dir to map lower inodes to upper inodes by default. + the index directory to map lower inodes to upper inodes by default. In this case it is still possible to turn off index globally with the "index=off" module option or on a filesystem instance basis with the "index=off" mount option. @@ -66,7 +66,7 @@ config OVERLAY_FS_NFS_EXPORT depends on OVERLAY_FS_INDEX help If this config option is enabled then overlay filesystems will use - the inodes index dir to decode overlay NFS file handles by default. + the index directory to decode overlay NFS file handles by default. 
In this case, it is still possible to turn off NFS export support globally with the "nfs_export=off" module option or on a filesystem instance basis with the "nfs_export=off" mount option. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8bede0742619..ddaddb4ce4c3 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -365,17 +365,14 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) return err; - temp = ovl_lookup_temp(indexdir); + temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0)); + err = PTR_ERR(temp); if (IS_ERR(temp)) - goto temp_err; - - err = ovl_do_mkdir(dir, temp, S_IFDIR, true); - if (err) - goto out; + goto free_name; err = ovl_set_upper_fh(upper, temp); if (err) - goto out_cleanup; + goto out; index = lookup_one_len(name.name, indexdir, name.len); if (IS_ERR(index)) { @@ -384,23 +381,13 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, err = ovl_do_rename(dir, temp, dir, index, 0); dput(index); } - - if (err) - goto out_cleanup; - out: + if (err) + ovl_cleanup(dir, temp); dput(temp); +free_name: kfree(name.name); return err; - -temp_err: - err = PTR_ERR(temp); - temp = NULL; - goto out; - -out_cleanup: - ovl_cleanup(dir, temp); - goto out; } struct ovl_copy_up_ctx { @@ -439,8 +426,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) c->dentry->d_name.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper, - true); + err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper); dput(upper); if (!err) { @@ -470,7 +456,7 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, return PTR_ERR(upper); if (c->tmpfile) - err = ovl_do_link(temp, udir, upper, true); + err = ovl_do_link(temp, udir, upper); else err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0); @@ -481,13 +467,13 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, return err; } 
-static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp) +static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c) { int err; struct dentry *temp; const struct cred *old_creds = NULL; struct cred *new_creds = NULL; - struct cattr cattr = { + struct ovl_cattr cattr = { /* Can't properly set mode on creation because of the umask */ .mode = c->stat.mode & S_IFMT, .rdev = c->stat.rdev, @@ -495,41 +481,24 @@ static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp) }; err = security_inode_copy_up(c->dentry, &new_creds); + temp = ERR_PTR(err); if (err < 0) goto out; if (new_creds) old_creds = override_creds(new_creds); - if (c->tmpfile) { + if (c->tmpfile) temp = ovl_do_tmpfile(c->workdir, c->stat.mode); - if (IS_ERR(temp)) - goto temp_err; - } else { - temp = ovl_lookup_temp(c->workdir); - if (IS_ERR(temp)) - goto temp_err; - - err = ovl_create_real(d_inode(c->workdir), temp, &cattr, - NULL, true); - if (err) { - dput(temp); - goto out; - } - } - err = 0; - *tempp = temp; + else + temp = ovl_create_temp(c->workdir, &cattr); out: if (new_creds) { revert_creds(old_creds); put_cred(new_creds); } - return err; - -temp_err: - err = PTR_ERR(temp); - goto out; + return temp; } static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) @@ -579,21 +548,21 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) struct inode *udir = c->destdir->d_inode; struct inode *inode; struct dentry *newdentry = NULL; - struct dentry *temp = NULL; + struct dentry *temp; int err; - err = ovl_get_tmpfile(c, &temp); - if (err) - goto out; + temp = ovl_get_tmpfile(c); + if (IS_ERR(temp)) + return PTR_ERR(temp); err = ovl_copy_up_inode(c, temp); if (err) - goto out_cleanup; + goto out; if (S_ISDIR(c->stat.mode) && c->indexed) { err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); if (err) - goto out_cleanup; + goto out; } if (c->tmpfile) { @@ -604,7 +573,7 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) err 
= ovl_install_temp(c, temp, &newdentry); } if (err) - goto out_cleanup; + goto out; inode = d_inode(c->dentry); ovl_inode_update(inode, newdentry); @@ -612,13 +581,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); out: + if (err && !c->tmpfile) + ovl_cleanup(d_inode(c->workdir), temp); dput(temp); return err; -out_cleanup: - if (!c->tmpfile) - ovl_cleanup(d_inode(c->workdir), temp); - goto out; } /* diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 839709c7803a..f480b1a2cd2e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -43,7 +43,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -struct dentry *ovl_lookup_temp(struct dentry *workdir) +static struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -114,36 +114,72 @@ kill_whiteout: goto out; } -int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct cattr *attr, struct dentry *hardlink, bool debug) +static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, + umode_t mode) { int err; + struct dentry *d, *dentry = *newdentry; + err = ovl_do_mkdir(dir, dentry, mode); + if (err) + return err; + + if (likely(!d_unhashed(dentry))) + return 0; + + /* + * vfs_mkdir() may succeed and leave the dentry passed + * to it unhashed and negative. If that happens, try to + * lookup a new hashed and positive dentry. 
+ */ + d = lookup_one_len(dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); + if (IS_ERR(d)) { + pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + dentry, err); + return PTR_ERR(d); + } + dput(dentry); + *newdentry = d; + + return 0; +} + +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr) +{ + int err; + + if (IS_ERR(newdentry)) + return newdentry; + + err = -ESTALE; if (newdentry->d_inode) - return -ESTALE; + goto out; - if (hardlink) { - err = ovl_do_link(hardlink, dir, newdentry, debug); + if (attr->hardlink) { + err = ovl_do_link(attr->hardlink, dir, newdentry); } else { switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, attr->mode, debug); + err = ovl_do_create(dir, newdentry, attr->mode); break; case S_IFDIR: - err = ovl_do_mkdir(dir, newdentry, attr->mode, debug); + /* mkdir is special... */ + err = ovl_mkdir_real(dir, &newdentry, attr->mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - err = ovl_do_mknod(dir, newdentry, - attr->mode, attr->rdev, debug); + err = ovl_do_mknod(dir, newdentry, attr->mode, + attr->rdev); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, attr->link, debug); + err = ovl_do_symlink(dir, newdentry, attr->link); break; default: @@ -155,9 +191,20 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, * Not quite sure if non-instantiated dentry is legal or not. * VFS doesn't seem to care so check and warn here. 
*/ - err = -ENOENT; + err = -EIO; } - return err; +out: + if (err) { + dput(newdentry); + return ERR_PTR(err); + } + return newdentry; +} + +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) +{ + return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir), + attr); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -182,24 +229,54 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } -/* Common operations required to be done after creation of file on upper */ -static void ovl_instantiate(struct dentry *dentry, struct inode *inode, - struct dentry *newdentry, bool hardlink) +/* + * Common operations required to be done after creation of file on upper. + * If @hardlink is false, then @inode is a pre-allocated inode, we may or + * may not use to instantiate the new dentry. + */ +static int ovl_instantiate(struct dentry *dentry, struct inode *inode, + struct dentry *newdentry, bool hardlink) { + struct ovl_inode_params oip = { + .upperdentry = newdentry, + .newinode = inode, + }; + ovl_dentry_version_inc(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { - ovl_inode_update(inode, newdentry); - ovl_copyattr(newdentry->d_inode, inode); + /* + * ovl_obtain_alias() can be called after ovl_create_real() + * and before we get here, so we may get an inode from cache + * with the same real upperdentry that is not the inode we + * pre-allocated. In this case we will use the cached inode + * to instantiate the new dentry. + * + * XXX: if we ever use ovl_obtain_alias() to decode directory + * file handles, need to use ovl_get_inode_locked() and + * d_instantiate_new() here to prevent from creating two + * hashed directory inode aliases. 
+ */ + inode = ovl_get_inode(dentry->d_sb, &oip); + if (WARN_ON(IS_ERR(inode))) + return PTR_ERR(inode); } else { WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); dput(newdentry); inc_nlink(inode); } + d_instantiate(dentry, inode); + if (inode != oip.newinode) { + pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + dentry); + } + /* Force lookup of new upper hardlink to find its lower */ if (hardlink) d_drop(dentry); + + return 0; } static bool ovl_type_merge(struct dentry *dentry) @@ -213,38 +290,42 @@ static bool ovl_type_origin(struct dentry *dentry) } static int ovl_create_upper(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink) + struct ovl_cattr *attr) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; struct dentry *newdentry; int err; - if (!hardlink && !IS_POSIXACL(udir)) + if (!attr->hardlink && !IS_POSIXACL(udir)) attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + newdentry = ovl_create_real(udir, + lookup_one_len(dentry->d_name.name, + upperdir, + dentry->d_name.len), + attr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; - err = ovl_create_real(udir, newdentry, attr, hardlink, false); - if (err) - goto out_dput; if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } - ovl_instantiate(dentry, inode, newdentry, !!hardlink); - newdentry = NULL; -out_dput: - dput(newdentry); + err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink); + if (err) + goto out_cleanup; out_unlock: inode_unlock(udir); return err; + +out_cleanup: + ovl_cleanup(udir, newdentry); + dput(newdentry); + goto out_unlock; } static struct dentry *ovl_clear_empty(struct dentry *dentry, @@ -280,16 +361,11 @@ static struct 
dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_lookup_temp(workdir); + opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode)); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_create_real(wdir, opaquedir, - &(struct cattr){.mode = stat.mode}, NULL, true); - if (err) - goto out_dput; - err = ovl_copy_xattr(upper, opaquedir); if (err) goto out_cleanup; @@ -319,7 +395,6 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, out_cleanup: ovl_cleanup(wdir, opaquedir); -out_dput: dput(opaquedir); out_unlock: unlock_rename(workdir, upperdir); @@ -354,8 +429,7 @@ out_free: } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, - struct cattr *cattr, - struct dentry *hardlink) + struct ovl_cattr *cattr) { struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; @@ -365,6 +439,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; struct posix_acl *acl, *default_acl; + bool hardlink = !!cattr->hardlink; if (WARN_ON(!workdir)) return -EROFS; @@ -380,20 +455,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - newdentry = ovl_lookup_temp(workdir); - err = PTR_ERR(newdentry); - if (IS_ERR(newdentry)) - goto out_unlock; - upper = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_dput; + goto out_unlock; - err = ovl_create_real(wdir, newdentry, cattr, hardlink, true); - if (err) - goto out_dput2; + newdentry = ovl_create_temp(workdir, cattr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput; /* * mode could have been mutilated due to umask (e.g. 
sgid directory) @@ -439,12 +510,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } - ovl_instantiate(dentry, inode, newdentry, !!hardlink); - newdentry = NULL; -out_dput2: - dput(upper); + err = ovl_instantiate(dentry, inode, newdentry, hardlink); + if (err) + goto out_cleanup; out_dput: - dput(newdentry); + dput(upper); out_unlock: unlock_rename(workdir, upperdir); out: @@ -456,12 +526,12 @@ out: out_cleanup: ovl_cleanup(wdir, newdentry); - goto out_dput2; + dput(newdentry); + goto out_dput; } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink, - bool origin) + struct ovl_cattr *attr, bool origin) { int err; const struct cred *old_cred; @@ -489,7 +559,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (override_cred) { override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; - if (!hardlink) { + if (!attr->hardlink) { err = security_dentry_create_files_as(dentry, attr->mode, &dentry->d_name, old_cred, override_cred); @@ -502,21 +572,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, put_cred(override_cred); if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr, - hardlink); + err = ovl_create_upper(dentry, inode, attr); else - err = ovl_create_over_whiteout(dentry, inode, attr, - hardlink); + err = ovl_create_over_whiteout(dentry, inode, attr); } out_revert_creds: revert_creds(old_cred); - if (!err) { - struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); - - WARN_ON(inode->i_mode != realinode->i_mode); - WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid)); - WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid)); - } return err; } @@ -525,7 +586,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, { int err; struct inode *inode; - struct cattr attr = { + struct ovl_cattr attr = { .rdev = rdev, .link = 
link, }; @@ -534,6 +595,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, if (err) goto out; + /* Preallocate inode to be used by ovl_get_inode() */ err = -ENOMEM; inode = ovl_new_inode(dentry->d_sb, mode, rdev); if (!inode) @@ -542,8 +604,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode_init_owner(inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &attr, NULL, false); - if (err) + err = ovl_create_or_link(dentry, inode, &attr, false); + /* Did we end up using the preallocated inode? */ + if (inode != d_inode(dentry)) iput(inode); out_drop_write: @@ -601,8 +664,9 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old), - ovl_type_origin(old)); + err = ovl_create_or_link(new, inode, + &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)}, + ovl_type_origin(old)); if (err) iput(inode); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 425a94672300..9941ece61a14 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -300,12 +300,18 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, struct dentry *dentry; struct inode *inode; struct ovl_entry *oe; + struct ovl_inode_params oip = { + .lowerpath = lowerpath, + .index = index, + .numlower = !!lower + }; /* We get overlay directory dentries with ovl_lookup_real() */ if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); - inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower); + oip.upperdentry = dget(upper); + inode = ovl_get_inode(sb, &oip); if (IS_ERR(inode)) { dput(upper); return ERR_CAST(inode); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6e3815fb006b..ed16a898caeb 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -416,7 +416,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) 
return err; } -int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) { if (flags & S_ATIME) { struct ovl_fs *ofs = inode->i_sb->s_fs_info; @@ -749,15 +749,26 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, return true; } -struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct ovl_path *lowerpath, struct dentry *index, - unsigned int numlower) +static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, + struct inode *key) { + return newinode ? inode_insert5(newinode, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key) : + iget5_locked(sb, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key); +} + +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip) +{ + struct dentry *upperdentry = oip->upperdentry; + struct ovl_path *lowerpath = oip->lowerpath; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; - bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); - int fsid = bylower ? lowerpath->layer->fsid : 0; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, + oip->index); + int fsid = bylower ? oip->lowerpath->layer->fsid : 0; bool is_dir; unsigned long ino = 0; @@ -774,8 +785,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, upperdentry); unsigned int nlink = is_dir ? 
1 : realinode->i_nlink; - inode = iget5_locked(sb, (unsigned long) key, - ovl_inode_test, ovl_inode_set, key); + inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_nomem; if (!(inode->i_state & I_NEW)) { @@ -811,12 +821,12 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); - if (index) + if (oip->index) ovl_set_flag(OVL_INDEX, inode); /* Check for non-merge dir that may have whiteouts */ if (is_dir) { - if (((upperdentry && lowerdentry) || numlower > 1) || + if (((upperdentry && lowerdentry) || oip->numlower > 1) || ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { ovl_set_flag(OVL_WHITEOUTS, inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 2dba29eadde6..c993dd8db739 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -612,7 +612,7 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kzalloc(fh->len * 2, GFP_KERNEL); + n = kcalloc(fh->len, 2, GFP_KERNEL); if (!n) return -ENOMEM; @@ -1004,8 +1004,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index, - ctr); + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = stack, + .index = index, + .numlower = ctr, + }; + + inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e0b7de799f6b..7538b9b56237 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -86,6 +86,7 @@ struct ovl_fh { static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } @@ -93,56 +94,52 @@ static inline int ovl_do_rmdir(struct inode *dir, 
struct dentry *dentry) static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) { int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool debug) + struct dentry *new_dentry) { int err = vfs_link(old_dentry, dir, new_dentry, NULL); - if (debug) { - pr_debug("link(%pd2, %pd2) = %i\n", - old_dentry, new_dentry, err); - } + + pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool debug) + umode_t mode) { int err = vfs_create(dir, dentry, mode, true); - if (debug) - pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode, bool debug) + umode_t mode) { int err = vfs_mkdir(dir, dentry, mode); - if (debug) - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev, bool debug) + umode_t mode, dev_t dev) { int err = vfs_mknod(dir, dentry, mode, dev); - if (debug) { - pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", - dentry, mode, dev, err); - } + + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, - const char *oldname, bool debug) + const char *oldname) { int err = vfs_symlink(dir, dentry, oldname); - if (debug) - pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } @@ -168,11 +165,8 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, { 
int err; - pr_debug("rename(%pd2, %pd2, 0x%x)\n", - olddentry, newdentry, flags); - + pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); - if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); @@ -331,15 +325,21 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); -int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(const char *name); +struct ovl_inode_params { + struct inode *newinode; + struct dentry *upperdentry; + struct ovl_path *lowerpath; + struct dentry *index; + unsigned int numlower; +}; struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); -struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct ovl_path *lowerpath, struct dentry *index, - unsigned int numlower); +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip); static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; @@ -352,18 +352,21 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -struct dentry *ovl_lookup_temp(struct dentry *workdir); int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, struct dentry *dentry); -struct cattr { +struct ovl_cattr { dev_t rdev; umode_t mode; const char *link; + struct dentry *hardlink; }; -int ovl_create_real(struct inode *dir, struct dentry *newdentry, - 
struct cattr *attr, - struct dentry *hardlink, bool debug); + +#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) + +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); int ovl_cleanup(struct inode *dir, struct dentry *dentry); +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e8551c97de51..704b37311467 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -611,11 +611,10 @@ retry: goto retry; } - err = ovl_create_real(dir, work, - &(struct cattr){.mode = S_IFDIR | 0}, - NULL, true); - if (err) - goto out_dput; + work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode)); + err = PTR_ERR(work); + if (IS_ERR(work)) + goto out_err; /* * Try to remove POSIX ACL xattrs from workdir. We are good if: diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 1ade1206bb89..0eaeb41453f5 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -43,6 +43,21 @@ config PROC_VMCORE help Exports the dump image of crashed kernel in ELF format. +config PROC_VMCORE_DEVICE_DUMP + bool "Device Hardware/Firmware Log Collection" + depends on PROC_VMCORE + default n + help + After kernel panic, device drivers can collect the device + specific snapshot of their hardware or firmware before the + underlying devices are initialized in crash recovery kernel. + Note that the device driver must be present in the crash + recovery kernel's initramfs to collect its underlying device + snapshot. + + If you say Y here, the collected device dumps will be added + as ELF notes to /proc/vmcore. 
+ config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/array.c b/fs/proc/array.c index ae2c807fd719..0ceb3b6b37e7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -85,6 +85,7 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/string_helpers.h> @@ -95,22 +96,29 @@ #include <asm/processor.h> #include "internal.h" -static inline void task_name(struct seq_file *m, struct task_struct *p) +void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char *buf; size_t size; - char tcomm[sizeof(p->comm)]; + char tcomm[64]; int ret; - get_task_comm(tcomm, p); - - seq_puts(m, "Name:\t"); + if (p->flags & PF_WQ_WORKER) + wq_worker_comm(tcomm, sizeof(tcomm), p); + else + __get_task_comm(tcomm, sizeof(tcomm), p); size = seq_get_buf(m, &buf); - ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); - seq_commit(m, ret < size ? 
ret : -1); + if (escape) { + ret = string_escape_str(tcomm, buf, size, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + if (ret >= size) + ret = -1; + } else { + ret = strscpy(buf, tcomm, size); + } - seq_putc(m, '\n'); + seq_commit(m, ret); } /* @@ -260,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; - unsigned long qsize = 0; + unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); @@ -335,6 +343,30 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { + case -EINVAL: + seq_printf(m, "unknown"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_printf(m, "not vulnerable"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_printf(m, "thread force mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_printf(m, "thread mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_printf(m, "thread vulnerable"); + break; + case PR_SPEC_DISABLE: + seq_printf(m, "globally mitigated"); + break; + default: + seq_printf(m, "vulnerable"); + break; + } seq_putc(m, '\n'); } @@ -365,7 +397,10 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, { struct mm_struct *mm = get_task_mm(task); - task_name(m, task); + seq_puts(m, "Name:\t"); + proc_task_name(m, task, true); + seq_putc(m, '\n'); + task_state(m, ns, pid, task); if (mm) { @@ -400,7 +435,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, u64 cutime, cstime, utime, stime; u64 cgtime, gtime; unsigned long rsslim = 0; - char tcomm[sizeof(task->comm)]; unsigned long flags; state = *get_task_state(task); @@ -427,8 +461,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } } - 
get_task_comm(tcomm, task); - sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = 0; @@ -495,7 +527,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); - seq_puts(m, tcomm); + proc_task_name(m, task, false); seq_puts(m, ") "); seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); @@ -677,25 +709,22 @@ out: static int children_seq_show(struct seq_file *seq, void *v) { - struct inode *inode = seq->private; - pid_t pid; - - pid = pid_nr_ns(v, inode->i_sb->s_fs_info); - seq_printf(seq, "%d ", pid); + struct inode *inode = file_inode(seq->file); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) { - return get_children_pid(seq->private, NULL, *pos); + return get_children_pid(file_inode(seq->file), NULL, *pos); } static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pid *pid; - pid = get_children_pid(seq->private, v, *pos + 1); + pid = get_children_pid(file_inode(seq->file), v, *pos + 1); put_pid(v); ++*pos; @@ -716,17 +745,7 @@ static const struct seq_operations children_seq_ops = { static int children_seq_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - - ret = seq_open(file, &children_seq_ops); - if (ret) - return ret; - - m = file->private_data; - m->private = inode; - - return ret; + return seq_open(file, &children_seq_ops); } const struct file_operations proc_tid_children_operations = { diff --git a/fs/proc/base.c b/fs/proc/base.c index 1a76d751cf3c..aaffc0c30216 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -205,171 +205,147 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, - size_t _count, loff_t *pos) +static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + size_t 
count, loff_t *ppos) { - struct task_struct *tsk; - struct mm_struct *mm; - char *page; - unsigned long count = _count; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long len1, len2, len; - unsigned long p; - char c; - ssize_t rv; - - BUG_ON(*pos < 0); + unsigned long pos, len; + char *page; - tsk = get_proc_task(file_inode(file)); - if (!tsk) - return -ESRCH; - mm = get_task_mm(tsk); - put_task_struct(tsk); - if (!mm) - return 0; /* Check if process spawned far enough to have cmdline. */ - if (!mm->env_end) { - rv = 0; - goto out_mmput; - } - - page = (char *)__get_free_page(GFP_KERNEL); - if (!page) { - rv = -ENOMEM; - goto out_mmput; - } + if (!mm->env_end) + return 0; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); - BUG_ON(arg_start > arg_end); - BUG_ON(env_start > env_end); - - len1 = arg_end - arg_start; - len2 = env_end - env_start; + if (arg_start >= arg_end) + return 0; - /* Empty ARGV. */ - if (len1 == 0) { - rv = 0; - goto out_free_page; - } /* - * Inherently racy -- command line shares address space - * with code and data. + * We have traditionally allowed the user to re-write + * the argument strings and overflow the end result + * into the environment section. But only do that if + * the environment area is contiguous to the arguments. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); - if (rv <= 0) - goto out_free_page; - - rv = 0; - - if (c == '\0') { - /* Command line (set of strings) occupies whole ARGV. 
*/ - if (len1 <= *pos) - goto out_free_page; - - p = arg_start + *pos; - len = len1 - *pos; - while (count > 0 && len > 0) { - unsigned int _count; - int nr_read; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } + if (env_start != arg_end || env_start >= env_end) + env_start = env_end = arg_end; + + /* .. and limit it to a maximum of one page of slop */ + if (env_end >= arg_end + PAGE_SIZE) + env_end = arg_end + PAGE_SIZE - 1; + + /* We're not going to care if "*ppos" has high bits set */ + pos = arg_start + *ppos; + + /* .. but we do check the result is in the proper range */ + if (pos < arg_start || pos >= env_end) + return 0; + + /* .. and we never go past env_end */ + if (env_end - pos < count) + count = env_end - pos; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + len = 0; + while (count) { + int got; + size_t size = min_t(size_t, PAGE_SIZE, count); + long offset; - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - } - } else { /* - * Command line (1 string) occupies ARGV and - * extends into ENVP. + * Are we already starting past the official end? + * We always include the last byte that is *supposed* + * to be NUL */ - struct { - unsigned long p; - unsigned long len; - } cmdline[2] = { - { .p = arg_start, .len = len1 }, - { .p = env_start, .len = len2 }, - }; - loff_t pos1 = *pos; - unsigned int i; + offset = (pos >= arg_end) ? 
pos - arg_end + 1 : 0; + + got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON); + if (got <= offset) + break; + got -= offset; + + /* Don't walk past a NUL character once you hit arg_end */ + if (pos + got >= arg_end) { + int n = 0; - i = 0; - while (i < 2 && pos1 >= cmdline[i].len) { - pos1 -= cmdline[i].len; - i++; + /* + * If we started before 'arg_end' but ended up + * at or after it, we start the NUL character + * check at arg_end-1 (where we expect the normal + * EOF to be). + * + * NOTE! This is smaller than 'got', because + * pos + got >= arg_end + */ + if (pos < arg_end) + n = arg_end - pos - 1; + + /* Cut off at first NUL after 'n' */ + got = n + strnlen(page+n, offset+got-n); + if (got < offset) + break; + got -= offset; + + /* Include the NUL if it existed */ + if (got < size) + got++; } - while (i < 2) { - p = cmdline[i].p + pos1; - len = cmdline[i].len - pos1; - while (count > 0 && len > 0) { - unsigned int _count, l; - int nr_read; - bool final; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - /* - * Command line can be shorter than whole ARGV - * even if last "marker" byte says it is not. - */ - final = false; - l = strnlen(page, nr_read); - if (l < nr_read) { - nr_read = l; - final = true; - } - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } - - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - - if (final) - goto out_free_page; - } - /* Only first chunk can be read partially. 
*/ - pos1 = 0; - i++; + got -= copy_to_user(buf, page+offset, got); + if (unlikely(!got)) { + if (!len) + len = -EFAULT; + break; } + pos += got; + buf += got; + len += got; + count -= got; } -out_free_page: free_page((unsigned long)page); -out_mmput: + return len; +} + +static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, + size_t count, loff_t *pos) +{ + struct mm_struct *mm; + ssize_t ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); - if (rv > 0) - *pos += rv; - return rv; + return ret; +} + +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct task_struct *tsk; + ssize_t ret; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + ret = get_task_cmdline(tsk, buf, count, pos); + put_task_struct(tsk); + if (ret > 0) + *pos += ret; + return ret; } static const struct file_operations proc_pid_cmdline_ops = { @@ -430,9 +406,9 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct stack_trace trace; unsigned long *entries; int err; - int i; - entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), + GFP_KERNEL); if (!entries) return -ENOMEM; @@ -443,6 +419,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, err = lock_trace(task); if (!err) { + unsigned int i; + save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { @@ -698,7 +676,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, static int proc_pid_permission(struct inode *inode, int mask) { - struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; bool has_perms; @@ -733,13 +711,11 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct 
seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns; - struct pid *pid; + struct pid_namespace *ns = proc_pid_ns(inode); + struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; - ns = inode->i_sb->s_fs_info; - pid = proc_pid(inode); task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -929,10 +905,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; @@ -1410,7 +1386,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); struct task_struct *p; p = get_proc_task(inode); @@ -1565,9 +1541,8 @@ static int comm_show(struct seq_file *m, void *v) if (!p) return -ESRCH; - task_lock(p); - seq_printf(m, "%s\n", p->comm); - task_unlock(p); + proc_task_name(m, p, false); + seq_putc(m, '\n'); put_task_struct(p); @@ -1782,14 +1757,14 @@ int pid_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; - struct pid_namespace *pid = path->dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); - rcu_read_lock(); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { @@ -1809,15 +1784,22 @@ int pid_getattr(const struct path *path, struct kstat *stat, /* dentry stuff */ /* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. 
In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - * + * Set <pid>/... inode ownership (can change due to setuid(), etc.) + */ +void pid_update_inode(struct task_struct *task, struct inode *inode) +{ + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); +} + +/* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ -int pid_revalidate(struct dentry *dentry, unsigned int flags) +static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -1829,10 +1811,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) task = get_proc_task(inode); if (task) { - task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); - - inode->i_mode &= ~(S_ISUID | S_ISGID); - security_task_to_inode(task, inode); + pid_update_inode(task, inode); put_task_struct(task); return 1; } @@ -1874,14 +1853,14 @@ const struct dentry_operations pid_dentry_operations = * by stat. 
*/ bool proc_fill_cache(struct file *file, struct dir_context *ctx, - const char *name, int len, + const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - unsigned type; - ino_t ino; + unsigned type = DT_UNKNOWN; + ino_t ino = 1; child = d_hash_and_lookup(dir, &qname); if (!child) { @@ -1890,11 +1869,14 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { - int err = instantiate(d_inode(dir), child, task, ptr); + struct dentry *res; + res = instantiate(child, task, ptr); d_lookup_done(child); - if (err < 0) { + if (unlikely(res)) { dput(child); - goto end_instantiate; + child = res; + if (IS_ERR(child)) + goto end_instantiate; } } } @@ -1902,10 +1884,8 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); - return dir_emit(ctx, name, len, ino, type); - end_instantiate: - return dir_emit(ctx, name, len, 1, DT_UNKNOWN); + return dir_emit(ctx, name, len, ino, type); } /* @@ -2067,19 +2047,19 @@ static const struct inode_operations proc_map_files_link_inode_operations = { .setattr = proc_setattr, }; -static int -proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, +static struct dentry * +proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? 
S_IWUSR : 0)); if (!inode) - return -ENOENT; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; @@ -2088,9 +2068,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_size = 64; d_set_d_op(dentry, &tid_map_files_dentry_operations); - d_add(dentry, inode); - - return 0; + return d_splice_alias(inode, dentry); } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -2099,19 +2077,19 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; - int result; + struct dentry *result; struct mm_struct *mm; - result = -ENOENT; + result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; - result = -EACCES; + result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; - result = -ENOENT; + result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; @@ -2125,7 +2103,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out_no_vma; if (vma->vm_file) - result = proc_map_files_instantiate(dir, dentry, task, + result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: @@ -2134,7 +2112,7 @@ out_no_vma: out_put_task: put_task_struct(task); out: - return ERR_PTR(result); + return result; } static const struct inode_operations proc_map_files_inode_operations = { @@ -2337,7 +2315,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = inode->i_sb->s_fs_info; + tp->ns = proc_pid_ns(inode); return 0; } @@ -2435,16 +2413,16 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = { .release = single_release, }; -static int proc_pident_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry 
*proc_pident_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, p->mode); + inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) @@ -2454,13 +2432,9 @@ static int proc_pident_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2468,11 +2442,9 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - int error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -2485,17 +2457,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) + if (!memcmp(dentry->d_name.name, p->name, p->len)) { + res = proc_pident_instantiate(dentry, task, p); break; + } } - if (p >= last) - goto out; - - error = proc_pident_instantiate(dir, dentry, task, p); -out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, @@ -3138,38 +3107,32 @@ void proc_flush_task(struct task_struct *task) } } -static int proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, +static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct 
*task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) @@ -3184,10 +3147,10 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign if (!task) goto out; - result = proc_pid_instantiate(dir, dentry, task, NULL); + result = proc_pid_instantiate(dentry, task, NULL); put_task_struct(task); out: - return ERR_PTR(result); + return result; } /* @@ -3239,7 +3202,7 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(file_inode(file)); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -3263,7 +3226,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; - int len; + unsigned int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) @@ -3435,37 +3398,32 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static 
int proc_task_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_task_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); - + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); + inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; - inode->i_flags|=S_IMMUTABLE; + inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; @@ -3485,13 +3443,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!same_thread_group(leader, task)) goto out_drop_task; - result = proc_task_instantiate(dir, dentry, task, NULL); + result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: - return ERR_PTR(result); + return result; } /* @@ -3588,14 +3546,14 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. 
*/ - ns = inode->i_sb->s_fs_info; + ns = proc_pid_ns(inode); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; - int len; + unsigned int len; tid = task_pid_nr_ns(task, ns); len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 8233e7af9389..fa762c5fbcb2 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -11,21 +11,9 @@ static int cmdline_proc_show(struct seq_file *m, void *v) return 0; } -static int cmdline_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cmdline_proc_show, NULL); -} - -static const struct file_operations cmdline_proc_fops = { - .open = cmdline_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_cmdline_init(void) { - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + proc_create_single("cmdline", 0, NULL, cmdline_proc_show); return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index a8ac48aebd59..954caf0b7fee 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -91,21 +91,9 @@ static const struct seq_operations consoles_op = { .show = show_console_dev }; -static int consoles_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &consoles_op); -} - -static const struct file_operations proc_consoles_operations = { - .open = consoles_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_consoles_init(void) { - proc_create("consoles", 0, NULL, &proc_consoles_operations); + proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 2c7f22b14489..37d38697eaf8 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -51,21 +51,9 
@@ static const struct seq_operations devinfo_ops = { .show = devinfo_show }; -static int devinfo_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &devinfo_ops); -} - -static const struct file_operations proc_devinfo_operations = { - .open = devinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_devices_init(void) { - proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create_seq("devices", 0, NULL, &devinfo_ops); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6b80cd1e419a..81882a13212d 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -81,9 +81,41 @@ static const struct file_operations proc_fdinfo_file_operations = { .release = single_release, }; +static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) +{ + struct files_struct *files = get_files_struct(task); + struct file *file; + + if (!files) + return false; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) + *mode = file->f_mode; + rcu_read_unlock(); + put_files_struct(files); + return !!file; +} + +static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, + fmode_t f_mode) +{ + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + security_task_to_inode(task, inode); +} + static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { - struct files_struct *files; struct task_struct *task; struct inode *inode; unsigned int fd; @@ -96,35 +128,11 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) fd = proc_fd(inode); if (task) { - files = get_files_struct(task); - if (files) { - struct file *file; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - unsigned 
f_mode = file->f_mode; - - rcu_read_unlock(); - put_files_struct(files); - - task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); - - if (S_ISLNK(inode->i_mode)) { - unsigned i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; - } - - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } - rcu_read_unlock(); - put_files_struct(files); + fmode_t f_mode; + if (tid_fd_mode(task, fd, &f_mode)) { + tid_fd_update_inode(task, inode, f_mode); + put_task_struct(task); + return 1; } put_task_struct(task); } @@ -166,34 +174,33 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return ret; } -static int -proc_fd_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +struct fd_data { + fmode_t mode; + unsigned fd; +}; + +static struct dentry *proc_fd_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; + tid_fd_update_inode(task, inode, data->mode); d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -201,19 +208,21 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, instantiate_t instantiate) { struct task_struct *task = 
get_proc_task(dir); - int result = -ENOENT; - unsigned fd = name_to_int(&dentry->d_name); + struct fd_data data = {.fd = name_to_int(&dentry->d_name)}; + struct dentry *result = ERR_PTR(-ENOENT); if (!task) goto out_no_task; - if (fd == ~0U) + if (data.fd == ~0U) + goto out; + if (!tid_fd_mode(task, data.fd, &data.mode)) goto out; - result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); + result = instantiate(dentry, task, &data); out: put_task_struct(task); out_no_task: - return ERR_PTR(result); + return result; } static int proc_readfd_common(struct file *file, struct dir_context *ctx, @@ -236,17 +245,22 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { + struct file *f; + struct fd_data data; char name[10 + 1]; - int len; + unsigned int len; - if (!fcheck_files(files, fd)) + f = fcheck_files(files, fd); + if (!f) continue; + data.mode = f->f_mode; rcu_read_unlock(); + data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, - (void *)(unsigned long)fd)) + &data)) goto out_fd_loop; cond_resched(); rcu_read_lock(); @@ -304,31 +318,25 @@ const struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static int -proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, - struct task_struct *task, const void *ptr) +static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { - unsigned fd = (unsigned long)ptr; + const struct fd_data *data = ptr; struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); - ei->fd = fd; + ei->fd = data->fd; inode->i_fop = &proc_fdinfo_file_operations; + tid_fd_update_inode(task, 
inode, 0); d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - return 0; - out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static struct dentry * diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2078e70e1595..bb1c1625b158 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> #include "internal.h" @@ -256,8 +257,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); d_set_d_op(dentry, &proc_misc_dentry_ops); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -346,13 +346,12 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; -static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +/* returns the registered entry, or frees dp and returns NULL on failure */ +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp) { - int ret; - - ret = proc_alloc_inum(&dp->low_ino); - if (ret) - return ret; + if (proc_alloc_inum(&dp->low_ino)) + goto out_free_entry; write_lock(&proc_subdir_lock); dp->parent = dir; @@ -360,12 +359,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); - proc_free_inum(dp->low_ino); - return -EEXIST; + goto out_free_inum; } write_unlock(&proc_subdir_lock); - return 0; + return dp; +out_free_inum: + proc_free_inum(dp->low_ino); +out_free_entry: + pde_free(dp); + return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, @@ -406,7 +409,7 @@ 
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, if (!ent) goto out; - if (qstr.len + 1 <= sizeof(ent->inline_name)) { + if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) { ent->name = ent->inline_name; } else { ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); @@ -443,10 +446,7 @@ struct proc_dir_entry *proc_symlink(const char *name, if (ent->data) { strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; - if (proc_register(parent, ent) < 0) { - pde_free(ent); - ent = NULL; - } + ent = proc_register(parent, ent); } else { pde_free(ent); ent = NULL; @@ -470,11 +470,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_fops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } @@ -505,47 +503,47 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_fops = NULL; ent->proc_iops = NULL; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } EXPORT_SYMBOL(proc_create_mount_point); -struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops, - void *data) +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data) { - struct proc_dir_entry *pde; + struct proc_dir_entry *p; + if ((mode & S_IFMT) == 0) mode |= S_IFREG; - - if (!S_ISREG(mode)) { - WARN_ON(1); /* use proc_mkdir() */ + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + + p = __proc_create(parent, name, mode, 1); + if (p) { + p->proc_iops = &proc_file_inode_operations; + p->data = data; } + return p; +} + +struct 
proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, void *data) +{ + struct proc_dir_entry *p; BUG_ON(proc_fops == NULL); - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - pde = __proc_create(&parent, name, mode, 1); - if (!pde) - goto out; - pde->proc_fops = proc_fops; - pde->data = data; - pde->proc_iops = &proc_file_inode_operations; - if (proc_register(parent, pde) < 0) - goto out_free; - return pde; -out_free: - pde_free(pde); -out: - return NULL; + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = proc_fops; + return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -557,6 +555,76 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, } EXPORT_SYMBOL(proc_create); +static int proc_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_open_private(file, de->seq_ops, de->state_size); + return seq_open(file, de->seq_ops); +} + +static int proc_seq_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_release_private(inode, file); + return seq_release(inode, file); +} + +static const struct file_operations proc_seq_fops = { + .open = proc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_seq_release, +}; + +struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_seq_private); + +static int proc_single_open(struct inode *inode, struct file *file) +{ + struct 
proc_dir_entry *de = PDE(inode); + + return single_open(file, de->single_show, de->data); +} + +static const struct file_operations proc_single_fops = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_single_data); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; @@ -681,3 +749,27 @@ void *PDE_DATA(const struct inode *inode) return __PDE_DATA(inode); } EXPORT_SYMBOL(PDE_DATA); + +/* + * Pull a user buffer into memory and pass it to the file's write handler if + * one is supplied. The ->write() method is permitted to modify the + * kernel-side buffer. + */ +ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, + loff_t *_pos) +{ + struct proc_dir_entry *pde = PDE(file_inode(f)); + char *buf; + int ret; + + if (!pde->write) + return -EACCES; + if (size == 0 || size > PAGE_SIZE - 1) + return -EINVAL; + buf = memdup_user_nul(ubuf, size); + if (IS_ERR(buf)) + return PTR_ERR(buf); + ret = pde->write(f, buf, size); + kfree(buf); + return ret == 0 ? 
size : ret; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cf3b74391ca..85ffbd27f288 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -105,9 +105,8 @@ void __init proc_init_kmemcache(void) kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( - "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, - offsetof(struct proc_dir_entry, inline_name), - sizeof_field(struct proc_dir_entry, inline_name), NULL); + "proc_dir_entry", SIZEOF_PDE_SLOT, 0, SLAB_PANIC, + OFFSETOF_PDE_NAME, SIZEOF_PDE_INLINE_NAME, NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f1692e63cb6..da3dbfa09e79 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,7 +44,13 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; + proc_write_t write; void *data; + unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; @@ -56,14 +62,20 @@ struct proc_dir_entry { char *name; umode_t mode; u8 namelen; -#ifdef CONFIG_64BIT -#define SIZEOF_PDE_INLINE_NAME (192-139) -#else -#define SIZEOF_PDE_INLINE_NAME (128-87) -#endif - char inline_name[SIZEOF_PDE_INLINE_NAME]; + char inline_name[]; } __randomize_layout; +#define OFFSETOF_PDE_NAME offsetof(struct proc_dir_entry, inline_name) +#define SIZEOF_PDE_SLOT \ + (OFFSETOF_PDE_NAME + 34 <= 64 ? 64 : \ + OFFSETOF_PDE_NAME + 34 <= 128 ? 128 : \ + OFFSETOF_PDE_NAME + 34 <= 192 ? 192 : \ + OFFSETOF_PDE_NAME + 34 <= 256 ? 256 : \ + OFFSETOF_PDE_NAME + 34 <= 512 ? 
512 : \ + 0) + +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE_SLOT - OFFSETOF_PDE_NAME) + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -131,6 +143,8 @@ unsigned name_to_int(const struct qstr *qstr); */ extern const struct file_operations proc_tid_children_operations; +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, @@ -147,21 +161,25 @@ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); -extern int pid_revalidate(struct dentry *, unsigned int); +extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ -typedef int instantiate_t(struct inode *, struct dentry *, +typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); -extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry 
*proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); @@ -178,6 +196,7 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } +extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 6a6bee9c603c..cb0edc7cbf09 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -34,21 +34,9 @@ static const struct seq_operations int_seq_ops = { .show = show_interrupts }; -static int interrupts_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &int_seq_ops); -} - -static const struct file_operations proc_interrupts_operations = { - .open = interrupts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_interrupts_init(void) { - proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + proc_create_seq("interrupts", 0, NULL, &int_seq_ops); return 0; } fs_initcall(proc_interrupts_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index b572cc865b92..d06694757201 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -28,21 +28,9 @@ static int loadavg_proc_show(struct seq_file *m, void *v) return 0; } -static int loadavg_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, loadavg_proc_show, NULL); -} - -static const struct file_operations loadavg_proc_fops = { - .open = loadavg_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_loadavg_init(void) { - proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + proc_create_single("loadavg", 0, NULL, loadavg_proc_show); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 65a72ab57471..2fb04846ed11 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ 
-149,21 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) return 0; } -static int meminfo_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, meminfo_proc_show, NULL); -} - -static const struct file_operations meminfo_proc_fops = { - .open = meminfo_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_meminfo_init(void) { - proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + proc_create_single("meminfo", 0, NULL, meminfo_proc_show); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 59b17e509f46..dd2b35f78b09 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -87,28 +87,24 @@ static const struct inode_operations proc_ns_link_inode_operations = { .setattr = proc_setattr, }; -static int proc_ns_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static struct dentry *proc_ns_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) - goto out; + return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; + pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, 0)) - return 0; -out: - return -ENOENT; + return d_splice_alias(inode, dentry); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) @@ -147,12 +143,10 @@ const struct file_operations proc_ns_dir_operations = { static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int 
flags) { - int error; struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; - - error = -ENOENT; + struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; @@ -167,11 +161,11 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (entry == last) goto out; - error = proc_ns_instantiate(dir, dentry, task, *entry); + res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: - return ERR_PTR(error); + return res; } const struct inode_operations proc_ns_dir_inode_operations = { diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 75634379f82e..3b63be64e436 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -113,21 +113,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = { .show = nommu_region_list_show }; -static int proc_nommu_region_list_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &proc_nommu_region_list_seqop); -} - -static const struct file_operations proc_nommu_region_list_operations = { - .open = proc_nommu_region_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop); return 0; } diff --git a/fs/proc/page.c b/fs/proc/page.c index 1491918a33c3..792c78a49174 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (PageTable(page)) + u |= 1 << KPF_PGTABLE; if (page_is_idle(page)) u |= 1 << KPF_IDLE; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1763f370489d..d5e0fcb3439e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,20 +38,23 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -int 
seq_open_net(struct inode *ino, struct file *f, - const struct seq_operations *ops, int size) +static int seq_open_net(struct inode *inode, struct file *file) { - struct net *net; + unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; + struct net *net; + + WARN_ON_ONCE(state_size < sizeof(*p)); - BUG_ON(size < sizeof(*p)); + if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) + return -EACCES; - net = get_proc_net(ino); - if (net == NULL) + net = get_proc_net(inode); + if (!net) return -ENXIO; - p = __seq_open_private(f, ops, size); - if (p == NULL) { + p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); + if (!p) { put_net(net); return -ENOMEM; } @@ -60,51 +63,172 @@ int seq_open_net(struct inode *ino, struct file *f, #endif return 0; } -EXPORT_SYMBOL_GPL(seq_open_net); -int single_open_net(struct inode *inode, struct file *file, - int (*show)(struct seq_file *, void *)) +static int seq_release_net(struct inode *ino, struct file *f) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, show, net); - if (err < 0) - goto err_open; + struct seq_file *seq = f->private_data; + put_net(seq_file_net(seq)); + seq_release_private(ino, f); return 0; +} -err_open: - put_net(net); -err_net: - return err; +static const struct file_operations proc_net_seq_fops = { + .open = seq_open_net, + .read = seq_read, + .write = proc_simple_write, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); } -EXPORT_SYMBOL_GPL(single_open_net); 
+EXPORT_SYMBOL_GPL(proc_create_net_data); +/** + * proc_create_net_data_write - Create a writable net_ns-specific proc file + * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @ops: The seq_file ops with which to read the file. + * @write: The write method with which to 'modify' the file. + * @data: Data for retrieval by PDE_DATA(). + * + * Create a network namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * series of elements and also provides for the file accepting writes that have + * some arbitrary effect. + * + * The functions in the @ops table are used to iterate over items to be + * presented and extract the readable content using the seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * PDE_DATA() on the file inode. The network namespace must be accessed by + * calling seq_file_net() on the seq_file struct. 
+ */ +struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *ops, + proc_write_t write, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_write); -int seq_release_net(struct inode *ino, struct file *f) +static int single_open_net(struct inode *inode, struct file *file) { - struct seq_file *seq; + struct proc_dir_entry *de = PDE(inode); + struct net *net; + int err; - seq = f->private_data; + net = get_proc_net(inode); + if (!net) + return -ENXIO; - put_net(seq_file_net(seq)); - seq_release_private(ino, f); - return 0; + err = single_open(file, de->single_show, net); + if (err) + put_net(net); + return err; } -EXPORT_SYMBOL_GPL(seq_release_net); -int single_release_net(struct inode *ino, struct file *f) +static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } -EXPORT_SYMBOL_GPL(single_release_net); + +static const struct file_operations proc_net_single_fops = { + .open = single_open_net, + .read = seq_read, + .write = proc_simple_write, + .llseek = seq_lseek, + .release = single_release_net, +}; + +struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single); + +/** + * proc_create_net_single_write - Create a writable net_ns-specific proc file 
+ * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @show: The seqfile show method with which to read the file. + * @write: The write method with which to 'modify' the file. + * @data: Data for retrieval by PDE_DATA(). + * + * Create a network-namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * single element rather than a series and also provides for the file accepting + * writes that have some arbitrary effect. + * + * The @show function is called to extract the readable content via the + * seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * PDE_DATA() on the file inode. The network namespace must be accessed by + * calling seq_file_single_net() on the seq_file struct. 
+ */ +struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), + proc_write_t write, + void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_single_fops; + p->single_show = show; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8989936f2995..89921a0d2ebb 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -554,9 +554,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, if (!inode) goto out; - err = NULL; d_set_d_op(dentry, &proc_sys_dentry_operations); - d_add(dentry, inode); + err = d_splice_alias(inode, dentry); out: if (h) @@ -684,6 +683,7 @@ static bool proc_sys_fill_cache(struct file *file, if (IS_ERR(child)) return false; if (d_in_lookup(child)) { + struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { d_lookup_done(child); @@ -691,7 +691,16 @@ static bool proc_sys_fill_cache(struct file *file, return false; } d_set_d_op(child, &proc_sys_dentry_operations); - d_add(child, inode); + res = d_splice_alias(inode, child); + d_lookup_done(child); + if (unlikely(res)) { + if (IS_ERR(res)) { + dput(child); + return false; + } + dput(child); + child = res; + } } } inode = d_inode(child); @@ -1417,7 +1426,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, /* If there are mixed files and directories we need a new table */ if (nr_dirs && nr_files) { struct ctl_table *new; - files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + files = kcalloc(nr_files + 1, sizeof(struct ctl_table), GFP_KERNEL); if (!files) goto out; diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 
d0cf1c50bb6c..c69ff191e5d8 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -126,18 +126,6 @@ static const struct seq_operations tty_drivers_op = { .show = show_tty_driver }; -static int tty_drivers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_drivers_op); -} - -static const struct file_operations proc_tty_drivers_operations = { - .open = tty_drivers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -147,11 +135,11 @@ void proc_tty_register_driver(struct tty_driver *driver) struct proc_dir_entry *ent; if (!driver->driver_name || driver->proc_entry || - !driver->ops->proc_fops) + !driver->ops->proc_show) return; - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); + ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_show, driver); driver->proc_entry = ent; } @@ -186,6 +174,6 @@ void __init proc_tty_init(void) * entry. 
*/ proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); - proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); - proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); + proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops); + proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 61b7340b357a..f4b1a9d2eca6 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -204,8 +204,7 @@ struct proc_dir_entry proc_root = { .proc_fops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, - .name = proc_root.inline_name, - .inline_name = "/proc", + .name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/proc/self.c b/fs/proc/self.c index 4d7d061696b3..127265e5c55f 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; inode_lock(root_inode); diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 24072cc06e65..12901dcf57e2 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -25,21 +25,9 @@ static int show_softirqs(struct seq_file *p, void *v) return 0; } -static int softirqs_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_softirqs, NULL); -} - -static const struct file_operations proc_softirqs_operations = { - .open = softirqs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_softirqs_init(void) { - 
proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + proc_create_single("softirqs", 0, NULL, show_softirqs); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c486ad4b43f0..dfd73a4616ce 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -18,6 +18,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> +#include <linux/pkeys.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -673,13 +674,16 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", [ilog2(VM_PKEY_BIT3)] = "", +#if VM_PKEY_BIT4 + [ilog2(VM_PKEY_BIT4)] = "", #endif +#endif /* CONFIG_ARCH_HAS_PKEYS */ }; size_t i; @@ -727,10 +731,6 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } #endif /* HUGETLB_PAGE */ -void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) -{ -} - #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) @@ -831,11 +831,13 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); SEQ_PUT_DEC(" kB\nSwapPss: ", mss->swap_pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); seq_puts(m, " kB\n"); } if (!rollup_mode) { - arch_show_smap(m, vma); + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); } m_cache_vma(m, vma); @@ -937,7 +939,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, /* * The soft-dirty tracker uses #PF-s to catch writes * to 
pages, so write-protect the pte as well. See the - * Documentation/vm/soft-dirty.txt for full description + * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = *pte; @@ -1258,8 +1260,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); - frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); @@ -1310,11 +1313,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset = swp_offset(entry); + unsigned long offset; - offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + offset = swp_offset(entry) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,10 +1338,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, err = add_to_pagemap(addr, &pme, pm); if (err) break; - if (pm->show_pfn && (flags & PM_PRESENT)) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } } spin_unlock(ptl); return err; @@ -1421,7 +1429,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 
55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bits 57-60 zero * Bit 61 page is file-page or shared-anon @@ -1466,7 +1474,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_KERNEL); + pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9d2efaca499f..b905010ca9eb 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; inode_lock(root_inode); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 95a708d83721..3f723cb478af 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -10,7 +10,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; - struct timespec idle; + struct timespec64 idle; u64 nsec; u32 rem; int i; @@ -30,21 +30,9 @@ static int uptime_proc_show(struct seq_file *m, void *v) return 0; } -static int uptime_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, uptime_proc_show, NULL); -} - -static const struct file_operations uptime_proc_fops = { - .open = uptime_proc_open, - .read = seq_read, - .llseek = seq_lseek, - 
.release = single_release, -}; - static int __init proc_uptime_init(void) { - proc_create("uptime", 0, NULL, &uptime_proc_fops); + proc_create_single("uptime", 0, NULL, uptime_proc_show); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 94901e8e700d..b449f186577f 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -15,21 +15,9 @@ static int version_proc_show(struct seq_file *m, void *v) return 0; } -static int version_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, version_proc_show, NULL); -} - -static const struct file_operations version_proc_fops = { - .open = version_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_version_init(void) { - proc_create("version", 0, NULL, &version_proc_fops); + proc_create_single("version", 0, NULL, version_proc_show); return 0; } fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a45f0af22a60..cfb6674331fd 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> @@ -38,12 +39,23 @@ static size_t elfcorebuf_sz_orig; static char *elfnotes_buf; static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; /* Total size of vmcore file. 
*/ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); +static DEFINE_MUTEX(vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error * The called function has to take care of module refcounting. @@ -178,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to(dst, buf, tsz, userbuf)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} + +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + 
mutex_unlock(&vmcoredd_mutex); + return ret; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -215,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, buflen); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + return -EFAULT; + + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!buflen) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); - kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; + buflen -= tsz; *fpos += tsz; buffer += tsz; @@ -302,10 +416,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = { }; /** - * alloc_elfnotes_buf - allocate buffer for ELF note segment in - * vmalloc memory - * - * @notes_sz: size of buffer + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @sizez: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to 
user-space by means of remap_vmalloc_range(). @@ -313,12 +425,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = { * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is * disabled and there's no need to allow users to mmap the buffer. */ -static inline char *alloc_elfnotes_buf(size_t notes_sz) +static inline char *vmcore_alloc_buf(size_t size) { #ifdef CONFIG_MMU - return vmalloc_user(notes_sz); + return vmalloc_user(size); #else - return vzalloc(notes_sz); + return vzalloc(size); #endif } @@ -446,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmaped at page aligned + * address. 
+ */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); - kaddr = elfnotes_buf + start - elfcorebuf_sz; + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, kaddr, tsz)) goto fail; + size -= tsz; start += tsz; len += tsz; @@ -502,8 +649,8 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, - struct list_head *vc_list) +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { u64 size; struct vmcore *m; @@ -665,7 +812,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -698,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. 
+ */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -851,7 +1003,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -884,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -976,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, - struct list_head *vc_list) +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; struct vmcore *m; @@ -1145,6 +1302,202 @@ static int __init parse_crash_elf_headers(void) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. 
+ */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, + sizeof(vdd_hdr->name)); + memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); +} + +/** + * vmcoredd_update_program_headers - Update all Elf program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. + */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + 
phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * Elf header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the Elf program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write Elf note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. 
+ */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) { + ret = -ENOMEM; + goto out_err; + } + + /* Keep size of the buffer page aligned so that it can be mmaped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for driver's to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routing */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list */ + mutex_lock(&vmcoredd_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + mutex_unlock(&vmcoredd_mutex); + + vmcoredd_update_size(data_size); + return 0; + +out_err: + if (buf) + vfree(buf); + + if (dump) + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcoredd_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + /* Init function for vmcore module. 
*/ static int __init vmcore_init(void) { @@ -1192,4 +1545,7 @@ void vmcore_cleanup(void) kfree(m); } free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index dc720573fd53..c238ab8ba31d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -328,7 +328,7 @@ void pstore_record_init(struct pstore_record *record, record->psi = psinfo; /* Report zeroed timestamp if called before timekeeping has resumed. */ - record->time = ns_to_timespec(ktime_get_real_fast_ns()); + record->time = ns_to_timespec64(ktime_get_real_fast_ns()); } /* diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 49b2bc114868..bbd1e357c23d 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -153,21 +153,23 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return prz; } -static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, +static int ramoops_read_kmsg_hdr(char *buffer, struct timespec64 *time, bool *compressed) { char data_type; int header_length = 0; - if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec, - &time->tv_nsec, &data_type, &header_length) == 3) { + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu-%c\n%n", + (time64_t *)&time->tv_sec, &time->tv_nsec, &data_type, + &header_length) == 3) { if (data_type == 'C') *compressed = true; else *compressed = false; - } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n", - &time->tv_sec, &time->tv_nsec, &header_length) == 2) { - *compressed = false; + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lld.%lu\n%n", + (time64_t *)&time->tv_sec, &time->tv_nsec, + &header_length) == 2) { + *compressed = false; } else { time->tv_sec = 0; time->tv_nsec = 0; @@ -360,8 +362,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, char *hdr; size_t len; - hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", - record->time.tv_sec, + hdr = kasprintf(GFP_ATOMIC, 
RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n", + (time64_t)record->time.tv_sec, record->time.tv_nsec / 1000, record->compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index eca27878079d..8d72221735d7 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -114,13 +114,9 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned i brelse(bh); foundinode = qnx4_iget(dir->i_sb, ino); - if (IS_ERR(foundinode)) { + if (IS_ERR(foundinode)) QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n", PTR_ERR(foundinode))); - return ERR_CAST(foundinode); - } out: - d_add(dentry, foundinode); - - return NULL; + return d_splice_alias(foundinode, dentry); } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 72c2770830be..e2e98e653b8d 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -29,15 +29,11 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, if (ino) { foundinode = qnx6_iget(dir->i_sb, ino); qnx6_put_page(page); - if (IS_ERR(foundinode)) { + if (IS_ERR(foundinode)) pr_debug("lookup->iget -> error %ld\n", PTR_ERR(foundinode)); - return ERR_CAST(foundinode); - } } else { pr_debug("%s(): not found %s\n", __func__, name); - return NULL; } - d_add(dentry, foundinode); - return NULL; + return d_splice_alias(foundinode, dentry); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d88231e3b2be..fc20e06c56ba 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -711,21 +711,18 @@ EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct list_head *head; struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); - head = free_dquots.prev; - while (head != &free_dquots && sc->nr_to_scan) { - dquot = list_entry(head, struct dquot, dq_free); + while (!list_empty(&free_dquots) && sc->nr_to_scan) { + dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); 
remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; - head = free_dquots.prev; } spin_unlock(&dq_list_lock); return freed; diff --git a/fs/read_write.c b/fs/read_write.c index c4eabbfc90df..153f8f690490 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -778,7 +778,7 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, goto out; } if (nr_segs > fast_segs) { - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (iov == NULL) { ret = -ENOMEM; goto out; @@ -849,7 +849,7 @@ ssize_t compat_rw_copy_check_uvector(int type, goto out; if (nr_segs > fast_segs) { ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (iov == NULL) goto out; } @@ -2023,7 +2023,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) ret = mnt_want_write_file(dst_file); if (ret) { info->status = ret; - goto next_loop; + goto next_fdput; } dst_off = info->dest_offset; @@ -2058,9 +2058,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) next_file: mnt_drop_write_file(dst_file); -next_loop: +next_fdput: fdput(dst_fd); - +next_loop: if (fatal_signal_pending(current)) goto out; } diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index edc8ef78b63f..bf708ac287b4 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1456,7 +1456,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap))); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b13fc024d2ee..132ec4406ed0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1044,7 +1044,8 @@ research: if 
(blocks_needed == 1) { un = &unf_single; } else { - un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); + un = kcalloc(min(blocks_needed, max_to_insert), + UNFM_P_SIZE, GFP_NOFS); if (!un) { un = &unf_single; blocks_needed = 1; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 23148c3ed675..52eb5d293a34 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -350,7 +350,8 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) if (num_cnodes <= 0) { return NULL; } - head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head = vzalloc(array_size(num_cnodes, + sizeof(struct reiserfs_journal_cnode))); if (!head) { return NULL; } @@ -2192,10 +2193,12 @@ static int journal_read_transaction(struct super_block *sb, * now we know we've got a good transaction, and it was * inside the valid time ranges */ - log_blocks = kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS); - real_blocks = kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS); + log_blocks = kmalloc_array(get_desc_trans_len(desc), + sizeof(struct buffer_head *), + GFP_NOFS); + real_blocks = kmalloc_array(get_desc_trans_len(desc), + sizeof(struct buffer_head *), + GFP_NOFS); if (!log_blocks || !real_blocks) { brelse(c_bh); brelse(d_bh); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index bd39a998843d..97f3fc4fdd79 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + 
d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); @@ -1187,8 +1184,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); @@ -1320,7 +1316,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, int jbegin_count; umode_t old_inode_mode; unsigned long savelink = 1; - struct timespec ctime; + struct timespec64 ctime; if (flags & ~RENAME_NOREPLACE) return -EINVAL; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 7e288d97adcb..9fed1c05f1f4 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -76,83 +76,99 @@ static char *le_type(struct reiserfs_key *key) } /* %k */ -static void sprintf_le_key(char *buf, struct reiserfs_key *key) +static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key) { if (key) - sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), - le32_to_cpu(key->k_objectid), le_offset(key), - le_type(key)); + return scnprintf(buf, size, "[%d %d %s %s]", + le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } /* %K */ -static void sprintf_cpu_key(char *buf, struct cpu_key *key) +static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key) { if (key) - sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, - key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), - cpu_type(key)); + return 
scnprintf(buf, size, "[%d %d %s %s]", + key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, + reiserfs_cpu_offset(key), cpu_type(key)); else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } -static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +static int scnprintf_de_head(char *buf, size_t size, + struct reiserfs_de_head *deh) { if (deh) - sprintf(buf, - "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", - deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), - deh_location(deh), deh_state(deh)); + return scnprintf(buf, size, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), + deh_objectid(deh), deh_location(deh), + deh_state(deh)); else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } -static void sprintf_item_head(char *buf, struct item_head *ih) +static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih) { if (ih) { - strcpy(buf, - (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); - sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); - sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " - "free_space(entry_count) %d", - ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + char *p = buf; + char * const end = buf + size; + + p += scnprintf(p, end - p, "%s", + (ih_version(ih) == KEY_FORMAT_3_6) ? + "*3.6* " : "*3.5*"); + + p += scnprintf_le_key(p, end - p, &ih->ih_key); + + p += scnprintf(p, end - p, + ", item_len %d, item_location %d, free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), + ih_free_space(ih)); + return p - buf; } else - sprintf(buf, "[NULL]"); + return scnprintf(buf, size, "[NULL]"); } -static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +static int scnprintf_direntry(char *buf, size_t size, + struct reiserfs_dir_entry *de) { char name[20]; memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); name[de->de_namelen > 19 ? 
19 : de->de_namelen] = 0; - sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); + return scnprintf(buf, size, "\"%s\"==>[%d %d]", + name, de->de_dir_id, de->de_objectid); } -static void sprintf_block_head(char *buf, struct buffer_head *bh) +static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh) { - sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", - B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); + return scnprintf(buf, size, + "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); } -static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh) { - sprintf(buf, - "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bh->b_bdev, bh->b_size, - (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), - bh->b_state, bh->b_page, - buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", - buffer_dirty(bh) ? "DIRTY" : "CLEAN", - buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); + return scnprintf(buf, size, + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bh->b_bdev, bh->b_size, + (unsigned long long)bh->b_blocknr, + atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? 
"LOCKED" : "UNLOCKED"); } -static void sprintf_disk_child(char *buf, struct disk_child *dc) +static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc) { - sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), - dc_size(dc)); + return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]", + dc_block_number(dc), dc_size(dc)); } static char *is_there_reiserfs_struct(char *fmt, int *what) @@ -189,55 +205,60 @@ static void prepare_error_buf(const char *fmt, va_list args) char *fmt1 = fmt_buf; char *k; char *p = error_buf; + char * const end = &error_buf[sizeof(error_buf)]; int what; spin_lock(&error_lock); - strcpy(fmt1, fmt); + if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) { + strscpy(error_buf, "format string too long", end - error_buf); + goto out_unlock; + } while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { *k = 0; - p += vsprintf(p, fmt1, args); + p += vscnprintf(p, end - p, fmt1, args); switch (what) { case 'k': - sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + p += scnprintf_le_key(p, end - p, + va_arg(args, struct reiserfs_key *)); break; case 'K': - sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + p += scnprintf_cpu_key(p, end - p, + va_arg(args, struct cpu_key *)); break; case 'h': - sprintf_item_head(p, va_arg(args, struct item_head *)); + p += scnprintf_item_head(p, end - p, + va_arg(args, struct item_head *)); break; case 't': - sprintf_direntry(p, - va_arg(args, - struct reiserfs_dir_entry *)); + p += scnprintf_direntry(p, end - p, + va_arg(args, struct reiserfs_dir_entry *)); break; case 'y': - sprintf_disk_child(p, - va_arg(args, struct disk_child *)); + p += scnprintf_disk_child(p, end - p, + va_arg(args, struct disk_child *)); break; case 'z': - sprintf_block_head(p, - va_arg(args, struct buffer_head *)); + p += scnprintf_block_head(p, end - p, + va_arg(args, struct buffer_head *)); break; case 'b': - sprintf_buffer_head(p, - va_arg(args, struct buffer_head *)); + p += 
scnprintf_buffer_head(p, end - p, + va_arg(args, struct buffer_head *)); break; case 'a': - sprintf_de_head(p, - va_arg(args, - struct reiserfs_de_head *)); + p += scnprintf_de_head(p, end - p, + va_arg(args, struct reiserfs_de_head *)); break; } - p += strlen(p); fmt1 = k + 2; } - vsprintf(p, fmt1, args); + p += vscnprintf(p, end - p, fmt1, args); +out_unlock: spin_unlock(&error_lock); } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fe999157dd97..e39b3910d24d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -389,27 +389,13 @@ static int show_journal(struct seq_file *m, void *unused) return 0; } -static int r_open(struct inode *inode, struct file *file) -{ - return single_open(file, PDE_DATA(inode), - proc_get_parent_data(inode)); -} - -static const struct file_operations r_file_operations = { - .open = r_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, void *)) { - proc_create_data(name, 0, REISERFS_SB(sb)->procdir, - &r_file_operations, func); + proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); } int reiserfs_proc_info_init(struct super_block *sb) diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 6052d323bc9a..8096c74c38ac 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -120,7 +120,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * array of bitmap block pointers */ bitmap = - vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + vzalloc(array_size(bmap_nr_new, + sizeof(struct reiserfs_bitmap_info))); if (!bitmap) { /* * Journal bitmaps are still supersized, but the diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 5dbf5324bdda..ff94fad477e4 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ 
-451,10 +451,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { - struct timespec now = current_time(inode); + struct timespec64 now = current_time(inode); if (inode_unhashed(inode) || !inode->i_nlink || - timespec_equal(&inode->i_ctime, &now)) + timespec64_equal(&inode->i_ctime, &now)) return; inode->i_ctime = current_time(inode); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 8f06fd1f3d69..6ccb51993a76 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -213,7 +213,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long offset, maxoff; - struct inode *inode; + struct inode *inode = NULL; struct romfs_inode ri; const char *name; /* got from dentry */ int len, ret; @@ -233,7 +233,7 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, for (;;) { if (!offset || offset >= maxoff) - goto out0; + break; ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); if (ret < 0) @@ -244,37 +244,19 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, len); if (ret < 0) goto error; - if (ret == 1) + if (ret == 1) { + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + inode = romfs_iget(dir->i_sb, offset); break; + } /* next entry */ offset = be32_to_cpu(ri.next) & ROMFH_MASK; } - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto error; - } - goto outi; - - /* - * it's a bit funky, _lookup needs to return an error code - * (negative) or a NULL, both as a dentry. ENOENT should not - * be returned, instead we need to create a negative dentry by - * d_add(dentry, NULL); and return 0 as no error. 
- * (Although as I see, it only matters on writable file - * systems). - */ -out0: - inode = NULL; -outi: - d_add(dentry, inode); - ret = 0; + return d_splice_alias(inode, dentry); error: return ERR_PTR(ret); } diff --git a/fs/select.c b/fs/select.c index ba879c51288f..4a6b6e4b21cb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -233,7 +233,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, add_wait_queue(wait_address, &entry->wait); } -int poll_schedule_timeout(struct poll_wqueues *pwq, int state, +static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; @@ -258,7 +258,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, return rc; } -EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value @@ -503,14 +502,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) continue; f = fdget(i); if (f.file) { - const struct file_operations *f_op; - f_op = f.file->f_op; - mask = DEFAULT_POLLMASK; - if (f_op->poll) { - wait_key_set(wait, in, out, - bit, busy_flag); - mask = (*f_op->poll)(f.file, wait); - } + wait_key_set(wait, in, out, bit, + busy_flag); + mask = vfs_poll(f.file, wait); + fdput(f); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; @@ -813,34 +808,29 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { - __poll_t mask; - int fd; - - mask = 0; - fd = pollfd->fd; - if (fd >= 0) { - struct fd f = fdget(fd); - mask = EPOLLNVAL; - if (f.file) { - /* userland u16 ->events contains POLL... bitmap */ - __poll_t filter = demangle_poll(pollfd->events) | - EPOLLERR | EPOLLHUP; - mask = DEFAULT_POLLMASK; - if (f.file->f_op->poll) { - pwait->_key = filter; - pwait->_key |= busy_flag; - mask = f.file->f_op->poll(f.file, pwait); - if (mask & busy_flag) - *can_busy_poll = true; - } - /* Mask out unneeded events. 
*/ - mask &= filter; - fdput(f); - } - } + int fd = pollfd->fd; + __poll_t mask = 0, filter; + struct fd f; + + if (fd < 0) + goto out; + mask = EPOLLNVAL; + f = fdget(fd); + if (!f.file) + goto out; + + /* userland u16 ->events contains POLL... bitmap */ + filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; + pwait->_key = filter | busy_flag; + mask = vfs_poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; + mask &= filter; /* Mask out unneeded events. */ + fdput(f); + +out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); - return mask; } @@ -1223,7 +1213,7 @@ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { - bits = kmalloc(6 * size, GFP_KERNEL); + bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; diff --git a/fs/seq_file.c b/fs/seq_file.c index c6c27f1f9c98..4cc090b50cc5 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + width >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; diff --git a/fs/signalfd.c b/fs/signalfd.c index d2187a813376..4fcd1498acf5 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -81,83 +81,86 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait) static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, siginfo_t const *kinfo) { - long err; + struct signalfd_siginfo new; BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128); /* * Unused members should be zero ... */ - err = __clear_user(uinfo, sizeof(*uinfo)); + memset(&new, 0, sizeof(new)); /* * If you change siginfo_t structure, please be sure * this code is fixed accordingly. 
*/ - err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); - err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); - err |= __put_user(kinfo->si_code, &uinfo->ssi_code); + new.ssi_signo = kinfo->si_signo; + new.ssi_errno = kinfo->si_errno; + new.ssi_code = kinfo->si_code; switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) { case SIL_KILL: - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; break; case SIL_TIMER: - err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); - err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + new.ssi_tid = kinfo->si_tid; + new.ssi_overrun = kinfo->si_overrun; + new.ssi_ptr = (long) kinfo->si_ptr; + new.ssi_int = kinfo->si_int; break; case SIL_POLL: - err |= __put_user(kinfo->si_band, &uinfo->ssi_band); - err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); + new.ssi_band = kinfo->si_band; + new.ssi_fd = kinfo->si_fd; break; - case SIL_FAULT: - err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); -#endif -#ifdef BUS_MCEERR_AO + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. + * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR + * and SIL_FAULT_PKUERR are only generated by faults that + * deliver them synchronously to userspace. In case someone + * injects one of these signals and signalfd catches it treat + * it as SIL_FAULT. 
*/ - if (kinfo->si_signo == SIGBUS && - kinfo->si_code == BUS_MCEERR_AO) - err |= __put_user((short) kinfo->si_addr_lsb, - &uinfo->ssi_addr_lsb); + case SIL_FAULT: + new.ssi_addr = (long) kinfo->si_addr; +#ifdef __ARCH_SI_TRAPNO + new.ssi_trapno = kinfo->si_trapno; #endif -#ifdef BUS_MCEERR_AR - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ - if (kinfo->si_signo == SIGBUS && - kinfo->si_code == BUS_MCEERR_AR) - err |= __put_user((short) kinfo->si_addr_lsb, - &uinfo->ssi_addr_lsb); + break; + case SIL_FAULT_MCEERR: + new.ssi_addr = (long) kinfo->si_addr; +#ifdef __ARCH_SI_TRAPNO + new.ssi_trapno = kinfo->si_trapno; #endif + new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; case SIL_CHLD: - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user(kinfo->si_status, &uinfo->ssi_status); - err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); - err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; + new.ssi_status = kinfo->si_status; + new.ssi_utime = kinfo->si_utime; + new.ssi_stime = kinfo->si_stime; break; case SIL_RT: - default: /* * This case catches also the signals queued by sigqueue(). */ - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); + new.ssi_pid = kinfo->si_pid; + new.ssi_uid = kinfo->si_uid; + new.ssi_ptr = (long) kinfo->si_ptr; + new.ssi_int = kinfo->si_int; + break; + case SIL_SYS: + new.ssi_call_addr = (long) kinfo->si_call_addr; + new.ssi_syscall = kinfo->si_syscall; + new.ssi_arch = kinfo->si_arch; break; } - return err ? 
-EFAULT: sizeof(*uinfo); + if (copy_to_user(uinfo, &new, sizeof(struct signalfd_siginfo))) + return -EFAULT; + + return sizeof(*uinfo); } static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info, @@ -256,10 +259,8 @@ static const struct file_operations signalfd_fops = { .llseek = noop_llseek, }; -static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, - int flags) +static int do_signalfd4(int ufd, sigset_t *mask, int flags) { - sigset_t sigmask; struct signalfd_ctx *ctx; /* Check the SFD_* constants for consistency. */ @@ -269,18 +270,15 @@ static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) return -EINVAL; - if (sizemask != sizeof(sigset_t) || - copy_from_user(&sigmask, user_mask, sizeof(sigmask))) - return -EINVAL; - sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - signotset(&sigmask); + sigdelsetmask(mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(mask); if (ufd == -1) { ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; - ctx->sigmask = sigmask; + ctx->sigmask = *mask; /* * When we call this, the initialization must be complete, since @@ -300,7 +298,7 @@ static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, return -EINVAL; } spin_lock_irq(¤t->sighand->siglock); - ctx->sigmask = sigmask; + ctx->sigmask = *mask; spin_unlock_irq(¤t->sighand->siglock); wake_up(¤t->sighand->signalfd_wqh); @@ -313,46 +311,51 @@ static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, size_t, sizemask, int, flags) { - return do_signalfd4(ufd, user_mask, sizemask, flags); + sigset_t mask; + + if (sizemask != sizeof(sigset_t) || + copy_from_user(&mask, user_mask, sizeof(mask))) + return -EINVAL; + return do_signalfd4(ufd, &mask, flags); } SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, size_t, sizemask) { - return 
do_signalfd4(ufd, user_mask, sizemask, 0); + sigset_t mask; + + if (sizemask != sizeof(sigset_t) || + copy_from_user(&mask, user_mask, sizeof(mask))) + return -EINVAL; + return do_signalfd4(ufd, &mask, 0); } #ifdef CONFIG_COMPAT static long do_compat_signalfd4(int ufd, - const compat_sigset_t __user *sigmask, + const compat_sigset_t __user *user_mask, compat_size_t sigsetsize, int flags) { - sigset_t tmp; - sigset_t __user *ksigmask; + sigset_t mask; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (get_compat_sigset(&tmp, sigmask)) - return -EFAULT; - ksigmask = compat_alloc_user_space(sizeof(sigset_t)); - if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + if (get_compat_sigset(&mask, user_mask)) return -EFAULT; - - return do_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); + return do_signalfd4(ufd, &mask, flags); } COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, - const compat_sigset_t __user *, sigmask, + const compat_sigset_t __user *, user_mask, compat_size_t, sigsetsize, int, flags) { - return do_compat_signalfd4(ufd, sigmask, sigsetsize, flags); + return do_compat_signalfd4(ufd, user_mask, sigsetsize, flags); } COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, - const compat_sigset_t __user *,sigmask, + const compat_sigset_t __user *, user_mask, compat_size_t, sigsetsize) { - return do_compat_signalfd4(ufd, sigmask, sigsetsize, 0); + return do_compat_signalfd4(ufd, user_mask, sigsetsize, 0); } #endif diff --git a/fs/splice.c b/fs/splice.c index 005d09cf3fa8..b3daa971f597 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -259,8 +259,9 @@ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc if (buffers <= PIPE_DEF_BUFFERS) return 0; - spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); + spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc_array(buffers, sizeof(struct partial_page), + 
GFP_KERNEL); if (spd->pages && spd->partial) return 0; @@ -395,7 +396,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, vec = __vec; if (nr_pages > PIPE_DEF_BUFFERS) { - vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL); + vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL); if (unlikely(!vec)) { res = -ENOMEM; goto out; @@ -1242,38 +1243,26 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, - unsigned long nr_segs, unsigned int flags) +static long vmsplice_to_user(struct file *file, struct iov_iter *iter, + unsigned int flags) { - struct pipe_inode_info *pipe; - struct splice_desc sd; - long ret; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; + struct pipe_inode_info *pipe = get_pipe_info(file); + struct splice_desc sd = { + .total_len = iov_iter_count(iter), + .flags = flags, + .u.data = iter + }; + long ret = 0; - pipe = get_pipe_info(file); if (!pipe) return -EBADF; - ret = import_iovec(READ, uiov, nr_segs, - ARRAY_SIZE(iovstack), &iov, &iter); - if (ret < 0) - return ret; - - sd.total_len = iov_iter_count(&iter); - sd.len = 0; - sd.flags = flags; - sd.u.data = &iter; - sd.pos = 0; - if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } - kfree(iov); return ret; } @@ -1282,14 +1271,11 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. 
*/ -static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, - unsigned long nr_segs, unsigned int flags) +static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter from; - long ret; + long ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) @@ -1299,22 +1285,31 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, if (!pipe) return -EBADF; - ret = import_iovec(WRITE, uiov, nr_segs, - ARRAY_SIZE(iovstack), &iov, &from); - if (ret < 0) - return ret; - pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) - ret = iter_to_pipe(&from, pipe, buf_flag); + ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) wakeup_pipe_readers(pipe); - kfree(iov); return ret; } +static int vmsplice_type(struct fd f, int *type) +{ + if (!f.file) + return -EBADF; + if (f.file->f_mode & FMODE_WRITE) { + *type = WRITE; + } else if (f.file->f_mode & FMODE_READ) { + *type = READ; + } else { + fdput(f); + return -EBADF; + } + return 0; +} + /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple @@ -1331,57 +1326,69 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
* */ -static long do_vmsplice(int fd, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags) { - struct fd f; - long error; - if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; - if (unlikely(nr_segs > UIO_MAXIOV)) - return -EINVAL; - else if (unlikely(!nr_segs)) - return 0; - error = -EBADF; - f = fdget(fd); - if (f.file) { - if (f.file->f_mode & FMODE_WRITE) - error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); - else if (f.file->f_mode & FMODE_READ) - error = vmsplice_to_user(f.file, iov, nr_segs, flags); - - fdput(f); - } + if (!iov_iter_count(iter)) + return 0; - return error; + if (iov_iter_rw(iter) == WRITE) + return vmsplice_to_pipe(f, iter, flags); + else + return vmsplice_to_user(f, iter, flags); } -SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { - return do_vmsplice(fd, iov, nr_segs, flags); + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + long error; + struct fd f; + int type; + + f = fdget(fd); + error = vmsplice_type(f, &type); + if (error) + return error; + + error = import_iovec(type, uiov, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (!error) { + error = do_vmsplice(f.file, &iter, flags); + kfree(iov); + } + fdput(f); + return error; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, unsigned int, nr_segs, unsigned int, flags) { - unsigned i; - struct iovec __user *iov; - if (nr_segs > UIO_MAXIOV) - return -EINVAL; - iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); - for (i = 0; i < nr_segs; i++) { - struct compat_iovec v; - if (get_user(v.iov_base, &iov32[i].iov_base) || - get_user(v.iov_len, &iov32[i].iov_len) || - put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || - 
put_user(v.iov_len, &iov[i].iov_len)) - return -EFAULT; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + long error; + struct fd f; + int type; + + f = fdget(fd); + error = vmsplice_type(f, &type); + if (error) + return error; + + error = compat_import_iovec(type, iov32, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (!error) { + error = do_vmsplice(f.file, &iter, flags); + kfree(iov); } - return do_vmsplice(fd, iov, nr_segs, flags); + fdput(f); + return error; } #endif diff --git a/fs/super.c b/fs/super.c index 122c402049a2..50728d9c1a05 100644 --- a/fs/super.c +++ b/fs/super.c @@ -121,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call trylock_super as it is a potential - * scalability bottleneck. The counts could get updated - * between super_cache_count and super_cache_scan anyway. - * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_shrink_count() and - * s_op->nr_cached_objects(). + * We don't call trylock_super() here as it is a scalability bottleneck, + * so we're exposed to partial setup state. The shrinker rwsem does not + * protect filesystem operations backing list_lru_shrink_count() or + * s_op->nr_cached_objects(). Counts can change between + * super_cache_count and super_cache_scan, so we really don't need locks + * here. + * + * However, if we are currently mounting the superblock, the underlying + * filesystem might be in a state of partial construction and hence it + * is dangerous to access it. trylock_super() uses a SB_BORN check to + * avoid this situation, so do the same here. The memory barrier is + * matched with the one in mount_fs() as we don't hold locks here. 
*/ + if (!(sb->s_flags & SB_BORN)) + return 0; + smp_rmb(); + if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); @@ -937,7 +947,7 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { down_write(&sb->s_umount); - if (sb->s_root && sb->s_flags & MS_BORN) { + if (sb->s_root && sb->s_flags & SB_BORN) { emergency_thaw_bdev(sb); thaw_super_locked(sb); } else { @@ -1272,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); + + /* + * Write barrier is for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is the + * superblock structure contents that we just set up, not the SB_BORN + * flag. + */ + smp_wmb(); sb->s_flags |= SB_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b428d317ae92..92682fcc41f6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -25,7 +25,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; - bool new_sb; + bool new_sb = false; if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) @@ -35,9 +35,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); root = kernfs_mount_ns(fs_type, flags, sysfs_root, SYSFS_MAGIC, &new_sb, ns); - if (IS_ERR(root) || !new_sb) + if (!new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (new_sb) + else if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 250b0755b908..4d5d20491ffd 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -51,14 +51,9 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un if (dentry->d_name.len > SYSV_NAMELEN) return ERR_PTR(-ENAMETOOLONG); ino = 
sysv_inode_by_name(dentry); - - if (ino) { + if (ino) inode = sysv_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 616a688f5d8f..55c508fe8131 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,14 +24,6 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } -static unsigned int ubifs_crypt_max_namelen(struct inode *inode) -{ - if (S_ISLNK(inode->i_mode)) - return UBIFS_MAX_INO_DATA; - else - return UBIFS_MAX_NLEN; -} - int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { @@ -89,5 +81,5 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = ubifs_crypt_max_namelen, + .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9d7fb88e172e..9da224d4f2da 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -214,7 +214,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, int err; union ubifs_key key; struct inode *inode = NULL; - struct ubifs_dent_node *dent; + struct ubifs_dent_node *dent = NULL; struct ubifs_info *c = dir->i_sb->s_fs_info; struct fscrypt_name nm; @@ -229,14 +229,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); if (fname_len(&nm) > UBIFS_MAX_NLEN) { - err = -ENAMETOOLONG; - goto out_fname; + inode = ERR_PTR(-ENAMETOOLONG); + goto done; } dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); if (!dent) { - err = -ENOMEM; - goto out_fname; + inode = ERR_PTR(-ENOMEM); + goto done; } if (nm.hash) { @@ -250,16 +250,16 @@ static struct 
dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, } if (err) { - if (err == -ENOENT) { + if (err == -ENOENT) dbg_gen("not found"); - goto done; - } - goto out_dent; + else + inode = ERR_PTR(err); + goto done; } if (dbg_check_name(c, dent, &nm)) { - err = -EINVAL; - goto out_dent; + inode = ERR_PTR(-EINVAL); + goto done; } inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); @@ -272,7 +272,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); - goto out_dent; + goto done; } if (ubifs_crypt_is_encrypted(dir) && @@ -280,27 +280,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, !fscrypt_has_permitted_context(dir, inode)) { ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); - err = -EPERM; - goto out_inode; + iput(inode); + inode = ERR_PTR(-EPERM); } done: kfree(dent); fscrypt_free_filename(&nm); - /* - * Note, d_splice_alias() would be required instead if we supported - * NFS. 
- */ - d_add(dentry, inode); - return NULL; - -out_inode: - iput(inode); -out_dent: - kfree(dent); -out_fname: - fscrypt_free_filename(&nm); - return ERR_PTR(err); + return d_splice_alias(inode, dentry); } static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -1289,7 +1276,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; - struct timespec time; + struct timespec64 time; unsigned int uninitialized_var(saved_nlink); struct fscrypt_name old_nm, new_nm; @@ -1517,7 +1504,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); struct inode *fst_inode = d_inode(old_dentry); struct inode *snd_inode = d_inode(new_dentry); - struct timespec time; + struct timespec64 time; int err; struct fscrypt_name fst_nm, snd_nm; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 1acb2ff505e6..fd7eb6fe9090 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1089,14 +1089,14 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (attr->ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -1367,8 +1367,9 @@ out: static inline int mctime_update_needed(const struct inode 
*inode, const struct timespec *now) { - if (!timespec_equal(&inode->i_mtime, now) || - !timespec_equal(&inode->i_ctime, now)) + struct timespec64 now64 = timespec_to_timespec64(*now); + if (!timespec64_equal(&inode->i_mtime, &now64) || + !timespec64_equal(&inode->i_ctime, &now64)) return 1; return 0; } @@ -1380,7 +1381,7 @@ static inline int mctime_update_needed(const struct inode *inode, * * This function updates time of the inode. */ -int ubifs_update_time(struct inode *inode, struct timespec *time, +int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); @@ -1424,7 +1425,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, */ static int update_mctime(struct inode *inode) { - struct timespec now = current_time(inode); + struct timespec now = timespec64_to_timespec(current_time(inode)); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1513,12 +1514,12 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made writable. * UBIFS must ensure page is budgeted for. 
*/ -static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; - struct timespec now = current_time(inode); + struct timespec now = timespec64_to_timespec(current_time(inode)); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; @@ -1567,8 +1568,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode))) { /* Page got truncated out from underneath us */ - err = -EINVAL; - goto out_unlock; + goto sigbus; } if (PagePrivate(page)) @@ -1597,12 +1597,10 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); return VM_FAULT_LOCKED; -out_unlock: +sigbus: unlock_page(page); ubifs_release_budget(c, &req); - if (err) - err = VM_FAULT_SIGBUS; - return err; + return VM_FAULT_SIGBUS; } static const struct vm_operations_struct ubifs_file_vm_ops = { diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 7b35e3d6cde7..a03a47cf880d 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -49,7 +49,7 @@ * maximum size. So dark watermark is the amount of free + dirty space in LEB * which are guaranteed to be reclaimable. If LEB has less space, the GC might * be unable to reclaim it. So, LEBs with free + dirty greater than dark - * watermark are "good" LEBs from GC's point of few. The other LEBs are not so + * watermark are "good" LEBs from GC's point of view. The other LEBs are not so * good, and GC takes extra care when moving them. */ diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 04c4ec6483e5..07b4956e0425 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -98,9 +98,8 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) * * This function reserves space in journal head @head. 
If the reservation * succeeded, the journal head stays locked and later has to be unlocked using - * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock - * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and - * other negative error codes in case of other failures. + * 'release_head()'. Returns zero in case of success, %-EAGAIN if commit has to + * be done, and other negative error codes in case of other failures. */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { @@ -1283,10 +1282,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in int *new_len) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, compr_type; + u32 dlen, out_len, old_dlen; out_len = le32_to_cpu(dn->size); - buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 8c795e6392b1..7cffa120a750 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -167,10 +167,10 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) * @lnum: LEB number of the bud * @offs: starting offset of the bud * - * This function writes reference node for the new bud LEB @lnum it to the log, - * and adds it to the buds tress. It also makes sure that log size does not + * This function writes a reference node for the new bud LEB @lnum to the log, + * and adds it to the buds trees. It also makes sure that log size does not * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, - * %-EAGAIN if commit is required, and a negative error codes in case of + * %-EAGAIN if commit is required, and a negative error code in case of * failure. 
*/ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 9a517109da0f..8e99dad18880 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -628,11 +628,12 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Needed by 'ubifs_pack_lsave()' */ c->main_first = c->leb_cnt - *main_lebs; - lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + lsave = kmalloc_array(c->lsave_cnt, sizeof(int), GFP_KERNEL); pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); buf = vmalloc(c->leb_size); - ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + ltab = vmalloc(array_size(sizeof(struct ubifs_lpt_lprops), + c->lpt_lebs)); if (!pnode || !nnode || !buf || !ltab || !lsave) { err = -ENOMEM; goto out; @@ -1626,7 +1627,8 @@ static int lpt_init_rd(struct ubifs_info *c) { int err, i; - c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->ltab = vmalloc(array_size(sizeof(struct ubifs_lpt_lprops), + c->lpt_lebs)); if (!c->ltab) return -ENOMEM; @@ -1636,15 +1638,17 @@ static int lpt_init_rd(struct ubifs_info *c) return -ENOMEM; for (i = 0; i < LPROPS_HEAP_CNT; i++) { - c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, - GFP_KERNEL); + c->lpt_heap[i].arr = kmalloc_array(LPT_HEAP_SZ, + sizeof(void *), + GFP_KERNEL); if (!c->lpt_heap[i].arr) return -ENOMEM; c->lpt_heap[i].cnt = 0; c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; } - c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + c->dirty_idx.arr = kmalloc_array(LPT_HEAP_SZ, sizeof(void *), + GFP_KERNEL); if (!c->dirty_idx.arr) return -ENOMEM; c->dirty_idx.cnt = 0; @@ -1688,7 +1692,8 @@ static int lpt_init_wr(struct ubifs_info *c) { int err, i; - c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->ltab_cmt = vmalloc(array_size(sizeof(struct ubifs_lpt_lprops), + c->lpt_lebs)); if (!c->ltab_cmt) return 
-ENOMEM; @@ -1697,7 +1702,7 @@ static int lpt_init_wr(struct ubifs_info *c) return -ENOMEM; if (c->big_lpt) { - c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + c->lsave = kmalloc_array(c->lsave_cnt, sizeof(int), GFP_NOFS); if (!c->lsave) return -ENOMEM; err = read_lsave(c); @@ -1939,8 +1944,8 @@ int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, return err; } - path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), - GFP_NOFS); + path = kmalloc_array(c->lpt_hght + 1, sizeof(struct lpt_scan_node), + GFP_NOFS); if (!path) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 235654c2fe89..78da65b2fb85 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -619,7 +619,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, /** * pnode_lookup - lookup a pnode in the LPT. * @c: UBIFS file-system description object - * @i: pnode number (0 to main_lebs - 1) + * @i: pnode number (0 to (main_lebs - 1) / UBIFS_LPT_FANOUT)) * * This function returns a pointer to the pnode on success or a negative * error code on failure. diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ae5c02f22f3e..85c2a43082b7 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -223,9 +223,6 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ", r->lnum, r->offs, r->len, r->deletion, r->sqnum); - /* Set c->replay_sqnum to help deal with dangling branches. */ - c->replay_sqnum = r->sqnum; - if (is_hash_key(c, &r->key)) { if (r->deletion) err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); @@ -1037,7 +1034,7 @@ int ubifs_replay_journal(struct ubifs_info *c) * The head of the log must always start with the * "commit start" node on a properly formatted UBIFS. 
* But we found no nodes at all, which means that - * someting went wrong and we cannot proceed mounting + * something went wrong and we cannot proceed mounting * the file-system. */ ubifs_err(c, "no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted", diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6c397a389105..c5466c70d620 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1196,7 +1196,8 @@ static int mount_ubifs(struct ubifs_info *c) * never exceed 64. */ err = -ENOMEM; - c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + c->bottom_up_buf = kmalloc_array(BOTTOM_UP_HEIGHT, sizeof(int), + GFP_KERNEL); if (!c->bottom_up_buf) goto out_free; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index ba3d0e0f8615..4a21e7f75e7a 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1104,8 +1104,9 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, ubifs_assert(znode); if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { kfree(c->bottom_up_buf); - c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), - GFP_NOFS); + c->bottom_up_buf = kmalloc_array(c->zroot.znode->level, + sizeof(int), + GFP_NOFS); if (!c->bottom_up_buf) return ERR_PTR(-ENOMEM); path = c->bottom_up_buf; diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index aa31f60220ef..a9df94ad46a3 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -366,7 +366,8 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) dbg_gc("%d znodes to write", cnt); - c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + c->gap_lebs = kmalloc_array(c->lst.idx_lebs + 1, sizeof(int), + GFP_NOFS); if (!c->gap_lebs) return -ENOMEM; @@ -674,7 +675,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); if (!leb_cnt) return 0; - c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + c->ilebs = kmalloc_array(leb_cnt, sizeof(int), GFP_NOFS); if (!c->ilebs) return 
-ENOMEM; for (i = 0; i < leb_cnt; i++) { diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5ee7af879cc4..04bf84d71e7b 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1206,7 +1206,6 @@ struct ubifs_debug_info; * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) - * @replay_sqnum: sequence number of node currently being replayed * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W * mode * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted @@ -1438,7 +1437,6 @@ struct ubifs_info { struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; - unsigned long long replay_sqnum; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; @@ -1740,7 +1738,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); #ifdef CONFIG_UBIFS_ATIME_SUPPORT -int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); +int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); #endif /* dir.c */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 759f1a209dbb..6f720fdf5020 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -139,7 +139,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, inode->i_op = &empty_iops; inode->i_fop = &empty_fops; - inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME; ui = ubifs_inode(inode); ui->xattr = 1; ui->flags |= UBIFS_XATTR_FL; diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index c6e17a744c3b..aa415054ad0a 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ config UDF_FS tristate "UDF file system support" select CRC_ITU_T + 
select NLS help This is a file system used on some CD-ROMs and DVDs. Since the file system is supported by multiple operating systems and is more @@ -13,8 +14,3 @@ config UDF_FS module will be called udf. If unsure, say N. - -config UDF_NLS - bool - default y - depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1b961b1d9699..fcda0fc97b90 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -533,8 +533,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, udf_write_aext(table, &epos, &eloc, (etype << 30) | elen, 1); } else - udf_delete_aext(table, epos, eloc, - (etype << 30) | elen); + udf_delete_aext(table, epos); } else { alloc_count = 0; } @@ -630,7 +629,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb, if (goal_elen) udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); else - udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + udf_delete_aext(table, goal_epos); brelse(goal_epos.bh); udf_add_free_space(sb, partition, -1); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 0a98a2369738..d9523013096f 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -141,10 +141,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, fibh->ebh->b_data, sizeof(struct fileIdentDesc) + fibh->soffset); - fi_len = (sizeof(struct fileIdentDesc) + - cfi->lengthFileIdent + - le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; - + fi_len = udf_dir_entry_len(cfi); *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); fibh->eoffset = fibh->soffset + fi_len; } else { @@ -152,6 +149,9 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, sizeof(struct fileIdentDesc)); } } + /* Got last entry outside of dir size - fs is corrupted! 
*/ + if (*nf_pos > dir->i_size) + return NULL; return fi; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b7a0d4b4bda1..56569023783b 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,8 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode->i_ctime = - iinfo->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + iinfo->i_crtime = timespec64_to_timespec(inode->i_mtime); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c80765d62f7e..9915a58fbabd 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1147,8 +1147,7 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) - udf_delete_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { udf_insert_aext(inode, *epos, laarr[i].extLocation, @@ -1271,6 +1270,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; + struct timespec ts; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; @@ -1443,15 +1443,12 @@ reread: inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) - inode->i_atime = sbi->s_record_time; - - if (!udf_disk_stamp_to_time(&inode->i_mtime, - fe->modificationTime)) - inode->i_mtime = sbi->s_record_time; - - if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) - inode->i_ctime = 
sbi->s_record_time; + udf_disk_stamp_to_time(&ts, fe->accessTime); + inode->i_atime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, fe->modificationTime); + inode->i_mtime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, fe->attrTime); + inode->i_ctime = timespec_to_timespec64(ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1461,18 +1458,13 @@ reread: inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) - inode->i_atime = sbi->s_record_time; - - if (!udf_disk_stamp_to_time(&inode->i_mtime, - efe->modificationTime)) - inode->i_mtime = sbi->s_record_time; - - if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) - iinfo->i_crtime = sbi->s_record_time; - - if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) - inode->i_ctime = sbi->s_record_time; + udf_disk_stamp_to_time(&ts, efe->accessTime); + inode->i_atime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, efe->modificationTime); + inode->i_mtime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); + udf_disk_stamp_to_time(&ts, efe->attrTime); + inode->i_ctime = timespec_to_timespec64(ts); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1722,9 +1714,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); - udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&fe->accessTime, + timespec64_to_timespec(inode->i_atime)); + udf_time_to_disk_stamp(&fe->modificationTime, + timespec64_to_timespec(inode->i_mtime)); + 
udf_time_to_disk_stamp(&fe->attrTime, + timespec64_to_timespec(inode->i_ctime)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1743,14 +1738,17 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->objectSize = cpu_to_le64(inode->i_size); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_adjust_time(iinfo, inode->i_atime); - udf_adjust_time(iinfo, inode->i_mtime); - udf_adjust_time(iinfo, inode->i_ctime); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_atime)); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_mtime)); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_ctime)); - udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->accessTime, + timespec64_to_timespec(inode->i_atime)); + udf_time_to_disk_stamp(&efe->modificationTime, + timespec64_to_timespec(inode->i_mtime)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); - udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&efe->attrTime, + timespec64_to_timespec(inode->i_ctime)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); @@ -2177,14 +2175,15 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, return (nelen >> 30); } -int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr eloc, uint32_t elen) +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; + struct kernel_lb_addr eloc; + uint32_t elen; if (epos.bh) { get_bh(epos.bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0458dd47e105..06f37ddd2997 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -351,8 +351,6 
@@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; int nfidlen; - uint8_t lfi; - uint16_t liu; udf_pblk_t block; struct kernel_lb_addr eloc; uint32_t elen = 0; @@ -383,7 +381,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, namelen = 0; } - nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); f_pos = udf_ext0_offset(dir); @@ -424,12 +422,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, goto out_err; } - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (((sizeof(struct fileIdentDesc) + - liu + lfi + 3) & ~3) == nfidlen) { + if (udf_dir_entry_len(cfi) == nfidlen) { cfi->descTag.tagSerialNum = cpu_to_le16(1); cfi->fileVersionNum = cpu_to_le16(1); cfi->fileCharacteristics = 0; @@ -622,8 +616,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } @@ -733,8 +726,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); @@ -1203,9 +1195,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_fi) { dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, - (sizeof(struct fileIdentDesc) + - le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 
mark_inode_dirty(old_inode); else diff --git a/fs/udf/super.c b/fs/udf/super.c index 7949c338efa5..0c504c8031d3 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -572,7 +572,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_utf8: uopt->flags |= (1 << UDF_FLAG_UTF8); break; -#ifdef CONFIG_UDF_NLS case Opt_iocharset: if (!remount) { if (uopt->nls_map) @@ -581,7 +580,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->flags |= (1 << UDF_FLAG_NLS_MAP); } break; -#endif case Opt_uforget: uopt->flags |= (1 << UDF_FLAG_UID_FORGET); break; @@ -864,6 +862,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) struct buffer_head *bh; uint16_t ident; int ret = -ENOMEM; +#ifdef UDFFS_DEBUG + struct timestamp *ts; +#endif outstr = kmalloc(128, GFP_NOFS); if (!outstr) @@ -882,24 +883,24 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) pvoldesc = (struct primaryVolDesc *)bh->b_data; - if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, - pvoldesc->recordingDateAndTime)) { + udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time, + pvoldesc->recordingDateAndTime); #ifdef UDFFS_DEBUG - struct timestamp *ts = &pvoldesc->recordingDateAndTime; - udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", - le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, - ts->minute, le16_to_cpu(ts->typeAndTimezone)); + ts = &pvoldesc->recordingDateAndTime; + udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n", + le16_to_cpu(ts->year), ts->month, ts->day, ts->hour, + ts->minute, le16_to_cpu(ts->typeAndTimezone)); #endif - } - ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32); + + ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); if (ret < 0) goto out_bh; strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); - ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128); + ret = udf_dstrCS0toChar(sb, 
outstr, 127, pvoldesc->volSetIdent, 128); if (ret < 0) goto out_bh; @@ -1587,7 +1588,7 @@ static struct udf_vds_record *handle_partition_descriptor( struct udf_vds_record *new_loc; unsigned int new_size = ALIGN(partnum, PART_DESC_ALLOC_STEP); - new_loc = kzalloc(sizeof(*new_loc) * new_size, GFP_KERNEL); + new_loc = kcalloc(new_size, sizeof(*new_loc), GFP_KERNEL); if (!new_loc) return ERR_PTR(-ENOMEM); memcpy(new_loc, data->part_descs_loc, @@ -1646,8 +1647,9 @@ static noinline int udf_process_sequence( memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); data.size_part_descs = PART_DESC_ALLOC_STEP; - data.part_descs_loc = kzalloc(sizeof(*data.part_descs_loc) * - data.size_part_descs, GFP_KERNEL); + data.part_descs_loc = kcalloc(data.size_part_descs, + sizeof(*data.part_descs_loc), + GFP_KERNEL); if (!data.part_descs_loc) return -ENOMEM; @@ -2117,7 +2119,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto parse_options_failure; } -#ifdef CONFIG_UDF_NLS if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { uopt.nls_map = load_nls_default(); if (!uopt.nls_map) @@ -2125,7 +2126,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) else udf_debug("Using default NLS map\n"); } -#endif if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) uopt.flags |= (1 << UDF_FLAG_UTF8); @@ -2279,10 +2279,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: -#ifdef CONFIG_UDF_NLS if (uopt.nls_map) unload_nls(uopt.nls_map); -#endif if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2332,10 +2330,8 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); -#ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); -#endif if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff 
--git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 68e8a64d22e0..84c47dde4d26 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -132,6 +132,12 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, struct fileIdentDesc *, struct udf_fileident_bh *, uint8_t *, uint8_t *); +static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) +{ + return ALIGN(sizeof(struct fileIdentDesc) + + le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, + UDF_NAME_PAD); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); @@ -167,8 +173,7 @@ extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); -extern int8_t udf_delete_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +extern int8_t udf_delete_aext(struct inode *, struct extent_position); extern int8_t udf_next_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, @@ -220,7 +225,8 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); -extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int); +extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int, + const uint8_t *, int); /* ialloc.c */ extern void udf_free_inode(struct inode *); @@ -252,8 +258,8 @@ extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); /* udftime.c */ -extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest, +extern void udf_disk_stamp_to_time(struct timespec *dest, struct timestamp 
src); -extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); +extern void udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src); #endif /* __UDF_DECL_H */ diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 0927a4b2ecaf..67b33ac5d41b 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -40,7 +40,7 @@ #include <linux/kernel.h> #include <linux/time.h> -struct timespec * +void udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) { u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone); @@ -67,10 +67,9 @@ udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) * recorded with bogus sub-second values. */ dest->tv_nsec %= NSEC_PER_SEC; - return dest; } -struct timestamp * +void udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) { long seconds; @@ -79,9 +78,6 @@ udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) offset = -sys_tz.tz_minuteswest; - if (!dest) - return NULL; - dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF)); seconds = ts.tv_sec + offset * 60; @@ -97,7 +93,6 @@ udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts) dest->centiseconds * 10000) / 100; dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 - dest->hundredsOfMicroseconds * 100); - return dest; } /* EOF */ diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 16a8ad21b77e..45234791fec2 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,101 +28,64 @@ #include "udf_sb.h" +#define PLANE_SIZE 0x10000 +#define UNICODE_MAX 0x10ffff #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_CHAR_BITS 10 +#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) -static int udf_uni2char_utf8(wchar_t uni, - unsigned char *out, - int boundlen) -{ - int u_len = 0; - - if (boundlen <= 0) - return -ENAMETOOLONG; +#define ILLEGAL_CHAR_MARK '_' +#define EXT_MARK '.' 
+#define CRC_MARK '#' +#define EXT_SIZE 5 +/* Number of chars we need to store generated CRC to make filename unique */ +#define CRC_LEN 5 - if ((uni & SURROGATE_MASK) == SURROGATE_PAIR) - return -EINVAL; +static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len, + int str_i_idx, int u_ch, unicode_t *ret) +{ + unicode_t c; + int start_idx = str_i_idx; + + /* Expand OSTA compressed Unicode to Unicode */ + c = str_i[str_i_idx++]; + if (u_ch > 1) + c = (c << 8) | str_i[str_i_idx++]; + if ((c & SURROGATE_MASK) == SURROGATE_PAIR) { + unicode_t next; + + /* Trailing surrogate char */ + if (str_i_idx >= str_i_max_len) { + c = UNICODE_MAX + 1; + goto out; + } - if (uni < 0x80) { - out[u_len++] = (unsigned char)uni; - } else if (uni < 0x800) { - if (boundlen < 2) - return -ENAMETOOLONG; - out[u_len++] = (unsigned char)(0xc0 | (uni >> 6)); - out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); - } else { - if (boundlen < 3) - return -ENAMETOOLONG; - out[u_len++] = (unsigned char)(0xe0 | (uni >> 12)); - out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f)); - out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); - } - return u_len; -} + /* Low surrogate must follow the high one... 
*/ + if (c & SURROGATE_LOW) { + c = UNICODE_MAX + 1; + goto out; + } -static int udf_char2uni_utf8(const unsigned char *in, - int boundlen, - wchar_t *uni) -{ - unsigned int utf_char; - unsigned char c; - int utf_cnt, u_len; - - utf_char = 0; - utf_cnt = 0; - for (u_len = 0; u_len < boundlen;) { - c = in[u_len++]; - - /* Complete a multi-byte UTF-8 character */ - if (utf_cnt) { - utf_char = (utf_char << 6) | (c & 0x3f); - if (--utf_cnt) - continue; - } else { - /* Check for a multi-byte UTF-8 character */ - if (c & 0x80) { - /* Start a multi-byte UTF-8 character */ - if ((c & 0xe0) == 0xc0) { - utf_char = c & 0x1f; - utf_cnt = 1; - } else if ((c & 0xf0) == 0xe0) { - utf_char = c & 0x0f; - utf_cnt = 2; - } else if ((c & 0xf8) == 0xf0) { - utf_char = c & 0x07; - utf_cnt = 3; - } else if ((c & 0xfc) == 0xf8) { - utf_char = c & 0x03; - utf_cnt = 4; - } else if ((c & 0xfe) == 0xfc) { - utf_char = c & 0x01; - utf_cnt = 5; - } else { - utf_cnt = -1; - break; - } - continue; - } else { - /* Single byte UTF-8 character (most common) */ - utf_char = c; - } + WARN_ON_ONCE(u_ch != 2); + next = str_i[str_i_idx++] << 8; + next |= str_i[str_i_idx++]; + if ((next & SURROGATE_MASK) != SURROGATE_PAIR || + !(next & SURROGATE_LOW)) { + c = UNICODE_MAX + 1; + goto out; } - *uni = utf_char; - break; - } - if (utf_cnt) { - *uni = '?'; - return -EINVAL; + + c = PLANE_SIZE + + ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) + + (next & SURROGATE_CHAR_MASK); } - return u_len; +out: + *ret = c; + return str_i_idx - start_idx; } -#define ILLEGAL_CHAR_MARK '_' -#define EXT_MARK '.' 
-#define CRC_MARK '#' -#define EXT_SIZE 5 -/* Number of chars we need to store generated CRC to make filename unique */ -#define CRC_LEN 5 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, int *str_o_idx, @@ -132,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, int (*conv_f)(wchar_t, unsigned char *, int), int translate) { - uint32_t c; + unicode_t c; int illChar = 0; int len, gotch = 0; - for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { + while (!gotch && *str_i_idx < str_i_max_len) { if (*str_o_idx >= str_o_max_len) { *needsCRC = 1; return gotch; } - /* Expand OSTA compressed Unicode to Unicode */ - c = str_i[*str_i_idx]; - if (u_ch > 1) - c = (c << 8) | str_i[*str_i_idx + 1]; - - if (translate && (c == '/' || c == 0)) + len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch, + &c); + /* These chars cannot be converted. Replace them. */ + if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) || + (translate && c == '/')) { illChar = 1; - else if (illChar) + if (!translate) + gotch = 1; + } else if (illChar) break; else gotch = 1; + *str_i_idx += len; } if (illChar) { *needsCRC = 1; @@ -160,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, gotch = 1; } if (gotch) { - len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); + if (conv_f) { + len = conv_f(c, &str_o[*str_o_idx], + str_o_max_len - *str_o_idx); + } else { + len = utf32_to_utf8(c, &str_o[*str_o_idx], + str_o_max_len - *str_o_idx); + if (len < 0) + len = -ENAMETOOLONG; + } /* Valid character? 
*/ if (len >= 0) *str_o_idx += len; @@ -168,16 +141,16 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, *needsCRC = 1; gotch = 0; } else { - str_o[(*str_o_idx)++] = '?'; + str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK; *needsCRC = 1; } } return gotch; } -static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, +static int udf_name_from_CS0(struct super_block *sb, + uint8_t *str_o, int str_max_len, const uint8_t *ocu, int ocu_len, - int (*conv_f)(wchar_t, unsigned char *, int), int translate) { uint32_t c; @@ -194,6 +167,7 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, unsigned short valueCRC; uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; uint8_t crc[CRC_LEN]; + int (*conv_f)(wchar_t, unsigned char *, int); if (str_max_len <= 0) return 0; @@ -203,6 +177,11 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, return 0; } + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + conv_f = UDF_SB(sb)->s_nls_map->uni2char; + else + conv_f = NULL; + cmp_id = ocu[0]; if (cmp_id != 8 && cmp_id != 16) { memset(str_o, 0, str_max_len); @@ -293,18 +272,24 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, return str_o_len; } -static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, - const uint8_t *str_i, int str_len, - int (*conv_f)(const unsigned char *, int, wchar_t *)) +static int udf_name_to_CS0(struct super_block *sb, + uint8_t *ocu, int ocu_max_len, + const uint8_t *str_i, int str_len) { int i, len; unsigned int max_val; - wchar_t uni_char; int u_len, u_ch; + unicode_t uni_char; + int (*conv_f)(const unsigned char *, int, wchar_t *); if (ocu_max_len <= 0) return 0; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + conv_f = UDF_SB(sb)->s_nls_map->char2uni; + else + conv_f = NULL; + memset(ocu, 0, ocu_max_len); ocu[0] = 8; max_val = 0xff; @@ -312,36 +297,61 @@ static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, try_again: u_len = 1; - for (i = 0; i < str_len; i++) { + for (i = 0; i < str_len; i += len) { /* Name 
didn't fit? */ if (u_len + u_ch > ocu_max_len) return 0; - len = conv_f(&str_i[i], str_len - i, &uni_char); - if (!len) - continue; + if (conv_f) { + wchar_t wchar; + + len = conv_f(&str_i[i], str_len - i, &wchar); + if (len > 0) + uni_char = wchar; + } else { + len = utf8_to_utf32(&str_i[i], str_len - i, + &uni_char); + } /* Invalid character, deal with it */ - if (len < 0) { + if (len <= 0 || uni_char > UNICODE_MAX) { len = 1; uni_char = '?'; } if (uni_char > max_val) { - max_val = 0xffff; - ocu[0] = 0x10; - u_ch = 2; - goto try_again; + unicode_t c; + + if (max_val == 0xff) { + max_val = 0xffff; + ocu[0] = 0x10; + u_ch = 2; + goto try_again; + } + /* + * Use UTF-16 encoding for chars outside we + * cannot encode directly. + */ + if (u_len + 2 * u_ch > ocu_max_len) + return 0; + + uni_char -= PLANE_SIZE; + c = SURROGATE_PAIR | + ((uni_char >> SURROGATE_CHAR_BITS) & + SURROGATE_CHAR_MASK); + ocu[u_len++] = (uint8_t)(c >> 8); + ocu[u_len++] = (uint8_t)(c & 0xff); + uni_char = SURROGATE_PAIR | SURROGATE_LOW | + (uni_char & SURROGATE_CHAR_MASK); } if (max_val == 0xffff) ocu[u_len++] = (uint8_t)(uni_char >> 8); ocu[u_len++] = (uint8_t)(uni_char & 0xff); - i += len - 1; } return u_len; } -int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, +int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len) { int s_len = 0; @@ -355,14 +365,12 @@ int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, } } - return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, - udf_uni2char_utf8, 0); + return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0); } int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, uint8_t *dname, int dlen) { - int (*conv_f)(wchar_t, unsigned char *, int); int ret; if (!slen) @@ -371,14 +379,7 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, if (dlen <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - conv_f = udf_uni2char_utf8; - } else if (UDF_QUERY_FLAG(sb, 
UDF_FLAG_NLS_MAP)) { - conv_f = UDF_SB(sb)->s_nls_map->uni2char; - } else - BUG(); - - ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); + ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1); /* Zero length filename isn't valid... */ if (ret == 0) ret = -EINVAL; @@ -388,15 +389,6 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, uint8_t *dname, int dlen) { - int (*conv_f)(const unsigned char *, int, wchar_t *); - - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - conv_f = udf_char2uni_utf8; - } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - conv_f = UDF_SB(sb)->s_nls_map->char2uni; - } else - BUG(); - - return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); + return udf_name_to_CS0(sb, dname, dlen, sname, slen); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 32545cd00ceb..d5f43ba76c59 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -39,8 +39,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -193,8 +192,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; out_fail: diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8254b8b3690f..488088141451 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -541,7 +541,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb) * Read cylinder group (we read only first fragment from block * at this time) and prepare internal data structures for cg caching. 
*/ - if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) + sbi->s_ucg = kmalloc_array(uspi->s_ncg, sizeof(struct buffer_head *), + GFP_NOFS); + if (!sbi->s_ucg) goto failed; for (i = 0; i < uspi->s_ncg; i++) sbi->s_ucg[i] = NULL; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..594d192b2331 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -62,6 +62,8 @@ struct userfaultfd_ctx { enum userfaultfd_state state; /* released */ bool released; + /* memory mappings are changing because of non-cooperative event */ + bool mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -220,24 +222,26 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long reason) { struct mm_struct *mm = ctx->mm; - pte_t *pte; + pte_t *ptep, pte; bool ret = true; VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); - if (!pte) + ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + + if (!ptep) goto out; ret = false; + pte = huge_ptep_get(ptep); /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. */ - if (huge_pte_none(*pte)) + if (huge_pte_none(pte)) ret = true; - if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) ret = true; out: return ret; @@ -641,6 +645,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. 
*/ out: + WRITE_ONCE(ctx->mmap_changing, false); userfaultfd_ctx_put(ctx); } @@ -686,10 +691,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + WRITE_ONCE(octx->mmap_changing, true); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -732,6 +739,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); } } @@ -772,6 +780,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); up_read(&mm->mmap_sem); msg_init(&ewq.msg); @@ -815,6 +824,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1653,6 +1663,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ @@ -1674,7 +1688,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + uffdio_copy.len, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1705,6 +1719,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy 
"zeropage" last field */ @@ -1721,7 +1739,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (mmget_not_zero(ctx->mm)) { ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1900,6 +1919,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/fs/xattr.c b/fs/xattr.c index 61cd28ba25f3..f9cb1db187b7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -229,7 +229,7 @@ out: } EXPORT_SYMBOL_GPL(vfs_setxattr); -ssize_t +static ssize_t xattr_getsecurity(struct inode *inode, const char *name, void *value, size_t size) { @@ -254,7 +254,6 @@ out: out_noalloc: return len; } -EXPORT_SYMBOL_GPL(xattr_getsecurity); /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr @@ -354,7 +353,6 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) if (error) return error; if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { - error = -EOPNOTSUPP; error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 46bcf0e649f5..457ac9f97377 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_REPAIR + bool "XFS online metadata repair support" + default n + depends on XFS_FS && XFS_ONLINE_SCRUB + help + If you say Y here you will be able to repair metadata on a + mounted XFS filesystem. This feature is intended to reduce + filesystem downtime by fixing minor problems before they cause the + filesystem to go down. 
However, it requires that the filesystem be + formatted with secondary metadata, such as reverse mappings and inode + parent pointers. + + This feature is considered EXPERIMENTAL. Use with caution! + + See the xfs_scrub man page in section 8 for additional information. + + If unsure, say N. + config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7ceb41a9786a..2f3f75a7f180 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -1,20 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (c) 2000-2005 Silicon Graphics, Inc. # All Rights Reserved. # -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it would be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write the Free Software Foundation, -# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# ccflags-y += -I$(src) # needed for trace events ccflags-y += -I$(src)/libxfs @@ -28,6 +16,7 @@ xfs-y += xfs_trace.o # build the libxfs code first xfs-y += $(addprefix libxfs/, \ + xfs_ag.o \ xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ @@ -61,6 +50,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ + xfs_types.o \ ) # xfs_rtbitmap is shared with libxfs xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ @@ -163,4 +153,12 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o + +# online repair +ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) +xfs-y += $(addprefix scrub/, \ + agheader_repair.o \ + repair.o \ + ) +endif endif diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 7bace03dc9dc..fdd9d6ede25c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> #include <linux/sched/mm.h> diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 6023b594ead7..8e6b3ba81c03 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SUPPORT_KMEM_H__ #define __XFS_SUPPORT_KMEM_H__ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c new file mode 100644 index 000000000000..9345802c99f7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.c @@ -0,0 +1,464 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. 
+ */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" + +static struct xfs_buf * +xfs_get_aghdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + +/* + * Generic btree root block init function + */ +static void +xfs_btroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0); +} + +/* + * Alloc btree root block init functions + */ +static void +xfs_bnoroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +static void +xfs_cntroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +/* + * Reverse map root block init + */ +static void +xfs_rmaproot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + 
struct aghdr_init_data *id) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_rmap_rec *rrec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0); + + /* + * mark the AG header regions as static metadata The BNO + * btree block is the first block after the headers, so + * it's location defines the size of region the static + * metadata consumes. + * + * Note: unlike mkfs, we never have to account for log + * space when growing the data regions + */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + rrec->rm_offset = 0; + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + rrec->rm_offset = 0; + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } +} + +/* + * Initialise new secondary superblocks with the pre-grow geometry, but mark + * them as "in progress" so we know they haven't yet been activated. 
This will + * get cleared when the update with the new geometry information is done after + * changes to the primary are committed. This isn't strictly necessary, but we + * get it for free with the delayed buffer write lists and it means we can tell + * if a grow operation didn't complete properly after the fact. + */ +static void +xfs_sbblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + xfs_sb_to_disk(dsb, &mp->m_sb); + dsb->sb_inprogress = 1; +} + +static void +xfs_agfblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + xfs_extlen_t tmpsize; + + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(id->agno); + agf->agf_length = cpu_to_be32(id->agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_blocks = cpu_to_be32(1); + } + + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; + agf->agf_flcount = 0; + tmpsize = id->agsize - mp->m_ag_prealloc_blocks; + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } +} + +static void +xfs_agflblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data 
*id) +{ + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno; + int bucket; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(id->agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); +} + +static void +xfs_agiblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int bucket; + + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(id->agno); + agi->agi_length = cpu_to_be32(id->agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); +} + +typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, + struct aghdr_init_data *id); +static int +xfs_ag_init_hdr( + struct xfs_mount *mp, + struct aghdr_init_data *id, + aghdr_init_work_f work, + const struct xfs_buf_ops *ops) + +{ + struct xfs_buf *bp; + + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + if (!bp) + return -ENOMEM; + + (*work)(mp, bp, id); + + xfs_buf_delwri_queue(bp, &id->buffer_list); + xfs_buf_relse(bp); + return 0; +} + +struct xfs_aghdr_grow_data { + xfs_daddr_t daddr; + size_t numblks; + const struct xfs_buf_ops *ops; + aghdr_init_work_f work; + 
xfs_btnum_t type; + bool need_init; +}; + +/* + * Prepare new AG headers to be written to disk. We use uncached buffers here, + * as it is assumed these new AG headers are currently beyond the currently + * valid filesystem address space. Using cached buffers would trip over EOFS + * corruption detection alogrithms in the buffer cache lookup routines. + * + * This is a non-transactional function, but the prepared buffers are added to a + * delayed write buffer list supplied by the caller so they can submit them to + * disk and wait on them as required. + */ +int +xfs_ag_init_headers( + struct xfs_mount *mp, + struct aghdr_init_data *id) + +{ + struct xfs_aghdr_grow_data aghdr_data[] = { + { /* SB */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_sb_buf_ops, + .work = &xfs_sbblock_init, + .need_init = true + }, + { /* AGF */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agf_buf_ops, + .work = &xfs_agfblock_init, + .need_init = true + }, + { /* AGFL */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agfl_buf_ops, + .work = &xfs_agflblock_init, + .need_init = true + }, + { /* AGI */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agi_buf_ops, + .work = &xfs_agiblock_init, + .need_init = true + }, + { /* BNO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_bnoroot_init, + .need_init = true + }, + { /* CNT root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_cntroot_init, + .need_init = true + }, + { /* INO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)), + .numblks = 
BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_INO, + .need_init = true + }, + { /* FINO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_FINO, + .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + }, + { /* RMAP root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_rmapbt_buf_ops, + .work = &xfs_rmaproot_init, + .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + }, + { /* REFC root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_refcountbt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_REFC, + .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + }, + { /* NULL terminating block */ + .daddr = XFS_BUF_DADDR_NULL, + } + }; + struct xfs_aghdr_grow_data *dp; + int error = 0; + + /* Account for AG free space in new AG */ + id->nfree += id->agsize - mp->m_ag_prealloc_blocks; + for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) { + if (!dp->need_init) + continue; + + id->daddr = dp->daddr; + id->numblks = dp->numblks; + id->type = dp->type; + error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); + if (error) + break; + } + return error; +} + +/* + * Extent the AG indicated by the @id by the length passed in + */ +int +xfs_ag_extend_space( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct aghdr_init_data *id, + xfs_extlen_t len) +{ + struct xfs_owner_info oinfo; + struct xfs_buf *bp; + struct xfs_agi *agi; + struct xfs_agf *agf; + int error; + + /* + * Change the agi length. 
+ */ + error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, len); + ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, len); + ASSERT(agf->agf_length == agi->agi_length); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + + /* + * Free the new space. + * + * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * this doesn't actually exist in the rmap btree. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, id->agno, + be32_to_cpu(agf->agf_length) - len, + len, &oinfo); + if (error) + return error; + + return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + be32_to_cpu(agf->agf_length) - len), + len, &oinfo, XFS_AG_RESV_NONE); +} diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h new file mode 100644 index 000000000000..412702e23f61 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. 
+ */ + +#ifndef __LIBXFS_AG_H +#define __LIBXFS_AG_H 1 + +struct xfs_mount; +struct xfs_trans; + +struct aghdr_init_data { + /* per ag data */ + xfs_agblock_t agno; /* ag to init */ + xfs_extlen_t agsize; /* new AG size */ + struct list_head buffer_list; /* buffer writeback list */ + xfs_rfsblock_t nfree; /* cumulative new free space */ + + /* per header data */ + xfs_daddr_t daddr; /* header location */ + size_t numblks; /* size of header */ + xfs_btnum_t type; /* type of btree root block */ +}; + +int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); +int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, + struct aghdr_init_data *id, xfs_extlen_t len); + +#endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 03885a968de8..fecd187fcf2c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -171,6 +157,7 @@ __xfs_ag_resv_free( error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); resv->ar_reserved = 0; resv->ar_asked = 0; + resv->ar_orig_reserved = 0; if (error) trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, @@ -203,13 +190,34 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; - xfs_extlen_t reserved; + xfs_extlen_t hidden_space; if (used > ask) ask = used; - reserved = ask - used; - error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + switch (type) { + case XFS_AG_RESV_RMAPBT: + /* + * Space taken by the rmapbt is not subtracted from fdblocks + * because the rmapbt lives in the free space. Here we must + * subtract the entire reservation from fdblocks so that we + * always have blocks available for rmapbt expansion. + */ + hidden_space = ask; + break; + case XFS_AG_RESV_METADATA: + /* + * Space taken by all other metadata btrees are accounted + * on-disk as used space. We therefore only hide the space + * that is reserved but not used by the trees. + */ + hidden_space = ask - used; + break; + default: + ASSERT(0); + return -EINVAL; + } + error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); @@ -230,7 +238,8 @@ __xfs_ag_resv_init( resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = reserved; + resv->ar_orig_reserved = hidden_space; + resv->ar_reserved = ask - used; trace_xfs_ag_resv_init(pag, type, ask); return 0; diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index 938f2f96c5e8..4619b554ee90 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_AG_RESV_H__ #define __XFS_AG_RESV_H__ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4bcc095fe44a..75dbdc14c45f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -39,6 +27,9 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_ag_resv.h" +#include "xfs_bmap.h" + +extern kmem_zone_t *xfs_bmap_free_item_zone; struct workqueue_struct *xfs_alloc_wq; @@ -224,15 +215,38 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat) /* output: success/failure */ { + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_private.a.agno; union xfs_btree_rec *rec; int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - *bno = be32_to_cpu(rec->alloc.ar_startblock); - *len = be32_to_cpu(rec->alloc.ar_blockcount); - } - return error; + if (error || !(*stat)) + return error; + + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + + if (*len == 0) + goto out_bad_rec; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbno(mp, agno, *bno)) + goto out_bad_rec; + if (*bno > *bno + *len) + goto out_bad_rec; + if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) + goto out_bad_rec; + + return 0; + +out_bad_rec: + xfs_warn(mp, + "%s Freespace BTree record corruption in AG %d detected!", + cur->bc_btnum == XFS_BTNUM_BNO ? 
"Block" : "Size", agno); + xfs_warn(mp, + "start block 0x%x block count 0x%x", *bno, *len); + return -EFSCORRUPTED; } /* @@ -2060,6 +2074,30 @@ xfs_alloc_space_available( return true; } +int +xfs_free_agfl_block( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + struct xfs_buf *agbp, + struct xfs_owner_info *oinfo) +{ + int error; + struct xfs_buf *bp; + + error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, + XFS_AG_RESV_AGFL); + if (error) + return error; + + bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0); + if (!bp) + return -EFSCORRUPTED; + xfs_trans_binval(tp, bp); + + return 0; +} + /* * Check the agfl fields of the agf for inconsistency or corruption. The purpose * is to detect an agfl header padding mismatch between current and early v5 @@ -2148,6 +2186,40 @@ xfs_agfl_reset( } /* + * Defer an AGFL block free. This is effectively equivalent to + * xfs_bmap_add_free() with some special handling particular to AGFL blocks. + * + * Deferring AGFL frees helps prevent log reservation overruns due to too many + * allocation operations in a transaction. AGFL frees are prone to this problem + * because for one they are always freed one at a time. Further, an immediate + * AGFL block free can cause a btree join and require another block free before + * the real allocation can proceed. Deferring the free disconnects freeing up + * the AGFL slot from freeing the block. 
+ */ +STATIC void +xfs_defer_agfl_block( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_fsblock_t agbno, + struct xfs_owner_info *oinfo) +{ + struct xfs_extent_free_item *new; /* new element */ + + ASSERT(xfs_bmap_free_item_zone != NULL); + ASSERT(oinfo != NULL); + + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + new->xefi_blockcount = 1; + new->xefi_oinfo = *oinfo; + + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -2247,21 +2319,20 @@ xfs_alloc_fix_freelist( else xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - struct xfs_buf *bp; - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, XFS_AG_RESV_AGFL); - if (error) - goto out_agbp_relse; - bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); - if (!bp) { - error = -EFSCORRUPTED; - goto out_agbp_relse; + + /* defer agfl frees if dfops is provided */ + if (tp->t_agfl_dfops) { + xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno, + bno, &targs.oinfo); + } else { + error = xfs_free_agfl_block(tp, args->agno, bno, agbp, + &targs.oinfo); + if (error) + goto out_agbp_relse; } - xfs_trans_binval(tp, bp); } targs.tp = tp; @@ -2949,18 +3020,20 @@ out: * after fixing up the freelist. 
*/ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type) /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); int error; + unsigned int busy_flags = 0; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -2984,7 +3057,9 @@ xfs_free_extent( if (error) goto err; - xfs_extent_busy_insert(tp, agno, agbno, len, 0); + if (skip_discard) + busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; + xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); return 0; err: @@ -3049,55 +3124,6 @@ xfs_alloc_query_all( return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } -/* Find the size of the AG, in blocks. */ -xfs_agblock_t -xfs_ag_block_count( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - ASSERT(agno < mp->m_sb.sb_agcount); - - if (agno < mp->m_sb.sb_agcount - 1) - return mp->m_sb.sb_agblocks; - return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); -} - -/* - * Verify that an AG block number pointer neither points outside the AG - * nor points at static metadata. - */ -bool -xfs_verify_agbno( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t agbno) -{ - xfs_agblock_t eoag; - - eoag = xfs_ag_block_count(mp, agno); - if (agbno >= eoag) - return false; - if (agbno <= XFS_AGFL_BLOCK(mp)) - return false; - return true; -} - -/* - * Verify that an FS block number pointer neither points outside the - * filesystem nor points at static AG metadata. 
- */ -bool -xfs_verify_fsbno( - struct xfs_mount *mp, - xfs_fsblock_t fsbno) -{ - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - - if (agno >= mp->m_sb.sb_agcount) - return false; - return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); -} - /* Is there a record covering a given extent? */ int xfs_alloc_has_record( @@ -3116,3 +3142,40 @@ xfs_alloc_has_record( return xfs_btree_has_record(cur, &low, &high, exists); } + +/* + * Walk all the blocks in the AGFL. The @walk_fn can return any negative + * error code or XFS_BTREE_QUERY_RANGE_ABORT. + */ +int +xfs_agfl_walk( + struct xfs_mount *mp, + struct xfs_agf *agf, + struct xfs_buf *agflbp, + xfs_agfl_walk_fn walk_fn, + void *priv) +{ + __be32 *agfl_bno; + unsigned int i; + int error; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + i = be32_to_cpu(agf->agf_flfirst); + + /* Nothing to walk in an empty AGFL. */ + if (agf->agf_flcount == cpu_to_be32(0)) + return 0; + + /* Otherwise, walk from first to last, wrapping as needed. */ + for (;;) { + error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (i == be32_to_cpu(agf->agf_fllast)) + break; + if (++i == xfs_agfl_size(mp)) + i = 0; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index cbf789ea5a4e..e716c993ac4c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ALLOC_H__ #define __XFS_ALLOC_H__ @@ -191,12 +179,24 @@ xfs_alloc_vextent( * Free an extent. */ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type); /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard); + +static inline int +xfs_free_extent( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + return __xfs_free_extent(tp, bno, len, oinfo, type, false); +} int /* error */ xfs_alloc_lookup_le( @@ -223,6 +223,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, + struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); @@ -240,12 +242,13 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); -xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); -bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno); -bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); int xfs_alloc_has_record(struct 
xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exist); +typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, + void *priv); +int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, + struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index b451649ba176..4e59cc8a2802 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -242,7 +230,6 @@ xfs_allocbt_init_ptr_from_cur( struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); - ASSERT(agf->agf_roots[cur->bc_btnum] != 0); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -547,3 +534,12 @@ xfs_allocbt_maxrecs( return blocklen / sizeof(xfs_alloc_rec_t); return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); } + +/* Calculate the freespace btree size for some records. 
*/ +xfs_extlen_t +xfs_allocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_alloc_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45e189e7e81c..c9305ebb69f6 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ALLOC_BTREE_H__ #define __XFS_ALLOC_BTREE_H__ @@ -61,5 +49,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 35a124400d60..99590f61d624 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -236,7 +224,7 @@ xfs_attr_set( args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; @@ -427,7 +415,7 @@ xfs_attr_remove( */ args.op_flags = XFS_DA_OP_OKNOENT; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2135b8e67dcc..76e90046731c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -477,7 +465,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. */ - minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ @@ -803,9 +791,8 @@ xfs_attr_shortform_to_leaf( ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { - error = xfs_da_shrink_inode(args, 0, bp); - bp = NULL; - if (error) + /* xfs_attr3_leaf_create may not have instantiated a block */ + if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0)) goto out; xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4da08af5b134..7b74e18becff 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ATTR_LEAF_H__ #define __XFS_ATTR_LEAF_H__ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 21be186067a2..bf2e0371149b 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -620,7 +608,7 @@ xfs_attr_rmtval_remove( /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 5a9acfa156d7..9d20b66ad379 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ATTR_REMOTE_H__ #define __XFS_ATTR_REMOTE_H__ diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index afd684ae3136..aafa4fe70624 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ATTR_SF_H__ #define __XFS_ATTR_SF_H__ diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0a94cce5ea35..40ce5f3094d1 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. 
* All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_log_format.h" diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index 61c6b2025d0c..99017b8df292 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BIT_H__ #define __XFS_BIT_H__ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 040eeda8426f..7205268b30bc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. 
* All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -246,7 +234,7 @@ xfs_bmap_get_bp( struct xfs_btree_cur *cur, xfs_fsblock_t bno) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int i; if (!cur) @@ -260,9 +248,9 @@ xfs_bmap_get_bp( } /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) { + struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; + if (bip->bli_item.li_type == XFS_LI_BUF && XFS_BUF_ADDR(bip->bli_buf) == bno) return bip->bli_buf; @@ -312,8 +300,9 @@ xfs_check_block( xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", + xfs_err(mp, "%s: ptrs are equal in node\n", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } } } @@ -483,7 +472,8 @@ error0: error_norelse: xfs_warn(mp, "%s: BAD after btree leaves for %d extents", __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return; } @@ -542,12 +532,13 @@ xfs_bmap_validate_ret( * The list is 
maintained sorted (by block number). */ void -xfs_bmap_add_free( +__xfs_bmap_add_free( struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_extent_free_item *new; /* new element */ #ifdef DEBUG @@ -574,6 +565,7 @@ xfs_bmap_add_free( new->xefi_oinfo = *oinfo; else xfs_rmap_skip_owner_update(&new->xefi_oinfo); + new->xefi_skip_discard = skip_discard; trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0, XFS_FSB_TO_AGBNO(mp, bno), len); xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); @@ -1244,7 +1236,6 @@ xfs_iread_extents( num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > nextents)) { - ASSERT(i + num_recs <= nextents); xfs_warn(ip->i_mount, "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); @@ -2001,10 +1992,13 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(bma->flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, + whichfork, new); + if (error) + goto done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2668,7 +2662,8 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec *new, xfs_fsblock_t *first, struct xfs_defer_ops *dfops, - int *logflagsp) + int *logflagsp, + int flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2845,10 +2840,12 @@ xfs_bmap_add_extent_hole_real( break; } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); + if (error) + goto 
done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -2926,7 +2923,7 @@ xfs_bmap_extsize_align( * perform this alignment, or if a truncate shot us in the * foot. */ - temp = do_mod(orig_off, extsz); + div_u64_rem(orig_off, extsz, &temp); if (temp) { align_alen += temp; align_off -= temp; @@ -3470,7 +3467,7 @@ xfs_bmap_btalloc( xfs_rmap_skip_owner_update(&args.oinfo); /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->length, mp->m_ag_max_usable); + args.maxlen = min(ap->length, mp->m_ag_max_usable); args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { @@ -3500,15 +3497,17 @@ xfs_bmap_btalloc( /* apply extent size hints if obtained earlier */ if (align) { args.prod = align; - if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) - args.mod = (xfs_extlen_t)(args.prod - args.mod); + div_u64_rem(ap->offset, args.prod, &args.mod); + if (args.mod) + args.mod = args.prod - args.mod; } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { args.prod = 1; args.mod = 0; } else { args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) - args.mod = (xfs_extlen_t)(args.prod - args.mod); + div_u64_rem(ap->offset, args.prod, &args.mod); + if (args.mod) + args.mod = args.prod - args.mod; } /* * If we are not low on available data blocks, and the @@ -4123,7 +4122,8 @@ xfs_bmapi_allocate( else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, - bma->firstblock, bma->dfops, &bma->logflags); + bma->firstblock, bma->dfops, &bma->logflags, + bma->flags); bma->logflags |= tmp_logflags; if (error) @@ -4509,30 +4509,37 @@ error0: return error; } -static int +int xfs_bmapi_remap( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - struct xfs_defer_ops *dfops) + struct xfs_defer_ops *dfops, + int flags) { struct xfs_mount *mp = ip->i_mount; - struct 
xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp; struct xfs_btree_cur *cur = NULL; xfs_fsblock_t firstblock = NULLFSBLOCK; struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; + int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_NORMAP))); + ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != + (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -4542,7 +4549,7 @@ xfs_bmapi_remap( return -EIO; if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; } @@ -4557,7 +4564,7 @@ xfs_bmapi_remap( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_flags & XFS_IFBROOT) { - cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = firstblock; cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; @@ -4566,18 +4573,21 @@ xfs_bmapi_remap( got.br_startoff = bno; got.br_startblock = startblock; got.br_blockcount = len; - got.br_state = XFS_EXT_NORM; + if (flags & XFS_BMAPI_PREALLOC) + got.br_state = XFS_EXT_UNWRITTEN; + else + got.br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur, - &cur, &got, &firstblock, dfops, &logflags); + error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, 
&icur, + &cur, &got, &firstblock, dfops, &logflags, flags); if (error) goto error0; - if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, XFS_DATA_FORK); + &tmp_logflags, whichfork); logflags |= tmp_logflags; } @@ -4945,13 +4955,15 @@ xfs_bmap_del_extent_real( if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { xfs_fsblock_t bno; xfs_filblks_t len; + xfs_extlen_t mod; + + bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize, + &mod); + ASSERT(mod == 0); + len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, + &mod); + ASSERT(mod == 0); - ASSERT(do_mod(del->br_blockcount, mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); if (error) goto done; @@ -5104,9 +5116,12 @@ xfs_bmap_del_extent_real( error = xfs_refcount_decrease_extent(mp, dfops, del); if (error) goto done; - } else - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + } else { + __xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL, + (bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN); + } } /* @@ -5285,9 +5300,12 @@ __xfs_bunmapi( del.br_blockcount = max_len; } + if (!isrt) + goto delete; + sum = del.br_startblock + del.br_blockcount; - if (isrt && - (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod); + if (mod) { /* * Realtime extent not lined up at the end. 
* The extent could have been split into written @@ -5334,7 +5352,8 @@ __xfs_bunmapi( goto error0; goto nodelete; } - if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); + if (mod) { /* * Realtime extent is lined up at the end but not * at the front. We'll get rid of full extents if @@ -5403,6 +5422,7 @@ __xfs_bunmapi( } } +delete: if (wasdel) { error = xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); @@ -5760,6 +5780,32 @@ del_cursor: return error; } +/* Make sure we won't be right-shifting an extent past the maximum bound. */ +int +xfs_bmap_can_insert_extents( + struct xfs_inode *ip, + xfs_fileoff_t off, + xfs_fileoff_t shift) +{ + struct xfs_bmbt_irec got; + int is_empty; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty); + if (!error && !is_empty && got.br_startoff >= off && + ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff) + error = -EINVAL; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + int xfs_bmap_insert_extents( struct xfs_trans *tp, @@ -6148,7 +6194,7 @@ xfs_bmap_finish_one( switch (type) { case XFS_BMAP_MAP: error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, dfops); + startblock, dfops, 0); *blockcount = 0; break; case XFS_BMAP_UNMAP: diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 2b766b37096d..9b49ddf99c41 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BMAP_H__ #define __XFS_BMAP_H__ @@ -68,6 +56,7 @@ struct xfs_extent_free_item xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ + bool xefi_skip_discard; }; #define XFS_BMAP_MAX_NMAP 4 @@ -116,6 +105,12 @@ struct xfs_extent_free_item /* Only convert unwritten extents, don't allocate new blocks */ #define XFS_BMAPI_CONVERT_ONLY 0x800 +/* Skip online discard of freed extents */ +#define XFS_BMAPI_NODISCARD 0x1000 + +/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. 
*/ +#define XFS_BMAPI_NORMAP 0x2000 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -128,7 +123,9 @@ struct xfs_extent_free_item { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" } + { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ + { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ + { XFS_BMAPI_NORMAP, "NORMAP" } static inline int xfs_bmapi_aflag(int w) @@ -192,9 +189,9 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +void __xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo); + struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -230,6 +227,8 @@ int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); +int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, + xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, @@ -240,6 +239,17 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +static inline void +xfs_bmap_add_free( + struct xfs_mount 
*mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, + xfs_filblks_t len, + struct xfs_owner_info *oinfo) +{ + __xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false); +} + enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, @@ -277,4 +287,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops, int flags); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d89d06bea6e3..e1a2d9ceb615 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -660,3 +648,12 @@ xfs_bmbt_change_owner( xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); return error; } + +/* Calculate the bmap btree size for some records. 
*/ +unsigned long long +xfs_bmbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_bmap_dmnr, len); +} diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index e4505746ccaa..29b407d053b4 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ @@ -118,4 +106,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ac7d66427e42..34c6d7bd4d18 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -234,7 +222,6 @@ xfs_btree_check_sptr( return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); } -#ifdef DEBUG /* * Check that a given (indexed) btree pointer at a certain level of a * btree is valid and doesn't point past where it should. @@ -247,17 +234,31 @@ xfs_btree_check_ptr( int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - xfs_btree_check_lptr(cur, - be64_to_cpu((&ptr->l)[index]), level)); + if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), + level)) + return 0; + xfs_err(cur->bc_mp, +"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", + cur->bc_private.b.ip->i_ino, + cur->bc_private.b.whichfork, cur->bc_btnum, + level, index); } else { - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, - xfs_btree_check_sptr(cur, - be32_to_cpu((&ptr->s)[index]), level)); + if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), + level)) + return 0; + xfs_err(cur->bc_mp, +"AG %u: Corrupt btree %d pointer at level %d index %d.", + cur->bc_private.a.agno, cur->bc_btnum, + level, index); } - return 0; + return -EFSCORRUPTED; } + +#ifdef DEBUG +# define xfs_btree_debug_check_ptr xfs_btree_check_ptr +#else +# define xfs_btree_debug_check_ptr(...) 
(0) #endif /* @@ -988,22 +989,30 @@ xfs_btree_readahead( return xfs_btree_readahead_sblock(cur, lr, block); } -STATIC xfs_daddr_t +STATIC int xfs_btree_ptr_to_daddr( struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) + union xfs_btree_ptr *ptr, + xfs_daddr_t *daddr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK)); + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + int error; - return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); - } else { - ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); - ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + error = xfs_btree_check_ptr(cur, ptr, 0, 1); + if (error) + return error; - return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(ptr->s)); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + fsbno = be64_to_cpu(ptr->l); + *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); + } else { + agbno = be32_to_cpu(ptr->s); + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + agbno); } + + return 0; } /* @@ -1018,8 +1027,11 @@ xfs_btree_readahead_ptr( union xfs_btree_ptr *ptr, xfs_extlen_t count) { - xfs_buf_readahead(cur->bc_mp->m_ddev_targp, - xfs_btree_ptr_to_daddr(cur, ptr), + xfs_daddr_t daddr; + + if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) + return; + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); } @@ -1282,11 +1294,14 @@ xfs_btree_get_buf_block( { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; + int error; /* need to sort out how callers deal with failures first */ ASSERT(!(flags & XBF_TRYLOCK)); - d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_btree_ptr_to_daddr(cur, ptr, &d); + if (error) + return error; *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); @@ -1317,7 +1332,9 @@ xfs_btree_read_buf_block( /* need to sort out how callers deal with failures first */ ASSERT(!(flags & XBF_TRYLOCK)); - d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_btree_ptr_to_daddr(cur, 
ptr, &d); + if (error) + return error; error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags, bpp, cur->bc_ops->buf_ops); @@ -1764,6 +1781,7 @@ xfs_btree_lookup_get_block( struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ + xfs_daddr_t daddr; int error = 0; /* special case the root block if in an inode */ @@ -1780,7 +1798,10 @@ xfs_btree_lookup_get_block( * Otherwise throw it away and get a new one. */ bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); + if (error) + return error; + if (bp && XFS_BUF_ADDR(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } @@ -1896,7 +1917,13 @@ xfs_btree_lookup( high = xfs_btree_get_numrecs(block); if (!high) { /* Block is empty, must be an empty leaf. */ - ASSERT(level == 0 && cur->bc_nlevels == 1); + if (level != 0 || cur->bc_nlevels != 1) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, + cur->bc_mp, block, + sizeof(*block)); + return -EFSCORRUPTED; + } cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; *stat = 0; @@ -1946,11 +1973,10 @@ xfs_btree_lookup( keyno = 1; pp = xfs_btree_ptr_addr(cur, keyno, block); -#ifdef DEBUG - error = xfs_btree_check_ptr(cur, pp, 0, level); + error = xfs_btree_debug_check_ptr(cur, pp, 0, level); if (error) goto error0; -#endif + cur->bc_ptrs[level] = keyno; } } @@ -2354,11 +2380,11 @@ xfs_btree_lshift( lpp = xfs_btree_ptr_addr(cur, lrecs, left); rpp = xfs_btree_ptr_addr(cur, 1, right); -#ifdef DEBUG - error = xfs_btree_check_ptr(cur, rpp, 0, level); + + error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); if (error) goto error0; -#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); xfs_btree_copy_ptrs(cur, lpp, rpp, 1); @@ -2393,15 +2419,14 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. 
operate on keys and ptrs */ -#ifdef DEBUG int i; /* loop index */ for (i = 0; i < rrecs; i++) { - error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) goto error0; } -#endif + xfs_btree_shift_keys(cur, xfs_btree_key_addr(cur, 2, right), -1, rrecs); @@ -2541,22 +2566,18 @@ xfs_btree_rshift( rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); -#ifdef DEBUG for (i = rrecs - 1; i >= 0; i--) { - error = xfs_btree_check_ptr(cur, rpp, i, level); + error = xfs_btree_debug_check_ptr(cur, rpp, i, level); if (error) goto error0; } -#endif xfs_btree_shift_keys(cur, rkp, 1, rrecs); xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); -#ifdef DEBUG - error = xfs_btree_check_ptr(cur, lpp, 0, level); + error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); if (error) goto error0; -#endif /* Now put the new data in, and log it. */ xfs_btree_copy_keys(cur, rkp, lkp, 1); @@ -2661,9 +2682,7 @@ __xfs_btree_split( int rrecs; int src_index; int error; /* error return value */ -#ifdef DEBUG int i; -#endif XFS_BTREE_STATS_INC(cur, split); @@ -2729,13 +2748,11 @@ __xfs_btree_split( rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); -#ifdef DEBUG for (i = src_index; i < rrecs; i++) { - error = xfs_btree_check_ptr(cur, lpp, i, level); + error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } -#endif /* Copy the keys & pointers to the new block. 
*/ xfs_btree_copy_keys(cur, rkp, lkp, rrecs); @@ -2923,9 +2940,7 @@ xfs_btree_new_iroot( union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ -#ifdef DEBUG int i; /* loop counter */ -#endif XFS_BTREE_STATS_INC(cur, newroot); @@ -2972,20 +2987,18 @@ xfs_btree_new_iroot( xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); cpp = xfs_btree_ptr_addr(cur, 1, cblock); -#ifdef DEBUG for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - error = xfs_btree_check_ptr(cur, pp, i, level); + error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) goto error0; } -#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); -#ifdef DEBUG - error = xfs_btree_check_ptr(cur, &nptr, 0, level); + error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); if (error) goto error0; -#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); xfs_iroot_realloc(cur->bc_private.b.ip, @@ -3229,9 +3242,7 @@ xfs_btree_insrec( int ptr; /* key/record index */ int numrecs;/* number of records */ int error; /* error return value */ -#ifdef DEBUG int i; -#endif xfs_daddr_t old_bn; ncur = NULL; @@ -3321,22 +3332,18 @@ xfs_btree_insrec( kp = xfs_btree_key_addr(cur, ptr, block); pp = xfs_btree_ptr_addr(cur, ptr, block); -#ifdef DEBUG for (i = numrecs - ptr; i >= 0; i--) { - error = xfs_btree_check_ptr(cur, pp, i, level); + error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) return error; } -#endif xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); -#ifdef DEBUG - error = xfs_btree_check_ptr(cur, ptrp, 0, level); + error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); if (error) goto error0; -#endif /* Now put the new data in, bump numrecs and log it. 
*/ xfs_btree_copy_keys(cur, kp, key, 1); @@ -3524,8 +3531,8 @@ xfs_btree_kill_iroot( int error; #ifdef DEBUG union xfs_btree_ptr ptr; - int i; #endif + int i; ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(cur->bc_nlevels > 1); @@ -3581,13 +3588,13 @@ xfs_btree_kill_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); cpp = xfs_btree_ptr_addr(cur, 1, cblock); -#ifdef DEBUG + for (i = 0; i < numrecs; i++) { - error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); if (error) return error; } -#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); error = xfs_btree_free_block(cur, cbp); @@ -3721,13 +3728,11 @@ xfs_btree_delrec( lkp = xfs_btree_key_addr(cur, ptr + 1, block); lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); -#ifdef DEBUG for (i = 0; i < numrecs - ptr; i++) { - error = xfs_btree_check_ptr(cur, lpp, i, level); + error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } -#endif if (ptr < numrecs) { xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); @@ -4060,13 +4065,13 @@ xfs_btree_delrec( lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); -#ifdef DEBUG + for (i = 1; i < rrecs; i++) { - error = xfs_btree_check_ptr(cur, rpp, i, level); + error = xfs_btree_debug_check_ptr(cur, rpp, i, level); if (error) goto error0; } -#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); @@ -4836,14 +4841,14 @@ xfs_btree_query_all( * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. 
*/ -xfs_extlen_t +unsigned long long xfs_btree_calc_size( uint *limits, unsigned long long len) { int level; int maxrecs; - xfs_extlen_t rval; + unsigned long long rval; maxrecs = limits[0]; for (level = 0, rval = 0; len > 1; level++) { @@ -4919,3 +4924,24 @@ xfs_btree_has_record( *exists = false; return error; } + +/* Are there more records in this btree? */ +bool +xfs_btree_has_more_records( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + + /* There are still records in this block. */ + if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block)) + return true; + + /* There are more record blocks. */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); + else + return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9227159a751e..0a4fdf7f11a7 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BTREE_H__ #define __XFS_BTREE_H__ @@ -482,7 +470,7 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); -xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); +unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -528,5 +516,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); +bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index ea187b4a7991..8a301402bbc4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -305,9 +293,11 @@ xfs_da3_node_read( type = XFS_BLFT_DIR_LEAFN_BUF; break; default: - type = 0; - ASSERT(0); - break; + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + tp->t_mountp, info, sizeof(*info)); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; } xfs_trans_buf_set_type(tp, *bpp, type); } @@ -2091,7 +2081,7 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = MIN(XFS_BMAP_MAX_NMAP, count); + nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ae6de17467f2..28260073ae71 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 6d77d1a8498a..b39053dcb643 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 7e77299b7789..5d5bf3bffc78 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DA_FORMAT_H__ #define __XFS_DA_FORMAT_H__ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 087fea02c389..c3e5bffda4f5 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -220,7 +206,7 @@ xfs_defer_trans_abort( { struct xfs_defer_pending *dfp; - trace_xfs_defer_trans_abort(tp->t_mountp, dop); + trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_); /* Abort intent items that don't have a done item. 
*/ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { @@ -253,7 +239,7 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); - trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_); /* Roll the transaction. */ error = xfs_trans_roll(tp); @@ -352,10 +338,21 @@ xfs_defer_finish( void *state; int error = 0; void (*cleanup_fn)(struct xfs_trans *, void *, int); + struct xfs_defer_ops *orig_dop; ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - trace_xfs_defer_finish((*tp)->t_mountp, dop); + trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_); + + /* + * Attach dfops to the transaction during deferred ops processing. This + * explicitly causes calls into the allocator to defer AGFL block frees. + * Note that this code can go away once all dfops users attach to the + * associated tp. + */ + ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop)); + orig_dop = (*tp)->t_agfl_dfops; + (*tp)->t_agfl_dfops = dop; /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { @@ -428,10 +425,11 @@ xfs_defer_finish( } out: + (*tp)->t_agfl_dfops = orig_dop; if (error) trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); else - trace_xfs_defer_finish_done((*tp)->t_mountp, dop); + trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_); return error; } @@ -447,7 +445,7 @@ xfs_defer_cancel( struct list_head *pwi; struct list_head *n; - trace_xfs_defer_cancel(NULL, dop); + trace_xfs_defer_cancel(NULL, dop, _RET_IP_); /* * Free the pending items. 
Caller should already have arranged @@ -532,5 +530,5 @@ xfs_defer_init( *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); - trace_xfs_defer_init(NULL, dop); + trace_xfs_defer_init(NULL, dop, _RET_IP_); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 045beacdd37d..a02b2b748b6d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ @@ -55,6 +41,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, + XFS_DEFER_OPS_TYPE_AGFL_FREE, XFS_DEFER_OPS_TYPE_MAX, }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 92f94e190f04..59169aff30fe 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 989e95a53db2..ed385316c7dc 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 875893ded514..30ed5919da72 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 
* Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -514,8 +502,8 @@ xfs_dir2_block_addname( if (mid - lowstale) memmove(&blp[lowstale], &blp[lowstale + 1], (mid - lowstale) * sizeof(*blp)); - lfloglow = MIN(lowstale, lfloglow); - lfloghigh = MAX(mid, lfloghigh); + lfloglow = min(lowstale, lfloglow); + lfloghigh = max(mid, lfloghigh); } /* * Move entries toward the high-numbered stale entry. @@ -526,8 +514,8 @@ xfs_dir2_block_addname( if (highstale - mid) memmove(&blp[mid + 1], &blp[mid], (highstale - mid) * sizeof(*blp)); - lfloglow = MIN(mid, lfloglow); - lfloghigh = MAX(highstale, lfloghigh); + lfloglow = min(mid, lfloglow); + lfloghigh = max(highstale, lfloghigh); } be32_add_cpu(&btp->stale, -1); } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index cb67ec730b9b..01162c62ec8f 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -33,6 +21,11 @@ #include "xfs_cksum.h" #include "xfs_log.h" +static xfs_failaddr_t xfs_dir2_data_freefind_verify( + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup, + struct xfs_dir2_data_free **bf_ent); + /* * Check the consistency of the data block. * The input can also be a block-format directory. @@ -147,6 +140,8 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + xfs_failaddr_t fa; + if (lastfree != 0) return __this_address; if (endp < p + be16_to_cpu(dup->length)) @@ -154,7 +149,9 @@ __xfs_dir3_data_check( if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) != (char *)dup - (char *)hdr) return __this_address; - dfp = xfs_dir2_data_freefind(hdr, bf, dup); + fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp); + if (fa) + return fa; if (dfp) { i = (int)(dfp - bf); if ((freeseen & (1 << i)) != 0) @@ -242,7 +239,8 @@ xfs_dir3_data_check( if (!fa) return; xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, - bp->b_addr, __FILE__, __LINE__, fa); + bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__, + fa); ASSERT(0); } #endif @@ -381,55 +379,79 @@ xfs_dir3_data_readahead( } /* - * Given a data block and an unused entry from that block, - * return the bestfree entry if any that corresponds to it. + * Find the bestfree entry that exactly coincides with unused directory space + * or a verifier error because the bestfree data are bad. 
*/ -xfs_dir2_data_free_t * -xfs_dir2_data_freefind( - struct xfs_dir2_data_hdr *hdr, /* data block header */ - struct xfs_dir2_data_free *bf, /* bestfree table pointer */ - struct xfs_dir2_data_unused *dup) /* unused space */ +static xfs_failaddr_t +xfs_dir2_data_freefind_verify( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup, + struct xfs_dir2_data_free **bf_ent) { - xfs_dir2_data_free_t *dfp; /* bestfree entry */ - xfs_dir2_data_aoff_t off; /* offset value needed */ -#ifdef DEBUG - int matched; /* matched the value */ - int seenzero; /* saw a 0 bestfree entry */ -#endif + struct xfs_dir2_data_free *dfp; + xfs_dir2_data_aoff_t off; + bool matched = false; + bool seenzero = false; + *bf_ent = NULL; off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); -#ifdef DEBUG /* * Validate some consistency in the bestfree table. * Check order, non-overlapping entries, and if we find the * one we're looking for it has to be exact. */ - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - for (dfp = &bf[0], seenzero = matched = 0; - dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { - ASSERT(!dfp->length); - seenzero = 1; + if (dfp->length) + return __this_address; + seenzero = true; continue; } - ASSERT(seenzero == 0); + if (seenzero) + return __this_address; if (be16_to_cpu(dfp->offset) == off) { - matched = 1; - ASSERT(dfp->length == dup->length); - } else if (off < be16_to_cpu(dfp->offset)) - ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset)); - else - ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); - ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &bf[0]) - ASSERT(be16_to_cpu(dfp[-1].length) >= 
be16_to_cpu(dfp[0].length)); + matched = true; + if (dfp->length != dup->length) + return __this_address; + } else if (be16_to_cpu(dfp->offset) > off) { + if (off + be16_to_cpu(dup->length) > + be16_to_cpu(dfp->offset)) + return __this_address; + } else { + if (be16_to_cpu(dfp->offset) + + be16_to_cpu(dfp->length) > off) + return __this_address; + } + if (!matched && + be16_to_cpu(dfp->length) < be16_to_cpu(dup->length)) + return __this_address; + if (dfp > &bf[0] && + be16_to_cpu(dfp[-1].length) < be16_to_cpu(dfp[0].length)) + return __this_address; } -#endif + + /* Looks ok so far; now try to match up with a bestfree entry. */ + *bf_ent = xfs_dir2_data_freefind(hdr, bf, dup); + return NULL; +} + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + /* * If this is smaller than the smallest bestfree entry, * it can't be there since they're sorted. @@ -1124,7 +1146,7 @@ xfs_dir2_data_use_free( return 0; corrupt: xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, - hdr, __FILE__, __LINE__, fa); + hdr, sizeof(*hdr), __FILE__, __LINE__, fa); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 50fc9c0c5e2b..1728a3e6f5cf 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -81,7 +69,8 @@ xfs_dir3_leaf_check( if (!fa) return; xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, - bp->b_addr, __FILE__, __LINE__, fa); + bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__, + fa); ASSERT(0); } #else @@ -601,8 +590,8 @@ xfs_dir3_leaf_find_entry( (index - lowstale - 1) * sizeof(xfs_dir2_leaf_entry_t)); } - *lfloglow = MIN(lowstale, *lfloglow); - *lfloghigh = MAX(index - 1, *lfloghigh); + *lfloglow = min(lowstale, *lfloglow); + *lfloghigh = max(index - 1, *lfloghigh); leafhdr->stale--; return &ents[index - 1]; } @@ -621,8 +610,8 @@ xfs_dir3_leaf_find_entry( memmove(&ents[index + 1], &ents[index], (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); } - *lfloglow = MIN(index, *lfloglow); - *lfloghigh = MAX(highstale, *lfloghigh); + *lfloglow = min(index, *lfloglow); + *lfloghigh = max(highstale, *lfloghigh); leafhdr->stale--; return &ents[index]; } @@ -872,7 +861,6 @@ xfs_dir2_leaf_addname( */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); - ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* * Mark the initial part of our freespace in use for the new entry. 
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 9df096cc3c37..2daf874969ab 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -84,7 +72,8 @@ xfs_dir3_leaf_check( if (!fa) return; xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, dp->i_mount, - bp->b_addr, __FILE__, __LINE__, fa); + bp->b_addr, BBTOB(bp->b_length), __FILE__, __LINE__, + fa); ASSERT(0); } #else diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 753aeeeffc18..59f9fb2241a5 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DIR2_PRIV_H__ #define __XFS_DIR2_PRIV_H__ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 0c75a7f00883..585dfdb7b6b6 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 8b7a6c3cb599..d293f371dd54 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -41,14 +29,18 @@ xfs_calc_dquots_per_chunk( /* * Do some primitive error checking on ondisk dquot data structures. + * + * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure; + * we verify them separately because at some points we have only the + * smaller xfs_disk_dquot structure available. */ + xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags) + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: @@ -70,6 +62,8 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; + if (type && ddq->d_flags != type) + return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && ddq->d_flags != XFS_DQ_GROUP) @@ -99,33 +93,44 @@ xfs_dquot_verify( return NULL; } +xfs_failaddr_t +xfs_dqblk_verify( + struct xfs_mount *mp, + struct xfs_dqblk *dqb, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ +{ + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); +} + /* * Do some primitive error checking on ondisk dquot data structures. 
*/ int -xfs_dquot_repair( +xfs_dqblk_repair( struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type) { - struct xfs_dqblk *d = (struct xfs_dqblk *)ddq; - - /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - memset(d, 0, sizeof(xfs_dqblk_t)); + memset(dqb, 0, sizeof(xfs_dqblk_t)); - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); + dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; + dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -135,7 +140,8 @@ xfs_dquot_repair( STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; int ndquots; @@ -156,10 +162,12 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF)) - return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) + XFS_DQUOT_CRC_OFF)) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSBADCRC, __func__, + d, sizeof(*d), __this_address); return false; + } } return true; } @@ -167,9 +175,10 @@ xfs_dquot_buf_verify_crc( STATIC xfs_failaddr_t xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_dqblk *dqb = bp->b_addr; xfs_failaddr_t fa; xfs_dqid_t id = 0; int ndquots; @@ -195,14 +204,19 @@ xfs_dquot_buf_verify( for (i = 0; i < 
ndquots; i++) { struct xfs_disk_dquot *ddq; - ddq = &d[i].dd_diskdq; + ddq = &dqb[i].dd_diskdq; if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0); - if (fa) + fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + if (fa) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSCORRUPTED, + __func__, &dqb[i], + sizeof(struct xfs_dqblk), fa); return fa; + } } return NULL; @@ -214,7 +228,7 @@ xfs_dquot_buf_verify_struct( { struct xfs_mount *mp = bp->b_target->bt_mount; - return xfs_dquot_buf_verify(mp, bp); + return xfs_dquot_buf_verify(mp, bp, false); } static void @@ -222,15 +236,10 @@ xfs_dquot_buf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_verifier_error(bp, -EFSBADCRC, __this_address); - else { - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); - } + if (!xfs_dquot_buf_verify_crc(mp, bp, false)) + return; + xfs_dquot_buf_verify(mp, bp, false); } /* @@ -245,8 +254,8 @@ xfs_dquot_buf_readahead_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify_crc(mp, bp) || - xfs_dquot_buf_verify(mp, bp) != NULL) { + if (!xfs_dquot_buf_verify_crc(mp, bp, true) || + xfs_dquot_buf_verify(mp, bp, true) != NULL) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; } @@ -262,11 +271,8 @@ xfs_dquot_buf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); + xfs_dquot_buf_verify(mp, bp, false); } const struct xfs_buf_ops xfs_dquot_buf_ops = { diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index bc1789d95152..b9974e7a8e6e 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (c) 2000-2002,2005 
Silicon Graphics, Inc. * Copyright (C) 2017 Oracle. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_ERRORTAG_H_ #define __XFS_ERRORTAG_H_ @@ -65,7 +52,8 @@ #define XFS_ERRTAG_LOG_BAD_CRC 29 #define XFS_ERRTAG_LOG_ITEM_PIN 30 #define XFS_ERRTAG_BUF_LRU_REF 31 -#define XFS_ERRTAG_MAX 32 +#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 +#define XFS_ERRTAG_MAX 33 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -102,5 +90,6 @@ #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 +#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 42956d8d95ed..059bc44c27e8 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_FORMAT_H__ #define __XFS_FORMAT_H__ @@ -98,6 +86,9 @@ struct xfs_ifork; XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) +/* Maximum size of the xfs filesystem label, no terminating NULL */ +#define XFSLABEL_MAX 12 + /* * Superblock - in core version. Must match the ondisk version below. * Must be padded to 64 bit alignment. @@ -122,7 +113,7 @@ typedef struct xfs_sb { uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ @@ -213,7 +204,7 @@ typedef struct xfs_dsb { __be16 sb_sectsize; /* volume sector size, bytes */ __be16 sb_inodesize; /* inode size, bytes */ __be16 sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ __u8 sb_blocklog; /* log2 of sb_blocksize */ __u8 sb_sectlog; /* log2 of sb_sectsize */ __u8 sb_inodelog; /* log2 of sb_inodesize */ @@ -971,6 +962,9 @@ typedef enum xfs_dinode_fmt { XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) +#define XFS_DFORK_MAXEXT(dip, mp, w) \ + (XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec)) + /* * Return pointers to the data or attribute forks. 
*/ @@ -1535,6 +1529,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 +#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) + typedef struct xfs_bmbt_rec { __be64 l0, l1; } xfs_bmbt_rec_t; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index faf1a4edd618..f3aa59302fef 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 1995-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_FS_H__ #define __XFS_FS_H__ @@ -542,13 +530,20 @@ struct xfs_scrub_metadata { /* o: Metadata object looked funny but isn't corrupt. */ #define XFS_SCRUB_OFLAG_WARNING (1 << 6) +/* + * o: IFLAG_REPAIR was set but metadata object did not need fixing or + * optimization and has therefore not been altered. 
+ */ +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) + #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ XFS_SCRUB_OFLAG_XCORRUPT | \ XFS_SCRUB_OFLAG_INCOMPLETE | \ - XFS_SCRUB_OFLAG_WARNING) + XFS_SCRUB_OFLAG_WARNING | \ + XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) /* diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de627fa19168..0d968e8143aa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -133,22 +121,51 @@ xfs_inobt_get_rec( struct xfs_inobt_rec_incore *irec, int *stat) { + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_private.a.agno; union xfs_btree_rec *rec; int error; + uint64_t realfree; error = xfs_btree_get_rec(cur, &rec, stat); if (error || *stat == 0) return error; - xfs_inobt_btrec_to_irec(cur->bc_mp, rec, irec); + xfs_inobt_btrec_to_irec(mp, rec, irec); + + if (!xfs_verify_agino(mp, agno, irec->ir_startino)) + goto out_bad_rec; + if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || + irec->ir_count > XFS_INODES_PER_CHUNK) + goto out_bad_rec; + if (irec->ir_freecount > XFS_INODES_PER_CHUNK) + goto out_bad_rec; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(irec->ir_holemask)) + realfree = irec->ir_free; + else + realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); + if (hweight64(realfree) != irec->ir_freecount) + goto out_bad_rec; return 0; + +out_bad_rec: + xfs_warn(mp, + "%s Inode BTree record corruption in AG %d detected!", + cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno); + xfs_warn(mp, +"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", + irec->ir_startino, irec->ir_count, irec->ir_freecount, + irec->ir_free, irec->ir_holemask); + return -EFSCORRUPTED; } /* * Insert a single inobt record. Cursor must already point to desired location. 
*/ -STATIC int +int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, uint16_t holemask, @@ -880,6 +897,7 @@ sparse_alloc: be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); pag->pagi_freecount += newlen; + pag->pagi_count += newlen; xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); @@ -1974,6 +1992,7 @@ xfs_difree_inobt( xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); pag = xfs_perag_get(mp, agno); pag->pagi_freecount -= ilen - 1; + pag->pagi_count -= ilen; xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); @@ -2477,26 +2496,13 @@ xfs_ialloc_log_agi( } } -#ifdef DEBUG -STATIC void -xfs_check_agi_unlinked( - struct xfs_agi *agi) -{ - int i; - - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) - ASSERT(agi->agi_unlinked[i]); -} -#else -#define xfs_check_agi_unlinked(agi) -#endif - static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int i; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2532,7 +2538,13 @@ xfs_agi_verify( if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) return __this_address; - xfs_check_agi_unlinked(agi); + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { + if (agi->agi_unlinked[i] == NULLAGINO) + continue; + if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i]))) + return __this_address; + } + return NULL; } @@ -2664,96 +2676,6 @@ xfs_ialloc_pagi_init( return 0; } -/* Calculate the first and last possible inode number in an AG. */ -void -xfs_ialloc_agino_range( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t *first, - xfs_agino_t *last) -{ - xfs_agblock_t bno; - xfs_agblock_t eoag; - - eoag = xfs_ag_block_count(mp, agno); - - /* - * Calculate the first inode, which will be in the first - * cluster-aligned block after the AGFL. 
- */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, - xfs_ialloc_cluster_alignment(mp)); - *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); - - /* - * Calculate the last inode, which will be at the end of the - * last (aligned) cluster that can be allocated in the AG. - */ - bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); - *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; -} - -/* - * Verify that an AG inode number pointer neither points outside the AG - * nor points at static metadata. - */ -bool -xfs_verify_agino( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino) -{ - xfs_agino_t first; - xfs_agino_t last; - - xfs_ialloc_agino_range(mp, agno, &first, &last); - return agino >= first && agino <= last; -} - -/* - * Verify that an FS inode number pointer neither points outside the - * filesystem nor points at static AG metadata. - */ -bool -xfs_verify_ino( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino); - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - - if (agno >= mp->m_sb.sb_agcount) - return false; - if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) - return false; - return xfs_verify_agino(mp, agno, agino); -} - -/* Is this an internal inode number? */ -bool -xfs_internal_inum( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && - xfs_is_quota_inode(&mp->m_sb, ino)); -} - -/* - * Verify that a directory entry's inode number doesn't point at an internal - * inode, empty space, or static AG metadata. - */ -bool -xfs_verify_dir_ino( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - if (xfs_internal_inum(mp, ino)) - return false; - return xfs_verify_ino(mp, ino); -} - /* Is there an inode record covering a given range of inode numbers? 
*/ int xfs_ialloc_has_inode_record( diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index c5402bb4ce0c..90b09c5f163b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IALLOC_H__ #define __XFS_IALLOC_H__ @@ -176,14 +164,10 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, bool *exists); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); +int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, + uint8_t count, int32_t freecount, xfs_inofree_t free, + int *stat); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); -void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t *first, xfs_agino_t *last); -bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); -bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 
367e9a0726e6..a5237afec5ab 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -296,7 +284,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_MAGIC): break; default: - return NULL; + return __this_address; } /* level verification */ @@ -608,3 +596,12 @@ xfs_finobt_calc_reserves( *used += tree_len; return 0; } + +/* Calculate the inobt btree size for some records. */ +xfs_extlen_t +xfs_iallocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_inobt_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index aa81e2e63f3f..bf8f0c405e7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IALLOC_BTREE_H__ #define __XFS_IALLOC_BTREE_H__ @@ -74,5 +62,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index b0f31791c7e6..b80c63faace2 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2017 Christoph Hellwig. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/cache.h> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 1201107eabc6..30d1d60f1d46 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -201,11 +189,6 @@ xfs_imap_to_bp( ASSERT(buf_flags & XBF_TRYLOCK); return error; } - - if (error == -EFSCORRUPTED && - (iget_flags & XFS_IGET_UNTRUSTED)) - return -EINVAL; - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", __func__, error); return error; @@ -391,12 +374,54 @@ xfs_log_dinode_to_disk( } } +static xfs_failaddr_t +xfs_dinode_verify_fork( + struct xfs_dinode *dip, + struct xfs_mount *mp, + int whichfork) +{ + uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + + switch (XFS_DFORK_FORMAT(dip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISREG(be16_to_cpu(dip->di_mode))) + return __this_address; + if (be64_to_cpu(dip->di_size) > + XFS_DFORK_SIZE(dip, mp, whichfork)) + return __this_address; + } + if (di_nextents) + return __this_address; + break; + case XFS_DINODE_FMT_EXTENTS: + if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) + return __this_address; + break; + case XFS_DINODE_FMT_BTREE: + if (whichfork == XFS_ATTR_FORK) { + if (di_nextents > MAXAEXTNUM) + return __this_address; + } else if (di_nextents > MAXEXTNUM) { + return __this_address; + } + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount 
*mp, xfs_ino_t ino, struct xfs_dinode *dip) { + xfs_failaddr_t fa; uint16_t mode; uint16_t flags; uint64_t flags2; @@ -457,24 +482,9 @@ xfs_dinode_verify( case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (S_ISREG(mode)) - return __this_address; - if (di_size > XFS_DFORK_DSIZE(dip, mp)) - return __this_address; - if (dip->di_nextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); + if (fa) + return fa; break; case 0: /* Uninitialized inode ok. */ @@ -484,17 +494,9 @@ xfs_dinode_verify( } if (XFS_DFORK_Q(dip)) { - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - if (dip->di_anextents) - return __this_address; - /* fall through */ - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - break; - default: - return __this_address; - } + fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); + if (fa) + return fa; } else { /* * If there is no fork offset, this may be a freshly-made inode @@ -513,6 +515,12 @@ xfs_dinode_verify( return __this_address; } + /* extent size hint validation */ + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) + return fa; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return NULL; @@ -521,7 +529,7 @@ xfs_dinode_verify( /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_sb_version_hasreflink(&mp->m_sb)) return __this_address; /* only regular files get reflink */ @@ -536,6 +544,12 @@ xfs_dinode_verify( if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) return __this_address; + /* COW extent size hint validation */ + fa = 
xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + return NULL; } @@ -717,7 +731,8 @@ xfs_inode_validate_extsize( if ((hint_flag || inherit_flag) && extsize == 0) return __this_address; - if (!(hint_flag || inherit_flag) && extsize != 0) + /* free inodes get flags set to zero but extsize remains */ + if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; if (extsize_bytes % blocksize_bytes) @@ -763,7 +778,8 @@ xfs_inode_validate_cowextsize( if (hint_flag && cowextsize == 0) return __this_address; - if (!hint_flag && cowextsize != 0) + /* free inodes get flags set to zero but cowextsize remains */ + if (mode && !hint_flag && cowextsize != 0) return __this_address; if (hint_flag && rt_flag) diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index d9a376a78ee2..ab0f84165317 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_INODE_BUF_H__ #define __XFS_INODE_BUF_H__ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 701c42a28d05..183ec0cb8921 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/log2.h> diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index dd8aba0dd119..781b1603df5e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_INODE_FORK_H__ #define __XFS_INODE_FORK_H__ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 349d9f8edb89..79bb79853c9f 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LOG_FORMAT_H__ #define __XFS_LOG_FORMAT_H__ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 66948a9fd486..f3d18eaecebb 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LOG_RECOVER_H__ #define __XFS_LOG_RECOVER_H__ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index cc4cbe290939..1b542ec11d5d 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Jie Liu. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index bb1b13a9b5f4..4bfdd5f4c6af 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_QUOTA_DEFS_H__ #define __XFS_QUOTA_DEFS_H__ @@ -107,14 +95,12 @@ typedef uint16_t xfs_qwarncnt_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ -#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -152,10 +138,11 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type, - uint flags); + struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); +extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); extern int 
xfs_calc_dquots_per_chunk(unsigned int nbblks); -extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, +extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 560e28473024..9dda6fd0bb13 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -88,8 +74,25 @@ xfs_refcount_lookup_ge( return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } +/* + * Look up the first record equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_eq( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + /* Convert on-disk record to in-core format. 
*/ -static inline void +void xfs_refcount_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) @@ -108,16 +111,53 @@ xfs_refcount_get_rec( struct xfs_refcount_irec *irec, int *stat) { + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_private.a.agno; union xfs_btree_rec *rec; int error; + xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - xfs_refcount_btrec_to_irec(rec, irec); - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, - irec); + if (error || !*stat) + return error; + + xfs_refcount_btrec_to_irec(rec, irec); + + agno = cur->bc_private.a.agno; + if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) + goto out_bad_rec; + + /* handle special COW-staging state */ + realstart = irec->rc_startblock; + if (realstart & XFS_REFC_COW_START) { + if (irec->rc_refcount != 1) + goto out_bad_rec; + realstart &= ~XFS_REFC_COW_START; + } else if (irec->rc_refcount < 2) { + goto out_bad_rec; } - return error; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbno(mp, agno, realstart)) + goto out_bad_rec; + if (realstart > realstart + irec->rc_blockcount) + goto out_bad_rec; + if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1)) + goto out_bad_rec; + + if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) + goto out_bad_rec; + + trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + return 0; + +out_bad_rec: + xfs_warn(mp, + "Refcount BTree record corruption in AG %d detected!", agno); + xfs_warn(mp, + "Start block 0x%x, block count 0x%x, references 0x%x", + irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); + return -EFSCORRUPTED; } /* @@ -149,7 +189,7 @@ xfs_refcount_update( * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. 
*/ -STATIC int +int xfs_refcount_insert( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, @@ -162,7 +202,10 @@ xfs_refcount_insert( cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; error = xfs_btree_insert(cur, i); + if (error) + goto out_error; XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 2a731ac68fe4..5fef74412727 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #ifndef __XFS_REFCOUNT_H__ #define __XFS_REFCOUNT_H__ @@ -24,6 +10,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -85,5 +73,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); +union xfs_btree_rec; +extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec); +extern int xfs_refcount_insert(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 375abfeb6267..b71937982c5b 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -192,7 +178,6 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); - ASSERT(agf->agf_refcount_root != 0); ptr->s = agf->agf_refcount_root; } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 2bc4694ef146..d2852b6e1fa8 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_REFCOUNT_BTREE_H__ #define __XFS_REFCOUNT_BTREE_H__ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fba8d2718017..d4460b0d2d81 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -39,6 +27,7 @@ #include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_inode.h" +#include "xfs_ialloc.h" /* * Lookup the first record less than or equal to [bno, len, owner, offset] @@ -203,6 +192,8 @@ xfs_rmap_get_rec( struct xfs_rmap_irec *irec, int *stat) { + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_private.a.agno; union xfs_btree_rec *rec; int error; @@ -210,7 +201,43 @@ xfs_rmap_get_rec( if (error || !*stat) return error; - return xfs_rmap_btrec_to_irec(rec, irec); + if (xfs_rmap_btrec_to_irec(rec, irec)) + goto out_bad_rec; + + if (irec->rm_blockcount == 0) + goto out_bad_rec; + if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) { + if (irec->rm_owner != XFS_RMAP_OWN_FS) + goto out_bad_rec; + if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) + goto out_bad_rec; + } else { + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbno(mp, agno, irec->rm_startblock)) + goto out_bad_rec; + if (irec->rm_startblock > + irec->rm_startblock + irec->rm_blockcount) + goto out_bad_rec; + if (!xfs_verify_agbno(mp, agno, + irec->rm_startblock + irec->rm_blockcount - 1)) + goto out_bad_rec; + } + + if (!(xfs_verify_ino(mp, irec->rm_owner) || + (irec->rm_owner <= XFS_RMAP_OWN_FS && + irec->rm_owner >= XFS_RMAP_OWN_MIN))) + goto out_bad_rec; + + 
return 0; +out_bad_rec: + xfs_warn(mp, + "Reverse Mapping BTree record corruption in AG %d detected!", + agno); + xfs_warn(mp, + "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", + irec->rm_owner, irec->rm_flags, irec->rm_startblock, + irec->rm_blockcount); + return -EFSCORRUPTED; } struct xfs_find_left_neighbor_info { @@ -1374,6 +1401,8 @@ xfs_rmap_convert_shared( */ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, &PREV, &i); + if (error) + goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); ASSERT(PREV.rm_offset <= offset); @@ -2030,6 +2059,34 @@ out_error: return error; } +/* Insert a raw rmap into the rmapbt. */ +int +xfs_rmap_map_raw( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rmap) +{ + struct xfs_owner_info oinfo; + + oinfo.oi_owner = rmap->rm_owner; + oinfo.oi_offset = rmap->rm_offset; + oinfo.oi_flags = 0; + if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return xfs_rmap_map(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); + + return xfs_rmap_map_shared(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -2453,3 +2510,56 @@ xfs_rmap_record_exists( irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; } + +struct xfs_rmap_key_state { + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool has_rmap; +}; + +/* For each rmap given, figure out if it doesn't match the key we want. 
*/ +STATIC int +xfs_rmap_has_other_keys_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_key_state *rks = priv; + + if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && + ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) + return 0; + rks->has_rmap = true; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Given an extent and some owner info, can we find records overlapping + * the extent whose owner info does not match the given owner? + */ +int +xfs_rmap_has_other_keys( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool *has_rmap) +{ + struct xfs_rmap_irec low = {0}; + struct xfs_rmap_irec high; + struct xfs_rmap_key_state rks; + int error; + + xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); + rks.has_rmap = false; + + low.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno + len - 1; + + error = xfs_rmap_query_range(cur, &low, &high, + xfs_rmap_has_other_keys_helper, &rks); + *has_rmap = rks.has_rmap; + return error; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 380e53be98d5..9f19454768b2 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_RMAP_H__ #define __XFS_RMAP_H__ @@ -238,5 +224,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, bool *has_rmap); +int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, struct xfs_owner_info *oinfo, + bool *has_rmap); +int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index d756e0b84abf..221a88ea60bb 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -234,7 +222,6 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); - ASSERT(agf->agf_roots[cur->bc_btnum] != 0); ptr->s = agf->agf_roots[cur->bc_btnum]; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index d68d96eed7ea..50198b6c3bb2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_RMAP_BTREE_H__ #define __XFS_RMAP_BTREE_H__ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 106be2d0bb88..b228c821bae6 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -90,6 +78,9 @@ xfs_rtbuf_get( if (error) return error; + if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + return -EFSCORRUPTED; + ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), @@ -1033,14 +1024,17 @@ xfs_rtalloc_query_range( int is_free; int error = 0; - if (low_rec->ar_startblock > high_rec->ar_startblock) + if (low_rec->ar_startext > high_rec->ar_startext) return -EINVAL; - else if (low_rec->ar_startblock == high_rec->ar_startblock) + if (low_rec->ar_startext >= mp->m_sb.sb_rextents || + low_rec->ar_startext == high_rec->ar_startext) return 0; + if (high_rec->ar_startext > mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents; /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startblock; - rem = high_rec->ar_startblock - rtstart; + rtstart = low_rec->ar_startext; + rem = high_rec->ar_startext - rtstart; while (rem) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, @@ -1050,13 +1044,13 @@ xfs_rtalloc_query_range( /* How long does the extent go for? 
*/ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startblock - 1, &rtend); + high_rec->ar_startext - 1, &rtend); if (error) break; if (is_free) { - rec.ar_startblock = rtstart; - rec.ar_blockcount = rtend - rtstart + 1; + rec.ar_startext = rtstart; + rec.ar_extcount = rtend - rtstart + 1; error = fn(tp, &rec, priv); if (error) @@ -1079,25 +1073,13 @@ xfs_rtalloc_query_all( { struct xfs_rtalloc_rec keys[2]; - keys[0].ar_startblock = 0; - keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; - keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + keys[0].ar_startext = 0; + keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[0].ar_extcount = keys[1].ar_extcount = 0; return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); } -/* - * Verify that an realtime block number pointer doesn't point off the - * end of the realtime device. - */ -bool -xfs_verify_rtbno( - struct xfs_mount *mp, - xfs_rtblock_t rtbno) -{ - return rtbno < mp->m_sb.sb_rblocks; -} - /* Is the given extent all free? */ int xfs_rtalloc_extent_is_free( diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d9b94bd5f689..350119eeaecb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -278,6 +266,22 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (sbp->sb_unit) { + if (!xfs_sb_version_hasdalign(sbp) || + sbp->sb_unit > sbp->sb_width || + (sbp->sb_width % sbp->sb_unit) != 0) { + xfs_notice(mp, "SB stripe unit sanity check failed"); + return -EFSCORRUPTED; + } + } else if (xfs_sb_version_hasdalign(sbp)) { + xfs_notice(mp, "SB stripe alignment sanity check failed"); + return -EFSCORRUPTED; + } else if (sbp->sb_width) { + xfs_notice(mp, "SB stripe width sanity check failed"); + return -EFSCORRUPTED; + } + + if (xfs_sb_version_hascrc(&mp->m_sb) && sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { xfs_notice(mp, "v5 SB sanity check failed"); @@ -767,7 +771,7 @@ xfs_sb_mount_common( mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((uint16_t)XFS_INODES_PER_CHUNK, + mp->m_ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; @@ -888,6 +892,111 @@ xfs_sync_sb( return xfs_trans_commit(tp); } +/* + * Update all the secondary superblocks to match the new state of the primary. + * Because we are completely overwriting all the existing fields in the + * secondary superblock buffers, there is no need to read them in from disk. + * Just get a new buffer, stamp it and write it. + * + * The sb buffers need to be cached here so that we serialise against other + * operations that access the secondary superblocks, but we don't want to keep + * them in memory once it is written so we mark it as a one-shot buffer. 
+ */ +int +xfs_update_secondary_sbs( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int saved_error = 0; + int error = 0; + LIST_HEAD (buffer_list); + + /* update secondary superblocks. */ + for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_buf *bp; + + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_FSS_TO_BB(mp, 1), 0); + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ + if (!bp) { + xfs_warn(mp, + "error allocating secondary superblock for ag %d", + agno); + if (!saved_error) + saved_error = -ENOMEM; + continue; + } + + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_buf_delwri_queue(bp, &buffer_list); + xfs_buf_relse(bp); + + /* don't hold too many buffers at once */ + if (agno % 16) + continue; + + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + if (!saved_error) + saved_error = error; + continue; + } + } + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + } + + return saved_error ? saved_error : error; +} + +/* + * Same behavior as xfs_sync_sb, except that it is always synchronous and it + * also writes the superblock buffer to disk sector 0 immediately. 
+ */ +int +xfs_sync_sb_buf( + struct xfs_mount *mp) +{ + struct xfs_trans *tp; + struct xfs_buf *bp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); + if (error) + return error; + + bp = xfs_trans_getsb(tp, mp, 0); + xfs_log_sb(tp); + xfs_trans_bhold(tp, bp); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + /* + * write out the sb buffer to get the changes to disk + */ + error = xfs_bwrite(bp); +out: + xfs_buf_relse(bp); + return error; +} + int xfs_fs_geometry( struct xfs_sb *sbp, @@ -972,3 +1081,47 @@ xfs_fs_geometry( return 0; } + +/* Read a secondary superblock. */ +int +xfs_sb_read_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + int error; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (error) + return error; + xfs_buf_set_ref(bp, XFS_SSB_REF); + *bpp = bp; + return 0; +} + +/* Get an uninitialised secondary superblock buffer. */ +int +xfs_sb_get_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) + return -ENOMEM; + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + *bpp = bp; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 63dcd2a1a657..13564d69800a 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -1,23 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SB_H__ #define __XFS_SB_H__ +struct xfs_mount; +struct xfs_sb; +struct xfs_dsb; +struct xfs_trans; +struct xfs_fsop_geom; +struct xfs_perag; + /* * perag get/put wrappers for ref counting */ @@ -29,13 +24,22 @@ extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern int xfs_sync_sb_buf(struct xfs_mount *mp); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern int xfs_update_secondary_sbs(struct xfs_mount *mp); + #define XFS_FS_GEOM_MAX_STRUCT_VER (4) extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, int struct_version); +extern int xfs_sb_read_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); +extern int xfs_sb_get_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index d0b84da0cb1e..22089f1c880a 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ 
b/fs/xfs/libxfs/xfs_shared.h @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SHARED_H__ #define __XFS_SHARED_H__ @@ -57,21 +45,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -127,6 +100,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 #define XFS_REFC_BTREE_REF 1 +#define XFS_SSB_REF 0 /* * Flags for xfs_trans_ichgtime(). 
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 5ef5f354587e..95374ab2dee7 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012-2013 Red Hat, Inc. * All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 3bccdf73e141..f99a7aefe418 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -248,7 +236,7 @@ xfs_calc_write_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + @@ -275,7 +263,7 @@ xfs_calc_itruncate_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + @@ -300,7 +288,7 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 4) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + @@ -340,7 +328,7 @@ xfs_calc_link_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_remove_reservation(mp) + - MAX((xfs_calc_inode_res(mp, 2) + + max((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + @@ -378,7 +366,7 @@ xfs_calc_remove_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_add_reservation(mp) + - MAX((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + @@ -436,7 +424,7 @@ STATIC uint xfs_calc_icreate_reservation(xfs_mount_t *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX(xfs_calc_icreate_resv_alloc(mp), + max(xfs_calc_icreate_resv_alloc(mp), xfs_calc_create_resv_modify(mp)); } @@ -644,7 +632,7 @@ STATIC uint 
xfs_calc_attrinval_reservation( struct xfs_mount *mp) { - return MAX((xfs_calc_inode_res(mp, 1) + + return max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + @@ -708,7 +696,7 @@ xfs_calc_attrrm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)) + (uint)XFS_FSB_TO_B(mp, diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index b7e5357d060a..7241ab28cf84 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_TRANS_RESV_H__ #define __XFS_TRANS_RESV_H__ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index d787c677d2a3..a62fb950bef1 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_TRANS_SPACE_H__ #define __XFS_TRANS_SPACE_H__ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c new file mode 100644 index 000000000000..2e2a243cef2e --- /dev/null +++ b/fs/xfs/libxfs/xfs_types.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (C) 2017 Oracle. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" + +/* Find the size of the AG, in blocks. */ +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + ASSERT(agno < mp->m_sb.sb_agcount); + + if (agno < mp->m_sb.sb_agcount - 1) + return mp->m_sb.sb_agblocks; + return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); +} + +/* + * Verify that an AG block number pointer neither points outside the AG + * nor points at static metadata. 
+ */ +bool +xfs_verify_agbno( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno) +{ + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + if (agbno >= eoag) + return false; + if (agbno <= XFS_AGFL_BLOCK(mp)) + return false; + return true; +} + +/* + * Verify that an FS block number pointer neither points outside the + * filesystem nor points at static AG metadata. + */ +bool +xfs_verify_fsbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno) +{ + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + + if (agno >= mp->m_sb.sb_agcount) + return false; + return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); +} + +/* Calculate the first and last possible inode number in an AG. */ +void +xfs_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + xfs_agblock_t eoag; + + eoag = xfs_ag_block_count(mp, agno); + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, + xfs_ialloc_cluster_alignment(mp)); + *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); + *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +bool +xfs_verify_agino( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + xfs_agino_t first; + xfs_agino_t last; + + xfs_agino_range(mp, agno, &first, &last); + return agino >= first && agino <= last; +} + +/* + * Verify that an FS inode number pointer neither points outside the + * filesystem nor points at static AG metadata. 
+ */ +bool +xfs_verify_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + + if (agno >= mp->m_sb.sb_agcount) + return false; + if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) + return false; + return xfs_verify_agino(mp, agno, agino); +} + +/* Is this an internal inode number? */ +bool +xfs_internal_inum( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino)); +} + +/* + * Verify that a directory entry's inode number doesn't point at an internal + * inode, empty space, or static AG metadata. + */ +bool +xfs_verify_dir_ino( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + if (xfs_internal_inum(mp, ino)) + return false; + return xfs_verify_ino(mp, ino); +} + +/* + * Verify that an realtime block number pointer doesn't point off the + * end of the realtime device. + */ +bool +xfs_verify_rtbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rtbno < mp->m_sb.sb_rblocks; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3c560695c546..4055d62f690c 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_TYPES_H__ #define __XFS_TYPES_H__ @@ -30,7 +18,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ -typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */ +typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ typedef int32_t xfs_tid_t; /* transaction identifier */ @@ -159,4 +147,23 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +/* + * Type verifier functions + */ +struct xfs_mount; + +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); + +void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); +bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); +bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h index e3c92d19e540..79155eec341b 100644 --- a/fs/xfs/mrlock.h +++ b/fs/xfs/mrlock.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SUPPORT_MRLOCK_H__ #define __XFS_SUPPORT_MRLOCK_H__ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 018aabbd9394..9bb0745f1ad2 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -38,68 +24,6 @@ #include "scrub/common.h" #include "scrub/trace.h" -/* - * Walk all the blocks in the AGFL. 
The fn function can return any negative - * error code or XFS_BTREE_QUERY_RANGE_ABORT. - */ -int -xfs_scrub_walk_agfl( - struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, - xfs_agblock_t bno, void *), - void *priv) -{ - struct xfs_agf *agf; - __be32 *agfl_bno; - struct xfs_mount *mp = sc->mp; - unsigned int flfirst; - unsigned int fllast; - int i; - int error; - - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp); - flfirst = be32_to_cpu(agf->agf_flfirst); - fllast = be32_to_cpu(agf->agf_fllast); - - /* Nothing to walk in an empty AGFL. */ - if (agf->agf_flcount == cpu_to_be32(0)) - return 0; - - /* first to last is a consecutive list. */ - if (fllast >= flfirst) { - for (i = flfirst; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; - } - - /* first to the end */ - for (i = flfirst; i < xfs_agfl_size(mp); i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - /* the start to last. */ - for (i = 0; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; -} - /* Superblock */ /* Cross-reference with the other btrees. */ @@ -157,9 +81,7 @@ xfs_scrub_superblock( if (agno == 0) return 0; - error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); /* * The superblock verifier can return several different error codes * if it thinks the superblock doesn't look right. 
For a mount these @@ -680,6 +602,7 @@ struct xfs_scrub_agfl_info { unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; + struct xfs_scrub_context *sc; }; /* Cross-reference with the other btrees. */ @@ -701,12 +624,12 @@ xfs_scrub_agfl_block_xref( /* Scrub an AGFL block. */ STATIC int xfs_scrub_agfl_block( - struct xfs_scrub_context *sc, + struct xfs_mount *mp, xfs_agblock_t agbno, void *priv) { - struct xfs_mount *mp = sc->mp; struct xfs_scrub_agfl_info *sai = priv; + struct xfs_scrub_context *sc = sai->sc; xfs_agnumber_t agno = sc->sa.agno; if (xfs_verify_agbno(mp, agno, agbno) && @@ -717,6 +640,9 @@ xfs_scrub_agfl_block( xfs_scrub_agfl_block_xref(sc, agbno, priv); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return XFS_BTREE_QUERY_RANGE_ABORT; + return 0; } @@ -796,8 +722,10 @@ xfs_scrub_agfl( goto out; } memset(&sai, 0, sizeof(sai)); + sai.sc = sc; sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); + sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, + KM_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; @@ -805,7 +733,12 @@ xfs_scrub_agfl( /* Check the blocks in the AGFL. 
*/ xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); - error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + goto out_free; + } if (error) goto out_free; @@ -934,7 +867,7 @@ xfs_scrub_agi( } /* Check inode counters */ - xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino); + xfs_agino_range(mp, agno, &first_agino, &last_agino); icount = be32_to_cpu(agi->agi_count); if (icount > last_agino - first_agino + 1 || icount < be32_to_cpu(agi->agi_freecount)) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c new file mode 100644 index 000000000000..117eedac53df --- /dev/null +++ b/fs/xfs/scrub/agheader_repair.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Superblock */ + +/* Repair the superblock. */ +int +xfs_repair_superblock( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_agnumber_t agno; + int error; + + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */ + agno = sc->sm->sm_agno; + if (agno == 0) + return -EOPNOTSUPP; + + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp); + if (error) + return error; + + /* Copy AG 0's superblock to this one. 
*/ + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 517c079d3f68..50e4f7fa06f0 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -70,7 +56,7 @@ xfs_scrub_allocbt_xref_other( pcur = &sc->sa.cnt_cur; else pcur = &sc->sa.bno_cur; - if (!*pcur) + if (!*pcur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); @@ -172,7 +158,7 @@ xfs_scrub_xref_is_used_space( bool is_freesp; int error; - if (!sc->sa.bno_cur) + if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 127575f0abfb..de51cf8a8516 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -126,8 +112,9 @@ xfs_scrub_xattr_listent( if (args.valuelen != valuelen) xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - fail_xref: + if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + context->seen_enough = 1; return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 639d14b51e90..3d08589f5c60 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -51,7 +37,6 @@ xfs_scrub_setup_inode_bmap( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -75,7 +60,7 @@ xfs_scrub_setup_inode_bmap( } /* Got the inode, lock it and we're ready to go. 
*/ - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -175,7 +160,7 @@ xfs_scrub_bmap_xref_rmap( unsigned long long rmap_end; uint64_t owner; - if (!info->sc->sa.rmap_cur) + if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm)) return; if (info->whichfork == XFS_COW_FORK) @@ -684,7 +669,8 @@ xfs_scrub_bmap( info.lastoff = 0; ifp = XFS_IFORK_PTR(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { - if (xfs_scrub_should_terminate(sc, &error)) + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; if (isnullstartblock(irec.br_startblock)) continue; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 54218168c8f9..5b472045f036 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -442,7 +428,7 @@ xfs_scrub_btree_check_owner( */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!co) return -ENOMEM; co->level = level; @@ -455,6 +441,44 @@ xfs_scrub_btree_check_owner( } /* + * Check that this btree block has at least minrecs records or is one of the + * special blocks that don't require that. + */ +STATIC void +xfs_scrub_btree_check_minrecs( + struct xfs_scrub_btree *bs, + int level, + struct xfs_btree_block *block) +{ + unsigned int numrecs; + int ok_level; + + numrecs = be16_to_cpu(block->bb_numrecs); + + /* More records than minrecs means the block is ok. */ + if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level)) + return; + + /* + * Certain btree blocks /can/ have fewer than minrecs records. Any + * level greater than or equal to the level of the highest dedicated + * btree block are allowed to violate this constraint. + * + * For a btree rooted in a block, the btree root can have fewer than + * minrecs records. If the btree is rooted in an inode and does not + * store records in the root, the direct children of the root and the + * root itself can have fewer than minrecs records. + */ + ok_level = bs->cur->bc_nlevels - 1; + if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + ok_level--; + if (level >= ok_level) + return; + + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -491,6 +515,8 @@ xfs_scrub_btree_get_block( if (*pbp) xfs_scrub_buffer_recheck(bs->sc, *pbp); + xfs_scrub_btree_check_minrecs(bs, level, *pblock); + /* * Check the block's owner; this function absorbs error codes * for us. 
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index e2b868ede70b..956627500f2c 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_SCRUB_BTREE_H__ #define __XFS_SCRUB_BTREE_H__ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8ed91d5c868d..70e70c69f83f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -44,11 +30,14 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* Common code for the metadata scrubbers. */ @@ -539,6 +528,10 @@ xfs_scrub_ag_free( xfs_trans_brelse(sc->tp, sa->agi_bp); sa->agi_bp = NULL; } + if (sa->pag) { + xfs_perag_put(sa->pag); + sa->pag = NULL; + } sa->agno = NULLAGNUMBER; } @@ -566,15 +559,53 @@ xfs_scrub_ag_init( return xfs_scrub_ag_btcur_init(sc, sa); } +/* + * Grab the per-ag structure if we haven't already gotten it. Teardown of the + * xfs_scrub_ag will release it for us. + */ +void +xfs_scrub_perag_get( + struct xfs_mount *mp, + struct xfs_scrub_ag *sa) +{ + if (!sa->pag) + sa->pag = xfs_perag_get(mp, sa->agno); +} + /* Per-scrubber setup functions */ +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. + * + * If we're going to repair something, we need to ask for the largest possible + * log reservation so that we can handle the worst case scenario for metadata + * updates while rebuilding a metadata item. We also need to reserve as many + * blocks in the head transaction as we think we're going to need to rebuild + * the metadata object. + */ +int +xfs_scrub_trans_alloc( + struct xfs_scrub_context *sc, + uint resblks) +{ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + resblks, 0, 0, &sc->tp); + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* Set us up with a transaction and an empty context. 
*/ int xfs_scrub_setup_fs( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp); + uint resblks; + + resblks = xfs_repair_calc_ag_resblks(sc); + return xfs_scrub_trans_alloc(sc, resblks); } /* Set us up with AG headers and btree cursors. */ @@ -695,7 +726,6 @@ xfs_scrub_setup_inode_contents( struct xfs_inode *ip, unsigned int resblks) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -705,7 +735,7 @@ xfs_scrub_setup_inode_contents( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, resblks); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -727,6 +757,10 @@ xfs_scrub_should_check_xref( int *error, struct xfs_btree_cur **curpp) { + /* No point in xref if we already know we're corrupt. */ + if (xfs_scrub_skip_xref(sc->sm)) + return false; + if (*error == 0) return true; @@ -773,3 +807,80 @@ xfs_scrub_buffer_recheck( sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; trace_xfs_scrub_block_error(sc, bp->b_bn, fa); } + +/* + * Scrub the attr/data forks of a metadata inode. The metadata inode must be + * pointed to by sc->ip and the ILOCK must be held. + */ +int +xfs_scrub_metadata_inode_forks( + struct xfs_scrub_context *sc) +{ + __u32 smtype; + bool shared; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Metadata inodes don't live on the rt device. */ + if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They should never participate in reflink. */ + if (xfs_is_reflink_inode(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They also should never have extended attributes. 
*/ + if (xfs_inode_hasattr(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* Invoke the data fork scrubber. */ + smtype = sc->sm->sm_type; + sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; + error = xfs_scrub_bmap_data(sc); + sc->sm->sm_type = smtype; + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Look for incorrect shared blocks. */ + if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &shared); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + if (shared) + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return error; +} + +/* + * Try to lock an inode in violation of the usual locking order rules. For + * example, trying to get the IOLOCK while in transaction context, or just + * plain breaking AG-order or inode-order inode locking rules. Either way, + * the only way to avoid an ABBA deadlock is to use trylock and back off if + * we can't. + */ +int +xfs_scrub_ilock_inverted( + struct xfs_inode *ip, + uint lock_mode) +{ + int i; + + for (i = 0; i < 20; i++) { + if (xfs_ilock_nowait(ip, lock_mode)) + return 0; + delay(1); + } + return -EDEADLOCK; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index deaf60400981..2172bd5361e2 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ @@ -38,19 +24,7 @@ xfs_scrub_should_terminate( return false; } -/* - * Grab an empty transaction so that we can re-grab locked buffers if - * one of our btrees turns out to be cyclic. - */ -static inline int -xfs_scrub_trans_alloc( - struct xfs_scrub_metadata *sm, - struct xfs_mount *mp, - struct xfs_trans **tpp) -{ - return xfs_trans_alloc_empty(mp, tpp); -} - +int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks); bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, @@ -135,16 +109,13 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip) void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_scrub_ag *sa); +void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa); int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_buf **agi, struct xfs_buf **agf, struct xfs_buf **agfl); void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa); int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); -int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, - void *), - void *priv); int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context 
*sc, struct xfs_btree_cur *cur, struct xfs_owner_info *oinfo, @@ -157,4 +128,17 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, struct xfs_inode *ip, unsigned int resblks); void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp); +/* + * Don't bother cross-referencing if we already found corruption or cross + * referencing discrepancies. + */ +static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT); +} + +int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc); +int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index bffdb7dc09bf..d700c4d4d4ef 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index d31468d68cef..365f9f0019e6 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_SCRUB_DABTREE_H__ #define __XFS_SCRUB_DABTREE_H__ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 38f29806eb54..86324775fc9b 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -172,7 +158,7 @@ xfs_scrub_dir_actor( error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) - goto fail_xref; + goto out; if (lookup_ino != ino) { xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); goto out; @@ -183,8 +169,13 @@ xfs_scrub_dir_actor( if (error) goto out; out: - return error; -fail_xref: + /* + * A negative error code returned here is supposed to cause the + * dir_emit caller (xfs_readdir) to abort the directory iteration + * and return zero to xfs_scrub_directory. + */ + if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -EFSCORRUPTED; return error; } @@ -240,6 +231,9 @@ xfs_scrub_dir_rec( } xfs_scrub_buffer_recheck(ds->sc, bp); + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_relse; + dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); /* Make sure we got a real directory entry. */ @@ -357,6 +351,9 @@ xfs_scrub_directory_data_bestfree( /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; + /* Do the bestfrees correspond to actual free space? */ bf = d_ops->data_bestfree_p(bp->b_addr); smallest_bestfree = UINT_MAX; @@ -413,14 +410,18 @@ xfs_scrub_directory_data_bestfree( /* Spot check this free entry */ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); - if (tag != ((char *)dup - (char *)bp->b_addr)) + if (tag != ((char *)dup - (char *)bp->b_addr)) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } /* * Either this entry is a bestfree or it's smaller than * any of the bestfrees. 
*/ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; /* Move on. */ newlen = be16_to_cpu(dup->length); @@ -546,6 +547,8 @@ xfs_scrub_directory_leaf1_bestfree( } if (leafhdr.stale != stale) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { @@ -556,9 +559,11 @@ xfs_scrub_directory_leaf1_bestfree( i * args->geo->fsbcount, -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; } out: return error; @@ -607,7 +612,7 @@ xfs_scrub_directory_free_bestfree( -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -656,7 +661,7 @@ xfs_scrub_directory_blocks( /* Iterate all the data extents in the directory... */ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* Block directories only have a single block at offset 0. */ if (is_block && (got.br_startoff > 0 || @@ -719,7 +724,7 @@ xfs_scrub_directory_blocks( /* Scan for free blocks */ lblk = free_lblk; found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* * Dirs can't have blocks mapped above 2^32. * Single-block dirs shouldn't even be here. diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 106ca4bd753f..13d43d108574 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. 
All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -387,7 +373,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks( int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || - (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur)) + (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) || + xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inobt blocks as the rmap says. */ @@ -424,7 +411,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inode blocks as the rmap knows about. */ @@ -496,7 +483,7 @@ xfs_scrub_xref_inode_check( bool has_inodes; int error; - if (!(*icur)) + if (!(*icur) || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index df14930e4fc5..7a6208505980 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -55,7 +41,6 @@ xfs_scrub_setup_inode( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; /* @@ -68,7 +53,7 @@ xfs_scrub_setup_inode( break; case -EFSCORRUPTED: case -EFSBADCRC: - return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + return xfs_scrub_trans_alloc(sc, 0); default: return error; } @@ -76,7 +61,7 @@ xfs_scrub_setup_inode( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -449,7 +434,7 @@ xfs_scrub_inode_xref_finobt( int has_record; int error; - if (!sc->sa.fino_cur) + if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm)) return; agino = XFS_INO_TO_AGINO(sc->mp, ino); @@ -492,6 +477,9 @@ xfs_scrub_inode_xref_bmap( xfs_filblks_t acount; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + /* Walk all the extents to check nextents/naextents/nblocks. 
*/ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, &nextents, &count); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1fb88c18d455..e2bda58c32f0 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -147,6 +133,9 @@ xfs_scrub_parent_validate( *try_again = false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == dnum) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -211,7 +200,9 @@ xfs_scrub_parent_validate( */ xfs_iunlock(sc->ip, sc->ilock_flags); sc->ilock_flags = 0; - xfs_ilock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED); + if (error) + goto out_rele; /* Go looking for our dentry. */ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); @@ -220,8 +211,10 @@ xfs_scrub_parent_validate( /* Drop the parent lock, relock this inode. 
*/ xfs_iunlock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); + if (error) + goto out_rele; sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); /* * If we're an unlinked directory, the parent /won't/ have a link @@ -323,5 +316,13 @@ xfs_scrub_parent( if (try_again && tries == 20) xfs_scrub_set_incomplete(sc); out: + /* + * If we failed to lock the parent inode even after a retry, just mark + * this scrub incomplete and return. + */ + if (sc->try_harder && error == -EDEADLOCK) { + error = 0; + xfs_scrub_set_incomplete(sc); + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 6ba465e6c885..6ff906aa0a3b 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -66,25 +52,43 @@ xfs_scrub_setup_quota( struct xfs_inode *ip) { uint dqtype; + int error; + + if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; dqtype = xfs_scrub_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; + sc->has_quotaofflock = true; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xfs_scrub_setup_fs(sc, ip); + if (error) + return error; + sc->ip = xfs_quota_inode(sc->mp, dqtype); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags = XFS_ILOCK_EXCL; return 0; } /* Quotas. */ +struct xfs_scrub_quota_info { + struct xfs_scrub_context *sc; + xfs_dqid_t last_id; +}; + /* Scrub the fields in an individual quota item. */ -STATIC void +STATIC int xfs_scrub_quota_item( - struct xfs_scrub_context *sc, - uint dqtype, struct xfs_dquot *dq, - xfs_dqid_t id) + uint dqtype, + void *priv) { + struct xfs_scrub_quota_info *sqi = priv; + struct xfs_scrub_context *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -99,17 +103,18 @@ xfs_scrub_quota_item( unsigned long long icount; unsigned long long rcount; xfs_ino_t fs_icount; - - offset = id / qi->qi_dqperchunk; + xfs_dqid_t id = be32_to_cpu(d->d_id); /* - * We fed $id and DQNEXT into the xfs_qm_dqget call, which means - * that the actual dquot we got must either have the same id or - * the next higher id. + * Except for the root dquot, the actual dquot we got must either have + * the same or higher id as we saw before. */ - if (id > be32_to_cpu(d->d_id)) + offset = id / qi->qi_dqperchunk; + if (id && id <= sqi->last_id) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + sqi->last_id = id; + /* Did we get the dquot type we wanted? 
*/ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); @@ -183,115 +188,85 @@ xfs_scrub_quota_item( xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (id != 0 && rhard != 0 && rcount > rhard) xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + return 0; } -/* Scrub all of a quota type's items. */ -int -xfs_scrub_quota( +/* Check the quota's data fork. */ +STATIC int +xfs_scrub_quota_data_fork( struct xfs_scrub_context *sc) { struct xfs_bmbt_irec irec = { 0 }; - struct xfs_mount *mp = sc->mp; - struct xfs_inode *ip; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *dq; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; xfs_fileoff_t max_dqid_off; - xfs_fileoff_t off = 0; - xfs_dqid_t id = 0; - uint dqtype; - int nimaps; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return -ENOENT; - - mutex_lock(&qi->qi_quotaofflock); - dqtype = xfs_scrub_quota_to_dqtype(sc); - if (!xfs_this_quota_on(sc->mp, dqtype)) { - error = -ENOENT; - goto out_unlock_quota; - } - - /* Attach to the quota inode and set sc->ip so that reporting works. */ - ip = xfs_quota_inode(sc->mp, dqtype); - sc->ip = ip; + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; - /* Look for problem extents. */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); - goto out_unlock_inode; - } + /* Check for data fork problems that apply only to quota files. 
*/ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - while (1) { + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { if (xfs_scrub_should_terminate(sc, &error)) break; - - off = irec.br_startoff + irec.br_blockcount; - nimaps = 1; - error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps, - XFS_BMAPI_ENTIRE); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off, - &error)) - goto out_unlock_inode; - if (!nimaps) - break; - if (irec.br_startblock == HOLESTARTBLOCK) - continue; - - /* Check the extent record doesn't point to crap. */ - if (irec.br_startblock + irec.br_blockcount <= - irec.br_startblock) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - if (!xfs_verify_fsbno(mp, irec.br_startblock) || - !xfs_verify_fsbno(mp, irec.br_startblock + - irec.br_blockcount - 1)) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - /* - * Unwritten extents or blocks mapped above the highest + * delalloc extents or blocks mapped above the highest * quota id shouldn't happen. */ if (isnullstartblock(irec.br_startblock) || irec.br_startoff > max_dqid_off || - irec.br_startoff + irec.br_blockcount > max_dqid_off + 1) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + break; + } } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; - /* Check all the quota items. */ - while (id < ((xfs_dqid_t)-1ULL)) { - if (xfs_scrub_should_terminate(sc, &error)) - break; + return error; +} - error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT, - &dq); - if (error == -ENOENT) - break; - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, - id * qi->qi_dqperchunk, &error)) - break; +/* Scrub all of a quota type's items. 
*/ +int +xfs_scrub_quota( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_quota_info sqi; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + uint dqtype; + int error = 0; - xfs_scrub_quota_item(sc, dqtype, dq, id); + dqtype = xfs_scrub_quota_to_dqtype(sc); - id = be32_to_cpu(dq->q_core.d_id) + 1; - xfs_qm_dqput(dq); - if (!id) - break; - } + /* Look for problem extents. */ + error = xfs_scrub_quota_data_fork(sc); + if (error) + goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* + * Check all the quota items. Now that we've checked the quota inode + * data fork we have to drop ILOCK_EXCL to use the regular dquot + * functions. + */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = 0; + sqi.sc = sc; + sqi.last_id = 0; + error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi); + sc->ilock_flags = XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, + sqi.last_id * qi->qi_dqperchunk, &error)) + goto out; out: - /* We set sc->ip earlier, so make sure we clear it now. */ - sc->ip = NULL; -out_unlock_quota: - mutex_unlock(&qi->qi_quotaofflock); return error; - -out_unlock_inode: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; } diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 400f1561cd3d..607a9faa8ecc 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -150,7 +136,7 @@ xfs_scrub_refcountbt_rmap_check( * so we don't need insertion sort here. */ frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -310,7 +296,7 @@ xfs_scrub_refcountbt_xref_rmap( struct xfs_scrub_refcnt_frag *n; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Cross-reference with the rmapbt to confirm the refcount. */ @@ -404,7 +390,7 @@ xfs_scrub_refcount_xref_rmap( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many refcbt blocks as the rmap knows about. */ @@ -460,7 +446,7 @@ xfs_scrub_xref_is_cow_staging( int has_refcount; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Find the CoW staging extent. */ @@ -504,7 +490,7 @@ xfs_scrub_xref_is_not_shared( bool shared; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c new file mode 100644 index 000000000000..326be4e8b71e --- /dev/null +++ b/fs/xfs/scrub/repair.c @@ -0,0 +1,1075 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_quota.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Attempt to repair some metadata, if the metadata is corrupt and userspace + * told us to fix it. This function returns -EAGAIN to mean "re-run scrub", + * and will set *fixed to true if it thinks it repaired anything. + */ +int +xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + int error = 0; + + trace_xfs_repair_attempt(ip, sc->sm, error); + + xfs_scrub_ag_btcur_free(&sc->sa); + + /* Repair whatever's broken. */ + ASSERT(sc->ops->repair); + error = sc->ops->repair(sc); + trace_xfs_repair_done(ip, sc->sm, error); + switch (error) { + case 0: + /* + * Repair succeeded. Commit the fixes and perform a second + * scrub so that we can tell userspace if we fixed the problem. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + *fixed = true; + return -EAGAIN; + case -EDEADLOCK: + case -EAGAIN: + /* Tell the caller to try again having grabbed all the locks. */ + if (!sc->try_harder) { + sc->try_harder = true; + return -EAGAIN; + } + /* + * We tried harder but still couldn't grab all the resources + * we needed to fix it. The corruption has not been fixed, + * so report back to userspace. 
+		 */
+		return -EFSCORRUPTED;
+	default:
+		return error;
+	}
+}
+
+/*
+ * Complain about unfixable problems in the filesystem.  We don't log
+ * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
+ * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
+ * administrator isn't running xfs_scrub in no-repairs mode.
+ *
+ * Use this helper function because _ratelimited silently declares a static
+ * structure to track rate limiting information.
+ */
+void
+xfs_repair_failure(
+	struct xfs_mount		*mp)
+{
+	xfs_alert_ratelimited(mp,
+"Corruption not fixed during online repair. Unmount and run xfs_repair.");
+}
+
+/*
+ * Repair probe -- userspace uses this to probe if we're willing to repair a
+ * given mountpoint.
+ */
+int
+xfs_repair_probe(
+	struct xfs_scrub_context	*sc)
+{
+	int				error = 0;
+
+	if (xfs_scrub_should_terminate(sc, &error))
+		return error;
+
+	return 0;
+}
+
+/*
+ * Roll a transaction, keeping the AG headers locked.
+ *
+ * Each AG header buffer (AGI, AGF, AGFL) attached to @sc->sa is held across
+ * the roll and then joined to the new transaction.  A header buffer that was
+ * never attached to the scrub context (NULL pointer) is skipped rather than
+ * handed to the transaction code.
+ *
+ * Returns 0 on success or the error from xfs_trans_roll(); on failure the
+ * holds taken here are dropped again before returning.
+ */
+int
+xfs_repair_roll_ag_trans(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	/* Keep the AG header buffers locked so we can keep going. */
+	if (sc->sa.agi_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+	/* Roll the transaction. */
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		goto out_release;
+
+	/* Join AG headers to the new transaction. */
+	if (sc->sa.agi_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+	return 0;
+
+out_release:
+	/*
+	 * Rolling failed, so release the hold on the buffers.  The
+	 * buffers will be released during teardown on our way out
+	 * of the kernel.
+	 */
+	if (sc->sa.agi_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+
+	return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree?
Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+	struct xfs_perag		*pag,
+	xfs_extlen_t			nr_blocks,
+	enum xfs_ag_resv_type		type)
+{
+	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair.  We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ *
+ * Returns 0 if userspace did not ask for repairs (IFLAG_REPAIR clear);
+ * otherwise returns the largest of the bnobt/cntbt, inobt/finobt, rmapbt and
+ * refcountbt size estimates.  Failure to read the AGI or AGF is not reported
+ * to the caller; the estimate simply falls back to worst-case inputs.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_perag		*pag;
+	struct xfs_buf			*bp;
+	unsigned long long		max_icount;
+	xfs_agino_t			icount = 0;
+	xfs_extlen_t			aglen = 0;
+	xfs_extlen_t			usedlen;
+	xfs_extlen_t			freelen;
+	xfs_extlen_t			bnobt_sz;
+	xfs_extlen_t			inobt_sz;
+	xfs_extlen_t			rmapbt_sz;
+	xfs_extlen_t			refcbt_sz;
+	int				error;
+
+	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return 0;
+
+	/* Use in-core counters if possible. */
+	pag = xfs_perag_get(mp, sm->sm_agno);
+	if (pag->pagi_init)
+		icount = pag->pagi_count;
+
+	/*
+	 * Otherwise try to get the actual counters from disk; if not, make
+	 * some worst case assumptions.
+	 */
+	if (icount == 0) {
+		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+		if (error) {
+			/*
+			 * sb_inopblock is the number of inodes per block, so
+			 * the most inodes an AG could possibly hold is
+			 * (blocks in the AG) * (inodes per block).  Do the
+			 * multiplication in 64 bits and clamp the result so
+			 * that it still fits in an xfs_agino_t.
+			 */
+			max_icount = (unsigned long long)mp->m_sb.sb_agblocks *
+					mp->m_sb.sb_inopblock;
+			icount = min_t(unsigned long long, max_icount,
+					(xfs_agino_t)-1);
+		} else {
+			icount = pag->pagi_count;
+			xfs_buf_relse(bp);
+		}
+	}
+
+	/* Now grab the block counters from the AGF. */
+	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+	if (error) {
+		/* Can't read the AGF; assume the AG is both full and empty. */
+		aglen = mp->m_sb.sb_agblocks;
+		freelen = aglen;
+		usedlen = aglen;
+	} else {
+		aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+		freelen = pag->pagf_freeblks;
+		usedlen = aglen - freelen;
+		xfs_buf_relse(bp);
+	}
+	xfs_perag_put(pag);
+
+	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+			freelen, usedlen);
+
+	/*
+	 * Figure out how many blocks we'd need worst case to rebuild
+	 * each type of btree.  Note that we can only rebuild the
+	 * bnobt/cntbt or inobt/finobt as pairs.
+	 */
+	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_HOLEMASK_BIT);
+	else
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_CHUNK);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		inobt_sz *= 2;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+	else
+		refcbt_sz = 0;
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		/*
+		 * Guess how many blocks we need to rebuild the rmapbt.
+		 * For non-reflink filesystems we can't have more records than
+		 * used blocks.  However, with reflink it's possible to have
+		 * more than one rmap record per AG block.  We don't know how
+		 * many rmaps there could be in the AG, so we start off with
+		 * what we hope is a generous over-estimation.
+		 */
+		if (xfs_sb_version_hasreflink(&mp->m_sb))
+			rmapbt_sz = xfs_rmapbt_calc_size(mp,
+					(unsigned long long)aglen * 2);
+		else
+			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+	} else {
+		rmapbt_sz = 0;
+	}
+
+	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+			inobt_sz, rmapbt_sz, refcbt_sz);
+
+	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
+
+/* Allocate a block in an AG.
*/ +int +xfs_repair_alloc_ag_block( + struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, + xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv) +{ + struct xfs_alloc_arg args = {0}; + xfs_agblock_t bno; + int error; + + switch (resv) { + case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: + error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + if (error) + return error; + if (bno == NULLAGBLOCK) + return -ENOSPC; + xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno, + 1, false); + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); + if (resv == XFS_AG_RESV_RMAPBT) + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno); + return 0; + default: + break; + } + + args.tp = sc->tp; + args.mp = sc->mp; + args.oinfo = *oinfo; + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_THIS_AG; + args.resv = resv; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + ASSERT(args.len == 1); + *fsbno = args.fsbno; + + return 0; +} + +/* Initialize a new AG btree root block with zero entries. 
*/
+/*
+ * @fsb is the block to initialize and must live in the AG being repaired
+ * (sc->sa.agno).  The block is zeroed, given an empty level-0 btree header of
+ * type @btnum, logged in its entirety to sc->tp, and has @ops attached as its
+ * buffer verifier.  On success the buffer is handed back through @bpp.
+ *
+ * Returns 0 on success or -ENOMEM if no buffer could be obtained.
+ */
+int
+xfs_repair_init_btblock(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsb,
+	struct xfs_buf			**bpp,
+	xfs_btnum_t			btnum,
+	const struct xfs_buf_ops	*ops)
+{
+	struct xfs_trans		*tp = sc->tp;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+
+	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+			XFS_FSB_TO_BB(mp, 1), 0);
+	if (!bp)
+		return -ENOMEM;
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+	/*
+	 * b_length is in basic blocks but the log range is an inclusive
+	 * byte range, so convert before logging the whole block.
+	 */
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
+	bp->b_ops = ops;
+	*bpp = bp;
+
+	return 0;
+}
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure.  For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (exlist); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist).  We then subtract sublist from exlist to
+ * derive the blocks that were used by the old btree.  These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection.  First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
+ * records are collected as exlist.  The bnobt records are collected as
+ * sublist.
As with the other btrees we subtract sublist from exlist, and the + * result (since the rmapbt lives in the free space) are the blocks from the + * old rmapbt. + */ + +/* Collect a dead btree extent for later disposal. */ +int +xfs_repair_collect_btree_extent( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + xfs_fsblock_t fsbno, + xfs_extlen_t len) +{ + struct xfs_repair_extent *rex; + + trace_xfs_repair_collect_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len); + + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL); + if (!rex) + return -ENOMEM; + + INIT_LIST_HEAD(&rex->list); + rex->fsbno = fsbno; + rex->len = len; + list_add_tail(&rex->list, &exlist->list); + + return 0; +} + +/* + * An error happened during the rebuild so the transaction will be cancelled. + * The fs will shut down, and the administrator has to unmount and run repair. + * Therefore, free all the memory associated with the list so we can die. + */ +void +xfs_repair_cancel_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + + for_each_xfs_repair_extent_safe(rex, n, exlist) { + list_del(&rex->list); + kmem_free(rex); + } +} + +/* Compare two btree extents. */ +static int +xfs_repair_btree_extent_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_repair_extent *ap; + struct xfs_repair_extent *bp; + + ap = container_of(a, struct xfs_repair_extent, list); + bp = container_of(b, struct xfs_repair_extent, list); + + if (ap->fsbno > bp->fsbno) + return 1; + if (ap->fsbno < bp->fsbno) + return -1; + return 0; +} + +/* + * Remove all the blocks mentioned in @sublist from the extents in @exlist. 
+ * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @exlist; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sublist. This routine subtracts all the extents + * mentioned in sublist from all the extents linked in @exlist, which leaves + * @exlist as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @exlist can be reaped. + */ +#define LEFT_ALIGNED (1 << 0) +#define RIGHT_ALIGNED (1 << 1) +int +xfs_repair_subtract_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist) +{ + struct list_head *lp; + struct xfs_repair_extent *ex; + struct xfs_repair_extent *newex; + struct xfs_repair_extent *subex; + xfs_fsblock_t sub_fsb; + xfs_extlen_t sub_len; + int state; + int error = 0; + + if (list_empty(&exlist->list) || list_empty(&sublist->list)) + return 0; + ASSERT(!list_empty(&sublist->list)); + + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp); + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp); + + /* + * Now that we've sorted both lists, we iterate exlist once, rolling + * forward through sublist and/or exlist as necessary until we find an + * overlap or reach the end of either list. We do not reset lp to the + * head of exlist nor do we reset subex to the head of sublist. The + * list traversal is similar to merge sort, but we're deleting + * instead. In this manner we avoid O(n^2) operations. + */ + subex = list_first_entry(&sublist->list, struct xfs_repair_extent, + list); + lp = exlist->list.next; + while (lp != &exlist->list) { + ex = list_entry(lp, struct xfs_repair_extent, list); + + /* + * Advance subex and/or ex until we find a pair that + * intersect or we run out of extents. 
+ */ + while (subex->fsbno + subex->len <= ex->fsbno) { + if (list_is_last(&subex->list, &sublist->list)) + goto out; + subex = list_next_entry(subex, list); + } + if (subex->fsbno >= ex->fsbno + ex->len) { + lp = lp->next; + continue; + } + + /* trim subex to fit the extent we have */ + sub_fsb = subex->fsbno; + sub_len = subex->len; + if (subex->fsbno < ex->fsbno) { + sub_len -= ex->fsbno - subex->fsbno; + sub_fsb = ex->fsbno; + } + if (sub_len > ex->len) + sub_len = ex->len; + + state = 0; + if (sub_fsb == ex->fsbno) + state |= LEFT_ALIGNED; + if (sub_fsb + sub_len == ex->fsbno + ex->len) + state |= RIGHT_ALIGNED; + switch (state) { + case LEFT_ALIGNED: + /* Coincides with only the left. */ + ex->fsbno += sub_len; + ex->len -= sub_len; + break; + case RIGHT_ALIGNED: + /* Coincides with only the right. */ + ex->len -= sub_len; + lp = lp->next; + break; + case LEFT_ALIGNED | RIGHT_ALIGNED: + /* Total overlap, just delete ex. */ + lp = lp->next; + list_del(&ex->list); + kmem_free(ex); + break; + case 0: + /* + * Deleting from the middle: add the new right extent + * and then shrink the left extent. + */ + newex = kmem_alloc(sizeof(struct xfs_repair_extent), + KM_MAYFAIL); + if (!newex) { + error = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&newex->list); + newex->fsbno = sub_fsb + sub_len; + newex->len = ex->fsbno + ex->len - newex->fsbno; + list_add(&newex->list, &ex->list); + ex->len = sub_fsb - ex->fsbno; + lp = lp->next; + break; + default: + ASSERT(0); + break; + } + } + +out: + return error; +} +#undef LEFT_ALIGNED +#undef RIGHT_ALIGNED + +/* + * Disposal of Blocks from Old per-AG Btrees + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. 
+ * Previously, we used the rmapbt to collect the extents (exlist) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from exlist. In theory the extents + * remaining in exlist are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @exlist, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. 
+ * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We also assume that the caller already invalidated any + * buffers associated with @exlist. + */ + +/* + * Invalidate buffers for per-AG btree blocks we're dumping. This function + * is not intended for use with file data repairs; we have bunmapi for that. + */ +int +xfs_repair_invalidate_blocks( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t i; + + /* + * For each block in each extent, see if there's an incore buffer for + * exactly that block; if so, invalidate it. The buffer cache only + * lets us look for one buffer at a time, so we have to look one block + * at a time. Avoid invalidating AG headers and post-EOFS blocks + * because we never own those; and if we can't TRYLOCK the buffer we + * assume it's owned by someone else. + */ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) { + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + continue; + bp = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); + if (bp) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } + } + } + + return 0; +} + +/* Ensure the freelist is the correct size. */ +int +xfs_repair_fix_freelist( + struct xfs_scrub_context *sc, + bool can_shrink) +{ + struct xfs_alloc_arg args = {0}; + + args.mp = sc->mp; + args.tp = sc->tp; + args.agno = sc->sa.agno; + args.alignment = 1; + args.pag = sc->sa.pag; + + return xfs_alloc_fix_freelist(&args, + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); +} + +/* + * Put a block back on the AGFL. 
+ */ +STATIC int +xfs_repair_put_freelist( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno) +{ + struct xfs_owner_info oinfo; + int error; + + /* Make sure there's space on the freelist. */ + error = xfs_repair_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, + &oinfo); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, + agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Dispose of a single metadata block. */ +STATIC int +xfs_repair_dispose_btree_block( + struct xfs_scrub_context *sc, + xfs_fsblock_t fsbno, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type resv) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + bool has_other_rmap; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + + /* + * If we are repairing per-inode metadata, we need to read in the AGF + * buffer. Otherwise, we're repairing a per-AG structure, so reuse + * the AGF buffer that the setup functions already grabbed. + */ + if (sc->ip) { + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + if (error) + return error; + if (!agf_bp) + return -ENOMEM; + } else { + agf_bp = sc->sa.agf_bp; + } + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno); + + /* Can we find any other rmappings? 
*/ + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + if (error) + goto out_cur; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (has_other_rmap) + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo); + else if (resv == XFS_AG_RESV_AGFL) + error = xfs_repair_put_freelist(sc, agbno); + else + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + if (error) + return error; + + if (sc->ip) + return xfs_trans_roll_inode(&sc->tp, sc->ip); + return xfs_repair_roll_ag_trans(sc); + +out_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + return error; +} + +/* Dispose of btree blocks from an old per-AG btree. */ +int +xfs_repair_reap_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + int error = 0; + + ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); + + /* Dispose of every block from the old btree. 
*/ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno); + + trace_xfs_repair_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno), + XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len); + + for (; rex->len > 0; rex->len--, rex->fsbno++) { + error = xfs_repair_dispose_btree_block(sc, rex->fsbno, + oinfo, type); + if (error) + goto out; + } + list_del(&rex->list); + kmem_free(rex); + } + +out: + xfs_repair_cancel_btree_extents(sc, exlist); + return error; +} + +/* + * Finding per-AG Btree Roots for AGF/AGI Reconstruction + * + * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild + * the AG headers by using the rmap data to rummage through the AG looking for + * btree roots. This is not guaranteed to work if the AG is heavily damaged + * or the rmap data are corrupt. + * + * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL + * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the + * AGI is being rebuilt. It must maintain these locks until it's safe for + * other threads to change the btrees' shapes. The caller provides + * information about the btrees to look for by passing in an array of + * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set. + * The (root, height) fields will be set on return if anything is found. The + * last element of the array should have a NULL buf_ops to mark the end of the + * array. + * + * For every rmapbt record matching any of the rmap owners in btree_info, + * read each block referenced by the rmap record. If the block is a btree + * block from this filesystem matching any of the magic numbers and has a + * level higher than what we've already seen, remember the block and the + * height of the tree required to have such a block. When the call completes, + * we return the highest block we've found for each btree description; those + * should be the roots. 
+ */
+
+/* State shared between the rmapbt walk and the per-block root check. */
+struct xfs_repair_findroot {
+	struct xfs_scrub_context	*sc;
+	struct xfs_buf			*agfl_bp;
+	struct xfs_agf			*agf;
+	struct xfs_repair_find_ag_btree	*btree_info;
+};
+
+/* See if our block is in the AGFL. */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+	struct xfs_mount		*mp,
+	xfs_agblock_t			bno,
+	void				*priv)
+{
+	xfs_agblock_t			*agbno = priv;
+
+	return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+}
+
+/*
+ * Does this block match the btree information passed in?
+ *
+ * Read the block at @agbno and, if it looks like a block of the btree
+ * described by @fab that sits at least as high as the tallest one recorded so
+ * far, record it in @fab (root, height) and set *@found_it.  *@found_it is
+ * left untouched otherwise.  Returns 0 unless the AGFL walk or the buffer
+ * read fails.
+ */
+STATIC int
+xfs_repair_findroot_block(
+	struct xfs_repair_findroot	*ri,
+	struct xfs_repair_find_ag_btree	*fab,
+	uint64_t			owner,
+	xfs_agblock_t			agbno,
+	bool				*found_it)
+{
+	struct xfs_mount		*mp = ri->sc->mp;
+	struct xfs_buf			*bp;
+	struct xfs_btree_block		*btblock;
+	xfs_daddr_t			daddr;
+	int				error;
+
+	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+	/*
+	 * Blocks in the AGFL have stale contents that might just happen to
+	 * have a matching magic and uuid.  We don't want to pull these blocks
+	 * in as part of a tree root, so we have to filter out the AGFL stuff
+	 * here.  If the AGFL looks insane we'll just refuse to repair.
+	 */
+	if (owner == XFS_RMAP_OWN_AG) {
+		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+				xfs_repair_findroot_agfl_walk, &agbno);
+		if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+			return 0;
+		if (error)
+			return error;
+	}
+
+	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+			mp->m_bsize, 0, &bp, NULL);
+	if (error)
+		return error;
+
+	/* Does this look like a block matching our fs? */
+	btblock = XFS_BUF_TO_BLOCK(bp);
+	if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+		goto out;
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+		goto out;
+
+	/*
+	 * The buffer was read without a verifier, so b_ops is only set if
+	 * somebody else already attached one.  If it belongs to a different
+	 * btree type, this block cannot be part of the tree we're looking
+	 * for, and we must not overwrite the other owner's verifier.  If no
+	 * verifier is attached, attach ours (so the AIL won't complain if
+	 * the buffer is also dirty) and make sure the block passes it; on
+	 * failure, put the buffer back the way we found it instead of
+	 * releasing it with a stale error and the wrong verifier attached.
+	 */
+	if (bp->b_ops) {
+		if (bp->b_ops != fab->buf_ops)
+			goto out;
+	} else {
+		bp->b_ops = fab->buf_ops;
+		fab->buf_ops->verify_read(bp);
+		if (bp->b_error) {
+			bp->b_ops = NULL;
+			bp->b_error = 0;
+			goto out;
+		}
+	}
+
+	/* Ignore this block if it's lower in the tree than we've seen. */
+	if (fab->root != NULLAGBLOCK &&
+	    xfs_btree_get_level(btblock) < fab->height)
+		goto out;
+
+	fab->root = agbno;
+	fab->height = xfs_btree_get_level(btblock) + 1;
+	*found_it = true;
+
+	trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+			be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+	xfs_trans_brelse(ri->sc->tp, bp);
+	return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_repair_findroot	*ri = priv;
+	struct xfs_repair_find_ag_btree	*fab;
+	xfs_agblock_t			b;
+	bool				found_it;
+	int				error = 0;
+
+	/* Ignore anything that isn't AG metadata. */
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+		return 0;
+
+	/* Otherwise scan each block + btree type. */
+	for (b = 0; b < rec->rm_blockcount; b++) {
+		found_it = false;
+		for (fab = ri->btree_info; fab->buf_ops; fab++) {
+			if (rec->rm_owner != fab->rmap_owner)
+				continue;
+			error = xfs_repair_findroot_block(ri, fab,
+					rec->rm_owner, rec->rm_startblock + b,
+					&found_it);
+			if (error)
+				return error;
+			if (found_it)
+				break;
+		}
+	}
+
+	return 0;
+}
+
+/* Find the roots of the per-AG btrees described in btree_info.
*/ +int +xfs_repair_find_ag_btree_roots( + struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_repair_findroot ri; + struct xfs_repair_find_ag_btree *fab; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_buf_islocked(agf_bp)); + ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp)); + + ri.sc = sc; + ri.btree_info = btree_info; + ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agfl_bp = agfl_bp; + for (fab = btree_info; fab->buf_ops; fab++) { + ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); + ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner)); + fab->root = NULLAGBLOCK; + fab->height = 0; + } + + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + return error; +} + +/* Force a quotacheck the next time we mount. */ +void +xfs_repair_force_quotacheck( + struct xfs_scrub_context *sc, + uint dqtype) +{ + uint flag; + + flag = xfs_quota_chkd_flag(dqtype); + if (!(flag & sc->mp->m_qflags)) + return; + + sc->mp->m_qflags &= ~flag; + spin_lock(&sc->mp->m_sb_lock); + sc->mp->m_sb.sb_qflags &= ~flag; + spin_unlock(&sc->mp->m_sb_lock); + xfs_log_sb(sc->tp); +} + +/* + * Attach dquots to this inode, or schedule quotacheck to fix them. + * + * This function ensures that the appropriate dquots are attached to an inode. + * We cannot allow the dquot code to allocate an on-disk dquot block here + * because we're already in transaction context with the inode locked. The + * on-disk dquot should already exist anyway. If the quota code signals + * corruption or missing quota information, schedule quotacheck, which will + * repair corruptions in the quota metadata. 
+ */ +int +xfs_repair_ino_dqattach( + struct xfs_scrub_context *sc) +{ + int error; + + error = xfs_qm_dqattach_locked(sc->ip, false); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + case -ENOENT: + xfs_err_ratelimited(sc->mp, +"inode %llu repair encountered quota error %d, quotacheck forced.", + (unsigned long long)sc->ip->i_ino, error); + if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_USER); + if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ); + /* fall through */ + case -ESRCH: + error = 0; + break; + default: + break; + } + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h new file mode 100644 index 000000000000..ef47826b6725 --- /dev/null +++ b/fs/xfs/scrub/repair.h @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_SCRUB_REPAIR_H__ +#define __XFS_SCRUB_REPAIR_H__ + +static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc) +{ + return -EOPNOTSUPP; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR + +/* Repair helpers */ + +int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc, + bool *fixed); +void xfs_repair_failure(struct xfs_mount *mp); +int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc); +bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type); +xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc); +int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv); +int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb, + struct xfs_buf **bpp, xfs_btnum_t btnum, + const struct xfs_buf_ops *ops); + +struct xfs_repair_extent { + struct list_head list; + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +struct xfs_repair_extent_list { + struct list_head list; +}; + +static inline void +xfs_repair_init_extent_list( + struct xfs_repair_extent_list *exlist) +{ + INIT_LIST_HEAD(&exlist->list); +} + +#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \ + list_for_each_entry_safe((rbe), (n), &(exlist)->list, list) +int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno, + xfs_extlen_t len); +void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_subtract_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist); +int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink); +int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_reap_btree_extents(struct 
xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +struct xfs_repair_find_ag_btree { + /* in: rmap owner of the btree we're looking for */ + uint64_t rmap_owner; + + /* in: buffer ops */ + const struct xfs_buf_ops *buf_ops; + + /* in: magic number of the btree */ + uint32_t magic; + + /* out: the highest btree block found and the tree height */ + xfs_agblock_t root; + unsigned int height; +}; + +int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp); +void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype); +int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc); + +/* Metadata repairers */ + +int xfs_repair_probe(struct xfs_scrub_context *sc); +int xfs_repair_superblock(struct xfs_scrub_context *sc); + +#else + +static inline int xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + return -EOPNOTSUPP; +} + +static inline void xfs_repair_failure(struct xfs_mount *mp) {} + +static inline xfs_extlen_t +xfs_repair_calc_ag_resblks( + struct xfs_scrub_context *sc) +{ + ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); + return 0; +} + +#define xfs_repair_probe xfs_repair_notsupported +#define xfs_repair_superblock xfs_repair_notsupported + +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_REPAIR_H__ */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8f2a7c3ff455..c6d763236ba7 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -66,7 +52,7 @@ xfs_scrub_rmapbt_xref_refc( bool is_unwritten; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); @@ -207,7 +193,7 @@ xfs_scrub_xref_check_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, @@ -250,7 +236,7 @@ xfs_scrub_xref_has_no_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 39c41dfe08ee..1f86e02a07ca 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -66,11 +52,15 @@ xfs_scrub_rtbitmap_rec( void *priv) { struct xfs_scrub_context *sc = priv; + xfs_rtblock_t startblock; + xfs_rtblock_t blockcount; - if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock) || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock + - rec->ar_blockcount - 1)) + startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + + if (startblock + blockcount <= startblock || + !xfs_verify_rtbno(sc->mp, startblock) || + !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -82,6 +72,11 @@ xfs_scrub_rtbitmap( { int error; + /* Invoke the fork scrubber. 
*/ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; @@ -95,8 +90,35 @@ int xfs_scrub_rtsummary( struct xfs_scrub_context *sc) { + struct xfs_inode *rsumip = sc->mp->m_rsumip; + struct xfs_inode *old_ip = sc->ip; + uint old_ilock_flags = sc->ilock_flags; + int error = 0; + + /* + * We ILOCK'd the rt bitmap ip in the setup routine, now lock the + * rt summary ip in compliance with the rt inode locking rules. + * + * Since we switch sc->ip to rsumip we have to save the old ilock + * flags so that we don't mix up the inode state that @sc tracks. + */ + sc->ip = rsumip; + sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out; + /* XXX: implement this some day */ - return -ENOENT; + xfs_scrub_set_incomplete(sc); +out: + /* Switch back to the rtbitmap inode and lock flags. 
*/ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = old_ilock_flags; + sc->ip = old_ip; + return error; } @@ -107,11 +129,23 @@ xfs_scrub_xref_is_used_rt_space( xfs_rtblock_t fsbno, xfs_extlen_t len) { + xfs_rtblock_t startext; + xfs_rtblock_t endext; + xfs_rtblock_t extcount; bool is_free; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + + startext = fsbno; + endext = fsbno + len - 1; + do_div(startext, sc->mp->m_sb.sb_rextsize); + if (do_div(endext, sc->mp->m_sb.sb_rextsize)) + endext++; + extcount = endext - startext; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len, + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); if (!xfs_scrub_should_check_xref(sc, &error, NULL)) goto out_unlock; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 26c75967a072..58ae76b3a421 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -42,11 +28,18 @@ #include "xfs_refcount_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* * Online Scrub and Repair @@ -120,6 +113,24 @@ * XCORRUPT flag; btree query function errors are noted by setting the * XFAIL flag and deleting the cursor to prevent further attempts to * cross-reference with a defective btree. + * + * If a piece of metadata proves corrupt or suboptimal, the userspace + * program can ask the kernel to apply some tender loving care (TLC) to + * the metadata object by setting the REPAIR flag and re-calling the + * scrub ioctl. "Corruption" is defined by metadata violating the + * on-disk specification; operations cannot continue if the violation is + * left untreated. It is possible for XFS to continue if an object is + * "suboptimal", however performance may be degraded. Repairs are + * usually performed by rebuilding the metadata entirely out of + * redundant metadata. Optimizing, on the other hand, can sometimes be + * done without rebuilding entire structures. + * + * Generally speaking, the repair code has the following code structure: + * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. + * The first check helps us figure out if we need to rebuild or simply + * optimize the structure so that the rebuild knows what to do. The + * second check evaluates the completeness of the repair; that is what + * is reported to userspace. 
*/ /* @@ -155,7 +166,10 @@ xfs_scrub_teardown( { xfs_scrub_ag_free(sc, &sc->sa); if (sc->tp) { - xfs_trans_cancel(sc->tp); + if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + error = xfs_trans_commit(sc->tp); + else + xfs_trans_cancel(sc->tp); sc->tp = NULL; } if (sc->ip) { @@ -166,6 +180,8 @@ xfs_scrub_teardown( iput(VFS_I(sc->ip)); sc->ip = NULL; } + if (sc->has_quotaofflock) + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -180,126 +196,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { .type = ST_NONE, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_probe, + .repair = xfs_repair_probe, }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_superblock, + .repair = xfs_repair_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agf, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agfl, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agi, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_bnobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_inobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_finobt, .has = xfs_sb_version_hasfinobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* 
rmapbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_rmapbt, .scrub = xfs_scrub_rmapbt, .has = xfs_sb_version_hasrmapbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_refcountbt, .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xfs_scrub_setup_inode, .scrub = xfs_scrub_inode, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_data, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_attr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_cow, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, .setup = xfs_scrub_setup_directory, .scrub = xfs_scrub_directory, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xfs_scrub_setup_xattr, .scrub = xfs_scrub_xattr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xfs_scrub_setup_symlink, .scrub = xfs_scrub_symlink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xfs_scrub_setup_parent, .scrub = xfs_scrub_parent, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtbitmap, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup 
= xfs_scrub_setup_rt, .scrub = xfs_scrub_rtsummary, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, }; @@ -379,15 +419,54 @@ xfs_scrub_validate_inputs( if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) goto out; - /* We don't know how to repair anything yet. */ - if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - goto out; + /* + * We only want to repair read-write v5+ filesystems. Defer the check + * for ops->repair until after our scrub confirms that we need to + * perform repairs so that we avoid failing due to not supporting + * repairing an object that doesn't need repairs. + */ + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = -EOPNOTSUPP; + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto out; + + error = -EROFS; + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + } error = 0; out: return error; } +#ifdef CONFIG_XFS_ONLINE_REPAIR +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to repair something, we repaired it, rescanned + * it, and the rescan says it's still broken. Scream about this in + * the system logs. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + xfs_repair_failure(sc->mp); +} +#else +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to scrub something, it's broken, and we have no + * way of fixing it. Scream in the logs. 
+ */ + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(sc->mp, + "Corruption detected during scrub."); +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + /* Dispatch metadata scrubbing. */ int xfs_scrub_metadata( @@ -397,6 +476,7 @@ xfs_scrub_metadata( struct xfs_scrub_context sc; struct xfs_mount *mp = ip->i_mount; bool try_harder = false; + bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -446,10 +526,44 @@ retry_op: } else if (error) goto out_teardown; - if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT)) - xfs_alert_ratelimited(mp, "Corruption detected during scrub."); + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + bool needs_fix; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN)); + /* + * If userspace asked for a repair but it wasn't necessary, + * report that back to userspace. + */ + if (!needs_fix) { + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; + goto out_nofix; + } + + /* + * If it's broken, userspace wants us to fix it, and we haven't + * already tried to fix it, then attempt a repair. + */ + error = xfs_repair_attempt(ip, &sc, &already_fixed); + if (error == -EAGAIN) { + if (sc.try_harder) + try_harder = true; + error = xfs_scrub_teardown(&sc, ip, 0); + if (error) { + xfs_repair_failure(mp); + goto out; + } + goto retry_op; + } + } +out_nofix: + xfs_scrub_postmortem(&sc); out_teardown: error = xfs_scrub_teardown(&sc, ip, error); out: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 0d92af86f67a..b295edd5fc0e 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. 
All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_SCRUB_SCRUB_H__ #define __XFS_SCRUB_SCRUB_H__ @@ -38,6 +24,9 @@ struct xfs_scrub_meta_ops { /* Examine metadata for errors. */ int (*scrub)(struct xfs_scrub_context *); + /* Repair or optimize the metadata. */ + int (*repair)(struct xfs_scrub_context *); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); @@ -48,6 +37,7 @@ struct xfs_scrub_meta_ops { /* Buffer pointers and btree cursors for an entire AG. */ struct xfs_scrub_ag { xfs_agnumber_t agno; + struct xfs_perag *pag; /* AG btree roots */ struct xfs_buf *agf_bp; @@ -73,6 +63,7 @@ struct xfs_scrub_context { void *buf; uint ilock_flags; bool try_harder; + bool has_quotaofflock; /* State tracking for single-AG operations. */ struct xfs_scrub_ag sa; diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 3aa3d60f7c16..570a89812116 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 86daed0e3a45..7c76d8b5cb05 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5d2b1c241be5..cec3e5ece5a1 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs_scrub @@ -69,6 +55,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \ DEFINE_SCRUB_EVENT(xfs_scrub_start); DEFINE_SCRUB_EVENT(xfs_scrub_done); DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry); +DEFINE_SCRUB_EVENT(xfs_repair_attempt); +DEFINE_SCRUB_EVENT(xfs_repair_done); TRACE_EVENT(xfs_scrub_op_error, TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno, @@ -492,6 +480,262 @@ TRACE_EVENT(xfs_scrub_xref_error, __entry->ret_ip) ); +/* repair tracepoints */ +#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) + +DECLARE_EVENT_CLASS(xfs_repair_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REPAIR_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_repair_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert); + +DECLARE_EVENT_CLASS(xfs_repair_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + 
__field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_REPAIR_RMAP_EVENT(name) \ +DEFINE_EVENT(xfs_repair_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn); + +TRACE_EVENT(xfs_repair_refcount_extent_fn, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +TRACE_EVENT(xfs_repair_init_btblock, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_btnum_t btnum), + TP_ARGS(mp, agno, agbno, btnum), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, 
agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, btnum) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->btnum = btnum; + ), + TP_printk("dev %d:%d agno %u agbno %u btnum %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->btnum) +) +TRACE_EVENT(xfs_repair_findroot_block, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + uint32_t magic, uint16_t level), + TP_ARGS(mp, agno, agbno, magic, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, magic) + __field(uint16_t, level) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->magic = magic; + __entry->level = level; + ), + TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->magic, + __entry->level) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + xfs_agblock_t usedlen), + TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, icount) + __field(xfs_agblock_t, aglen) + __field(xfs_agblock_t, freelen) + __field(xfs_agblock_t, usedlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->icount = icount; + __entry->aglen = aglen; + __entry->freelen = freelen; + __entry->usedlen = usedlen; + ), + TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->icount, + __entry->aglen, + __entry->freelen, + __entry->usedlen) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize, + TP_PROTO(struct xfs_mount 
*mp, xfs_agnumber_t agno, + xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, + xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), + TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bnobt_sz) + __field(xfs_agblock_t, inobt_sz) + __field(xfs_agblock_t, rmapbt_sz) + __field(xfs_agblock_t, refcbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bnobt_sz = bnobt_sz; + __entry->inobt_sz = inobt_sz; + __entry->rmapbt_sz = rmapbt_sz; + __entry->refcbt_sz = refcbt_sz; + ), + TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bnobt_sz, + __entry->inobt_sz, + __entry->rmapbt_sz, + __entry->refcbt_sz) +) +TRACE_EVENT(xfs_repair_reset_counters, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + ), + TP_printk("dev %d:%d", + MAJOR(__entry->dev), MINOR(__entry->dev)) +) + +TRACE_EVENT(xfs_repair_ialloc_insert, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, uint16_t holemask, uint8_t count, + uint8_t freecount, uint64_t freemask), + TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->holemask = holemask; + __entry->count = count; + __entry->freecount = freecount; + __entry->freemask = freemask; + ), + TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ + #endif /* _TRACE_XFS_SCRUB_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index e00e0eadac6a..2897ba3a17e6 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_SCRUB_H__ #define __XFS_SCRUB_H__ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 5ff7f228d616..583a9f539bf1 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_H__ #define __XFS_H__ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 3354140de07e..8039e35147dd 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008, Christoph Hellwig * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 04327318ef67..94615e34bc86 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2001-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ACL_H__ #define __XFS_ACL_H__ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0ab824f574ed..8eb3ba3d4d00 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_shared.h" @@ -543,8 +531,19 @@ xfs_submit_ioend( { /* Convert CoW extents to regular */ if (!status && ioend->io_type == XFS_IO_COW) { + /* + * Yuk. This can do memory allocation, but is not a + * transactional operation so everything is done in GFP_KERNEL + * context. That can deadlock, because we hold pages in + * writeback state and GFP_KERNEL allocations can block on them. + * Hence we must operate in nofs conditions here. + */ + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); + memalloc_nofs_restore(nofs_flag); } /* Reserve log space if we might write beyond the on-disk inode size. 
*/ @@ -594,7 +593,7 @@ xfs_alloc_ioend( struct xfs_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); xfs_init_bio_from_bh(bio, bh); ioend = container_of(bio, struct xfs_ioend, io_inline_bio); @@ -1378,10 +1377,9 @@ xfs_vm_bmap( struct address_space *mapping, sector_t block) { - struct inode *inode = (struct inode *)mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(mapping->host); - trace_xfs_vm_bmap(XFS_I(inode)); + trace_xfs_vm_bmap(ip); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1394,9 +1392,7 @@ xfs_vm_bmap( */ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; - - filemap_write_and_wait(mapping); - return generic_block_bmap(mapping, block, xfs_get_blocks); + return iomap_bmap(mapping, block, &xfs_iomap_ops); } STATIC int @@ -1475,6 +1471,16 @@ xfs_vm_set_page_dirty( return newly_dirty; } +static int +xfs_iomap_swapfile_activate( + struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file)); + return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops); +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1488,6 +1494,7 @@ const struct address_space_operations xfs_address_space_operations = { .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = xfs_iomap_swapfile_activate, }; const struct address_space_operations xfs_dax_aops = { @@ -1495,4 +1502,5 @@ const struct address_space_operations xfs_dax_aops = { .direct_IO = noop_direct_IO, .set_page_dirty = noop_set_page_dirty, .invalidatepage = noop_invalidatepage, + .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_aops.h 
b/fs/xfs/xfs_aops.h index 69346d460dfa..25bc6d4a1231 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -1,24 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2005-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set *xfs_ioend_bioset; +extern struct bio_set xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index d07bf27451c9..033ff8c478e2 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ATTR_H__ #define __XFS_ATTR_H__ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 52818ea2eb50..7ce10055f275 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 3e59a348ea71..f9ca80154c9c 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -139,7 +127,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, sfe); + context->dp->i_mount, sfe, + sizeof(*sfe)); kmem_free(sbuf); return -EFSCORRUPTED; } @@ -241,7 +230,7 @@ xfs_attr_node_list_lookup( if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - node); + node, sizeof(*node)); goto out_corruptbuf; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 2203465e63ea..956ebd583e27 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -160,7 +146,7 @@ STATIC void xfs_bui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_bui_release(BUI_ITEM(lip)); } @@ -305,7 +291,7 @@ xfs_bud_item_unlock( { struct xfs_bud_log_item *budp = BUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_bui_release(budp->bud_buip); kmem_zone_free(xfs_bud_zone, budp); } diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 24b354a2c836..fd1a1b13df51 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_BMAP_ITEM_H__ #define __XFS_BMAP_ITEM_H__ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8cd8c412f52d..83b1e8c6c18f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012 Red Hat, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -92,6 +80,7 @@ xfs_bmap_rtalloc( int error; /* error return value */ xfs_mount_t *mp; /* mount point structure */ xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ xfs_extlen_t ralen = 0; /* realtime allocation length */ xfs_extlen_t align; /* minimum allocation alignment */ xfs_rtblock_t rtb; @@ -111,7 +100,8 @@ xfs_bmap_rtalloc( * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->offset, align) || ap->length % align) + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. @@ -695,12 +685,10 @@ out_unlock_iolock: } /* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will always punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. + * Dead simple method of punching delalyed allocation blocks from a range in + * the inode. 
This will always punch out both the start and end blocks, even + * if the ranges only partially overlap them, so it is up to the caller to + * ensure that partial blocks are not passed in. */ int xfs_bmap_punch_delalloc_range( @@ -708,63 +696,44 @@ xfs_bmap_punch_delalloc_range( xfs_fileoff_t start_fsb, xfs_fileoff_t length) { - xfs_fileoff_t remaining = length; + struct xfs_ifork *ifp = &ip->i_df; + xfs_fileoff_t end_fsb = start_fsb + length; + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... 
- */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); + if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + return 0; - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); + while (got.br_startoff + got.br_blockcount > start_fsb) { + del = got; + xfs_trim_extent(&del, start_fsb, length); /* - * Note: while we initialise the firstblock/dfops pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. + * A delete can push the cursor forward. Step back to the + * previous extent on non-delalloc or extents outside the + * target range. */ - xfs_defer_init(&dfops, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &dfops, &done); - if (error) - break; + if (!del.br_blockcount || + !isnullstartblock(del.br_startblock)) { + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; + continue; + } - ASSERT(!xfs_defer_has_unfinished_work(&dfops)); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); + error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, + &got, &del); + if (error || !xfs_iext_get_extent(ifp, &icur, &got)) + break; + } return error; } @@ -848,7 +817,7 @@ xfs_free_eofblocks( /* * Attach the dquots to the inode up front. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -871,8 +840,8 @@ xfs_free_eofblocks( * contents of the file are flushed to disk then the files * may be full of holes (ie NULL files bug). 
*/ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); + error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); if (error) { /* * If we get an error at this point we simply don't @@ -918,7 +887,7 @@ xfs_alloc_file_space( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -948,9 +917,11 @@ xfs_alloc_file_space( do_div(s, extsz); s *= extsz; e = startoffset_fsb + allocatesize_fsb; - if ((temp = do_mod(startoffset_fsb, extsz))) + div_u64_rem(startoffset_fsb, extsz, &temp); + if (temp) e += temp; - if ((temp = do_mod(e, extsz))) + div_u64_rem(e, extsz, &temp); + if (temp) e += extsz - temp; } else { s = 0; @@ -1111,7 +1082,7 @@ xfs_adjust_extent_unmap_boundaries( if (nimap && imap.br_startblock != HOLESTARTBLOCK) { ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize); + div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod); if (mod) *startoffset_fsb += mp->m_sb.sb_rextsize - mod; } @@ -1169,7 +1140,7 @@ xfs_free_file_space( trace_xfs_free_file_space(ip); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1216,7 +1187,22 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); + if (error) + return error; + + /* + * If we zeroed right up to EOF and EOF straddles a page boundary we + * must make sure that the post-EOF area is also zeroed because the + * page could be mmap'd and iomap_zero_range doesn't do that for us. + * Writeback of the eof page will do this, albeit clumsily. 
+ */ + if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + (offset + len) & ~PAGE_MASK, LLONG_MAX); + } + + return error; } /* @@ -1412,6 +1398,10 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); + error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); + if (error) + return error; + error = xfs_prepare_shift(ip, offset); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 4d4ae48bd4f6..87363d136bb6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BMAP_UTIL_H__ #define __XFS_BMAP_UTIL_H__ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 55661cbdb51b..e9c058e3761c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include <linux/stddef.h> @@ -33,7 +21,6 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> -#include <linux/sched/mm.h> #include "xfs_format.h" #include "xfs_log_format.h" @@ -549,17 +536,31 @@ xfs_buf_hash_destroy( } /* - * Look up, and creates if absent, a lockable buffer for - * a given range of an inode. The buffer is returned - * locked. No I/O is implied by this call. + * Look up a buffer in the buffer cache and return it referenced and locked + * in @found_bp. + * + * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the + * cache. + * + * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return + * -EAGAIN if we fail to lock it. + * + * Return values are: + * -EFSCORRUPTED if have been supplied with an invalid address + * -EAGAIN on trylock failure + * -ENOENT if we fail to find a match and @new_bp was NULL + * 0, with @found_bp: + * - @new_bp if we inserted it into the cache + * - the buffer we found and locked. 
*/ -xfs_buf_t * -_xfs_buf_find( +static int +xfs_buf_find( struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_t *new_bp) + struct xfs_buf *new_bp, + struct xfs_buf **found_bp) { struct xfs_perag *pag; xfs_buf_t *bp; @@ -567,6 +568,8 @@ _xfs_buf_find( xfs_daddr_t eofs; int i; + *found_bp = NULL; + for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; @@ -580,16 +583,11 @@ _xfs_buf_find( */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { - /* - * XXX (dgc): we should really be returning -EFSCORRUPTED here, - * but none of the higher level infrastructure supports - * returning a specific error on buffer lookup failures. - */ xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, cmap.bm_bn, eofs); WARN_ON(1); - return NULL; + return -EFSCORRUPTED; } pag = xfs_perag_get(btp->bt_mount, @@ -604,19 +602,20 @@ _xfs_buf_find( } /* No match found */ - if (new_bp) { - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, - &new_bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - } else { + if (!new_bp) { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); + return -ENOENT; } - return new_bp; + + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&pag->pag_buf_lock); + *found_bp = new_bp; + return 0; found: spin_unlock(&pag->pag_buf_lock); @@ -626,7 +625,7 @@ found: if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); XFS_STATS_INC(btp->bt_mount, xb_busy_locked); - return NULL; + return -EAGAIN; } xfs_buf_lock(bp); XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); @@ -646,6 +645,24 @@ found: trace_xfs_buf_find(bp, flags, _RET_IP_); XFS_STATS_INC(btp->bt_mount, 
xb_get_locked); + *found_bp = bp; + return 0; +} + +struct xfs_buf * +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); + if (error) + return NULL; return bp; } @@ -665,9 +682,27 @@ xfs_buf_get_map( struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, map, nmaps, flags, NULL); - if (likely(bp)) + error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); + + switch (error) { + case 0: + /* cache hit */ goto found; + case -EAGAIN: + /* cache hit, trylock failure, caller handles failure */ + ASSERT(flags & XBF_TRYLOCK); + return NULL; + case -ENOENT: + /* cache miss, go for insert */ + break; + case -EFSCORRUPTED: + default: + /* + * None of the higher layers understand failure types + * yet, so return NULL to signal a fatal lookup error. + */ + return NULL; + } new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) @@ -679,8 +714,8 @@ xfs_buf_get_map( return NULL; } - bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); - if (!bp) { + error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); + if (error) { xfs_buf_free(new_bp); return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edced162a674..d24dbd4dac39 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BUF_H__ #define __XFS_BUF_H__ @@ -218,20 +206,9 @@ typedef struct xfs_buf { } xfs_buf_t; /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, struct xfs_buf *new_bp); - -static inline struct xfs_buf * -xfs_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) -{ - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return _xfs_buf_find(target, &map, 1, flags, NULL); -} +struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, + xfs_daddr_t blkno, size_t numblks, + xfs_buf_flags_t flags); struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -358,6 +335,18 @@ extern void xfs_buf_terminate(void); void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); +/* + * If the buffer is already on the LRU, do nothing. Otherwise set the buffer + * up with a reference count of 0 so it will be tossed from the cache when + * released. + */ +static inline void xfs_buf_oneshot(struct xfs_buf *bp) +{ + if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) + return; + atomic_set(&bp->b_lru_ref, 0); +} + static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 82ad270e390e..1c9d1398980b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -438,7 +426,7 @@ xfs_buf_item_unpin( * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - if (lip->li_desc) + if (!list_empty(&lip->li_trans)) xfs_trans_del_item(lip); /* @@ -568,13 +556,15 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool aborted; bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); #if defined(DEBUG) || defined(XFS_WARN) bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); #endif + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -743,8 +733,10 @@ xfs_buf_item_init( * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (bip != NULL) { + if (bip) { ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!bp->b_transp); + ASSERT(bip->bli_buf == bp); return 0; } @@ -838,7 +830,7 @@ xfs_buf_item_log_segment( * of the last bit to be set in this word plus one. 
*/ if (bit) { - end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + end_bit = min(bit + bits_to_set, (uint)NBWORD); mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 643f53dcfe51..3f7d7b72e7e6 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_BUF_ITEM_H__ #define __XFS_BUF_ITEM_H__ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index b6ae3597bfb0..5142e64e2345 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 7b68e6c9a474..678a5fcd7576 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a7daef9e16bf..0973a0423bed 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -288,49 +276,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) } /* - * Allocate a block and fill it with dquots. - * This is called when the bmapi finds a hole. + * Ensure that the given in-core dquot has a buffer on disk backing it, and + * return the buffer. This is called when the bmapi finds a hole. */ STATIC int -xfs_qm_dqalloc( - xfs_trans_t **tpp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - xfs_inode_t *quotip, - xfs_fileoff_t offset_fsb, - xfs_buf_t **O_bpp) +xfs_dquot_disk_alloc( + struct xfs_trans **tpp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; - xfs_bmbt_irec_t map; - int nmaps, error; - xfs_buf_t *bp; - xfs_trans_t *tp = *tpp; - - ASSERT(tp != NULL); + struct xfs_bmbt_irec map; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = (*tpp)->t_mountp; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_fsblock_t firstblock; + int nmaps = 1; + int error; trace_xfs_dqalloc(dqp); - /* - * Initialize the bmap freelist prior to calling bmapi code. - */ xfs_defer_init(&dfops, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); - /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock - */ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ xfs_iunlock(quotip, XFS_ILOCK_EXCL); return -ESRCH; } - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - nmaps = 1; - error = xfs_bmapi_write(tp, quotip, offset_fsb, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &dfops); + /* Create the block mapping. 
*/ + xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL); + error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &dfops); if (error) goto error0; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -344,10 +326,8 @@ xfs_qm_dqalloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0); + bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -ENOMEM; goto error1; @@ -358,37 +338,45 @@ xfs_qm_dqalloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id), dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* - * xfs_defer_finish() may commit the current transaction and - * start a second transaction if the freelist is not empty. + * Hold the buffer and join it to the dfops so that we'll still own + * the buffer when we return to the caller. The buffer disposal on + * error must be paid attention to very carefully, as it has been + * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota + * code when allocating a new dquot record" in 2005, and the later + * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep + * the buffer locked across the _defer_finish call. We can now do + * this correctly with xfs_defer_bjoin. * - * Since we still want to modify this buffer, we need to - * ensure that the buffer is not released on commit of - * the first transaction and ensure the buffer is added to the - * second transaction. + * Above, we allocated a disk block for the dquot information and + * used get_buf to initialize the dquot. 
If the _defer_bjoin fails, + * the buffer is still locked to *tpp, so we must _bhold_release and + * then _trans_brelse the buffer. If the _defer_finish fails, the old + * transaction is gone but the new buffer is not joined or held to any + * transaction, so we must _buf_relse it. * - * If there is only one transaction then don't stop the buffer - * from being released when it commits later on. + * If everything succeeds, the caller of this function is returned a + * buffer that is locked and held to the transaction. The caller + * is responsible for unlocking any buffer passed back, either + * manually or by committing the transaction. */ - - xfs_trans_bhold(tp, bp); - + xfs_trans_bhold(*tpp, bp); + error = xfs_defer_bjoin(&dfops, bp); + if (error) { + xfs_trans_bhold_release(*tpp, bp); + xfs_trans_brelse(*tpp, bp); + goto error1; + } error = xfs_defer_finish(tpp, &dfops); - if (error) + if (error) { + xfs_buf_relse(bp); goto error1; - - /* Transaction was committed? */ - if (*tpp != tp) { - tp = *tpp; - xfs_trans_bjoin(tp, bp); - } else { - xfs_trans_bhold_release(tp, bp); } - - *O_bpp = bp; + *bpp = bp; return 0; error1: @@ -398,32 +386,24 @@ error0: } /* - * Maps a dquot to the buffer containing its on-disk version. - * This returns a ptr to the buffer containing the on-disk dquot - * in the bpp param, and a ptr to the on-disk dquot within that buffer + * Read in the in-core dquot's on-disk metadata and return the buffer. + * Returns ENOENT to signal a hole. */ STATIC int -xfs_qm_dqtobp( - xfs_trans_t **tpp, - xfs_dquot_t *dqp, - xfs_disk_dquot_t **O_ddpp, - xfs_buf_t **O_bpp, - uint flags) +xfs_dquot_disk_read( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - int nmaps = 1, error; struct xfs_buf *bp; - struct xfs_inode *quotip; - struct xfs_mount *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - struct xfs_trans *tp = (tpp ? 
*tpp : NULL); + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); uint lock_mode; - - quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); - dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + int nmaps = 1; + int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -436,81 +416,48 @@ xfs_qm_dqtobp( * Find the block map; no allocations yet */ error = xfs_bmapi_read(quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, lock_mode); if (error) return error; ASSERT(nmaps == 1); - ASSERT(map.br_blockcount == 1); + ASSERT(map.br_blockcount >= 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) + return -ENOENT; + + trace_xfs_dqtobp_read(dqp); /* - * Offset of dquot in the (fixed sized) dquot chunk. 
+ * store the blkno etc so that we don't have to do the + * mapping all the time */ - dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * - sizeof(xfs_dqblk_t); - - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - if (map.br_startblock == HOLESTARTBLOCK) { - /* - * We don't allocate unless we're asked to - */ - if (!(flags & XFS_QMOPT_DQALLOC)) - return -ENOENT; - - ASSERT(tp); - error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, - dqp->q_fileoffset, &bp); - if (error) - return error; - tp = *tpp; - } else { - trace_xfs_dqtobp_read(dqp); + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - /* - * store the blkno etc so that we don't have to do the - * mapping all the time - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0, &bp, &xfs_dquot_buf_ops); - if (error) { - ASSERT(bp == NULL); - return error; - } + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + if (error) { + ASSERT(bp == NULL); + return error; } ASSERT(xfs_buf_islocked(bp)); - *O_bpp = bp; - *O_ddpp = bp->b_addr + dqp->q_bufoffset; + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + *bpp = bp; return 0; } - -/* - * Read in the ondisk dquot using dqtobp() then copy it to an incore version, - * and release the buffer immediately. - * - * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. - */ -int -xfs_qm_dqread( +/* Allocate and initialize everything we need for an incore dquot. 
*/ +STATIC struct xfs_dquot * +xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, - uint type, - uint flags, - struct xfs_dquot **O_dqpp) + uint type) { struct xfs_dquot *dqp; - struct xfs_disk_dquot *ddqp; - struct xfs_buf *bp; - struct xfs_trans *tp = NULL; - int error; dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -520,6 +467,12 @@ xfs_qm_dqread( INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); /* * Because we want to use a counting completion, complete @@ -548,35 +501,22 @@ xfs_qm_dqread( break; } - XFS_STATS_INC(mp, xs_qm_dquot); - - trace_xfs_dqread(dqp); + xfs_qm_dquot_logitem_init(dqp); - if (flags & XFS_QMOPT_DQALLOC) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto error0; - } + XFS_STATS_INC(mp, xs_qm_dquot); + return dqp; +} - /* - * get a pointer to the on-disk dquot and the buffer containing it - * dqp already knows its own type (GROUP/USER). - */ - error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); - if (error) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - goto error1; - } +/* Copy the in-core quota fields in from the on-disk buffer. 
*/ +STATIC void +xfs_dquot_from_disk( + struct xfs_dquot *dqp, + struct xfs_buf *bp) +{ + struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - xfs_qm_dquot_logitem_init(dqp); /* * Reservation counters are defined as reservation plus current usage @@ -588,40 +528,90 @@ xfs_qm_dqread( /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); +} - /* Mark the buf so that this will stay incore a little longer */ - xfs_buf_set_ref(bp, XFS_DQUOT_REF); +/* Allocate and initialize the dquot buffer for this in-core dquot. */ +static int +xfs_qm_dqread_alloc( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp; + struct xfs_buf *bp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + goto err; + + error = xfs_dquot_disk_alloc(&tp, dqp, &bp); + if (error) + goto err_cancel; + + error = xfs_trans_commit(tp); + if (error) { + /* + * Buffer was held to the transaction, so we have to unlock it + * manually here because we're not passing it back. + */ + xfs_buf_relse(bp); + goto err; + } + *bpp = bp; + return 0; + +err_cancel: + xfs_trans_cancel(tp); +err: + return error; +} + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. If @can_alloc is true, fill any + * holes in the on-disk metadata. + */ +static int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_buf *bp; + int error; + + dqp = xfs_dquot_alloc(mp, id, type); + trace_xfs_dqread(dqp); + + /* Try to read the buffer, allocating if necessary. 
*/ + error = xfs_dquot_disk_read(mp, dqp, &bp); + if (error == -ENOENT && can_alloc) + error = xfs_qm_dqread_alloc(mp, dqp, &bp); + if (error) + goto err; /* - * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) - * So we need to release with xfs_trans_brelse(). - * The strategy here is identical to that of inodes; we lock - * the dquot in xfs_qm_dqget() before making it accessible to - * others. This is because dquots, like inodes, need a good level of - * concurrency, and we don't want to take locks on the entire buffers - * for dquot accesses. - * Note also that the dquot buffer may even be dirty at this point, if - * this particular dquot was repaired. We still aren't afraid to - * brelse it because we have the changes incore. + * At this point we should have a clean locked buffer. Copy the data + * to the incore dquot and release the buffer since the incore dquot + * has its own locking protocol so we needn't tie up the buffer any + * further. */ ASSERT(xfs_buf_islocked(bp)); - xfs_trans_brelse(tp, bp); + xfs_dquot_from_disk(dqp, bp); - if (tp) { - error = xfs_trans_commit(tp); - if (error) - goto error0; - } - - *O_dqpp = dqp; + xfs_buf_relse(bp); + *dqpp = dqp; return error; -error1: - if (tp) - xfs_trans_cancel(tp); -error0: +err: + trace_xfs_dqread_fail(dqp); xfs_qm_dqdestroy(dqp); - *O_dqpp = NULL; + *dqpp = NULL; return error; } @@ -679,77 +669,230 @@ xfs_dq_get_next_id( } /* - * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a - * a locked dquot, doing an allocation (if requested) as needed. - * When both an inode and an id are given, the inode's id takes precedence. - * That is, if the id changes while we don't hold the ilock inside this - * function, the new dquot is returned, not necessarily the one requested - * in the id argument. + * Look up the dquot in the in-core cache. If found, the dquot is returned + * locked and ready to go. 
+ */ +static struct xfs_dquot * +xfs_qm_dqget_cache_lookup( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id) +{ + struct xfs_dquot *dqp; + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (!dqp) { + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(mp, xs_qm_dqcachemisses); + return NULL; + } + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(mp, xs_qm_dqcachehits); + return dqp; +} + +/* + * Try to insert a new dquot into the in-core cache. If an error occurs the + * caller should throw away the dquot and start over. Otherwise, the dquot + * is returned locked (and held by the cache) as if there had been a cache + * hit. + */ +static int +xfs_qm_dqget_cache_insert( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id, + struct xfs_dquot *dqp) +{ + int error; + + mutex_lock(&qi->qi_tree_lock); + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + /* Duplicate found! Caller must try again. */ + WARN_ON(error != -EEXIST); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + return error; + } + + /* Return a locked dquot to the caller, with a reference taken. */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + return 0; +} + +/* Check our input parameters. 
*/ +static int +xfs_qm_dqget_checks( + struct xfs_mount *mp, + uint type) +{ + if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) + return -ESRCH; + + switch (type) { + case XFS_DQ_USER: + if (!XFS_IS_UQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_GROUP: + if (!XFS_IS_GQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_PROJ: + if (!XFS_IS_PQUOTA_ON(mp)) + return -ESRCH; + return 0; + default: + WARN_ON_ONCE(0); + return -EINVAL; + } +} + +/* + * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( - xfs_mount_t *mp, - xfs_inode_t *ip, /* locked inode (optional) */ - xfs_dqid_t id, /* uid/projid/gid depending on type */ - uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ - uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ - xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || - (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || - (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { - return -ESRCH; + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { + *O_dqpp = dqp; + return 0; } - ASSERT(type == XFS_DQ_USER || - type == XFS_DQ_PROJ || - type == XFS_DQ_GROUP); - if (ip) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_inode_dquot(ip, type) == NULL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + if (error) + return error; + + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { + /* + * Duplicate found. 
Just throw away the new dquot and start + * over. + */ + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; } -restart: - mutex_lock(&qi->qi_tree_lock); - dqp = radix_tree_lookup(tree, id); - if (dqp) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_freeing(dqp); - delay(1); - goto restart; - } + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - /* uninit / unused quota found in radix tree, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; - } - } +/* + * Given a dquot id and type, read and initialize a dquot from the on-disk + * metadata. This function is only for use during quota initialization so + * it ignores the dquot cache assuming that the dquot shrinker isn't set up. + * The caller is responsible for _qm_dqdestroy'ing the returned dquot. + */ +int +xfs_qm_dqget_uncached( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + int error; - dqp->q_nrefs++; - mutex_unlock(&qi->qi_tree_lock); + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; - trace_xfs_dqget_hit(dqp); - XFS_STATS_INC(mp, xs_qm_dqcachehits); + return xfs_qm_dqread(mp, id, type, 0, dqpp); +} + +/* Return the quota id for a given inode and type. */ +xfs_dqid_t +xfs_qm_id_for_quotatype( + struct xfs_inode *ip, + uint type) +{ + switch (type) { + case XFS_DQ_USER: + return ip->i_d.di_uid; + case XFS_DQ_GROUP: + return ip->i_d.di_gid; + case XFS_DQ_PROJ: + return xfs_get_projid(ip); + } + ASSERT(0); + return 0; +} + +/* + * Return the dquot for a given inode and type. If @can_alloc is true, then + * allocate blocks if needed. The inode's ILOCK must be held and it must not + * have already had an inode attached. 
+ */ +int +xfs_qm_dqget_inode( + struct xfs_inode *ip, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + xfs_dqid_t id; + int error; + + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + + id = xfs_qm_id_for_quotatype(ip, type); + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { *O_dqpp = dqp; return 0; } - mutex_unlock(&qi->qi_tree_lock); - XFS_STATS_INC(mp, xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -758,87 +901,81 @@ restart: * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ - if (ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - error = xfs_qm_dqread(mp, id, type, flags, &dqp); - - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* If we are asked to find next active id, keep looking */ - if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { - error = xfs_dq_get_next_id(mp, type, &id); - if (!error) - goto restart; - } - + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; - if (ip) { - /* - * A dquot could be attached to this inode by now, since - * we had dropped the ilock. - */ - if (xfs_this_quota_on(mp, type)) { - struct xfs_dquot *dqp1; - - dqp1 = xfs_inode_dquot(ip, type); - if (dqp1) { - xfs_qm_dqdestroy(dqp); - dqp = dqp1; - xfs_dqlock(dqp); - goto dqret; - } - } else { - /* inode stays locked on return */ + /* + * A dquot could be attached to this inode by now, since we had + * dropped the ilock. 
+ */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { xfs_qm_dqdestroy(dqp); - return -ESRCH; + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return -ESRCH; } - mutex_lock(&qi->qi_tree_lock); - error = radix_tree_insert(tree, id, dqp); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } - /* - * We return a locked dquot to the caller, with a reference taken - */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; +dqret: + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - qi->qi_dquots++; - mutex_unlock(&qi->qi_tree_lock); +/* + * Starting at @id and progressing upwards, look for an initialized incore + * dquot, lock it, and return it. 
+ */ +int +xfs_qm_dqget_next( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + int error = 0; - /* If we are asked to find next active id, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; + *dqpp = NULL; + for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) { + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error == -ENOENT) + continue; + else if (error != 0) + break; + + if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + *dqpp = dqp; + return 0; } + + xfs_qm_dqput(dqp); } - dqret: - ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); - trace_xfs_dqget_miss(dqp); - *O_dqpp = dqp; - return 0; + return error; } /* @@ -913,9 +1050,9 @@ xfs_qm_dqflush_done( * since it's cheaper, and then we recheck while * holding the lock before removing the dquot from the AIL. */ - if ((lip->li_flags & XFS_LI_IN_AIL) && + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && ((lip->li_lsn == qip->qli_flush_lsn) || - (lip->li_flags & XFS_LI_FAILED))) { + test_bit(XFS_LI_FAILED, &lip->li_flags))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); @@ -926,8 +1063,7 @@ xfs_qm_dqflush_done( * Clear the failed state since we are about to drop the * flush lock */ - if (lip->li_flags & XFS_LI_FAILED) - xfs_clear_li_failed(lip); + xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } @@ -953,6 +1089,7 @@ xfs_qm_dqflush( { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddqp; xfs_failaddr_t fa; int error; @@ -996,12 +1133,13 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = bp->b_addr + dqp->q_bufoffset; + dqb = bp->b_addr + dqp->q_bufoffset; + ddqp = &dqb->dd_diskdq; /* - * A simple sanity check in case we got a corrupted dquot.. 
+ * A simple sanity check in case we got a corrupted dquot. */ - fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0); + fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", be32_to_cpu(ddqp->d_id), fa); @@ -1032,8 +1170,6 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -1119,3 +1255,35 @@ xfs_qm_exit(void) kmem_zone_destroy(xfs_qm_dqtrxzone); kmem_zone_destroy(xfs_qm_dqzone); } + +/* + * Iterate every dquot of a particular type. The caller must ensure that the + * particular quota type is active. iter_fn can return negative error codes, + * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating. + */ +int +xfs_qm_dqiterate( + struct xfs_mount *mp, + uint dqtype, + xfs_qm_dqiterate_fn iter_fn, + void *priv) +{ + struct xfs_dquot *dq; + xfs_dqid_t id = 0; + int error; + + do { + error = xfs_qm_dqget_next(mp, id, dqtype, &dq); + if (error == -ENOENT) + return 0; + if (error) + return error; + + error = iter_fn(dq, dqtype, priv); + id = be32_to_cpu(dq->q_core.d_id); + xfs_qm_dqput(dq); + id++; + } while (error == 0 && id != 0); + + return error; +} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 2f536f33cd26..64bd8640f6e8 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DQUOT_H__ #define __XFS_DQUOT_H__ @@ -160,8 +148,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, - uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -169,8 +155,19 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, struct xfs_dquot *); -extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, - xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + uint type); +extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, + uint type, bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, + uint type, struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); @@ -185,4 +182,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +typedef int 
(*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, + void *priv); +int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, + xfs_qm_dqiterate_fn iter_fn, void *priv); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 4b331e354da7..7dedd17c4813 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -173,7 +161,7 @@ xfs_qm_dquot_logitem_push( * The buffer containing this item failed to be written back * previously. 
Resubmit the buffer for IO */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -209,10 +197,7 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT, - __func__, error, dqp); - } else { + if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 502e9464634a..db9df710a308 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DQUOT_ITEM_H__ #define __XFS_DQUOT_ITEM_H__ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index a63f5083f497..0470114a8d80 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" @@ -61,6 +49,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, + XFS_RANDOM_FORCE_SCRUB_REPAIR, }; struct xfs_errortag_attr { @@ -167,6 +156,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); +XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -201,6 +191,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), + XFS_ERRORTAG_ATTR_LIST(force_repair), NULL, }; @@ -331,13 +322,14 @@ xfs_corruption_error( const char *tag, int level, struct xfs_mount *mp, - void *p, + void *buf, + size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr) { if (level <= xfs_error_level) - xfs_hex_dump(p, XFS_CORRUPTION_DUMP_LEN); + xfs_hex_dump(buf, bufsize); xfs_error_report(tag, level, mp, filename, linenum, failaddr); xfs_alert(mp, "Corruption detected. 
Unmount and run xfs_repair"); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index ce391349e78b..246d3e989c6c 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ @@ -24,8 +12,9 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, const char *filename, int linenum, xfs_failaddr_t failaddr); extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, void *p, const char *filename, - int linenum, xfs_failaddr_t failaddr); + struct xfs_mount *mp, void *buf, size_t bufsize, + const char *filename, int linenum, + xfs_failaddr_t failaddr); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, void *buf, size_t bufsz, xfs_failaddr_t failaddr); @@ -37,8 +26,8 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) -#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ - xfs_corruption_error(e, lvl, mp, mem, \ +#define XFS_CORRUPTION_ERROR(e, lvl, mp, buf, bufsize) \ + xfs_corruption_error(e, lvl, mp, buf, bufsize, \ __FILE__, __LINE__, 
__return_address) #define XFS_ERRLEVEL_OFF 0 diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index eed698aa9f16..3cf4682e2510 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2004-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" @@ -140,15 +128,24 @@ xfs_nfs_get_inode( */ error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); if (error) { + /* * EINVAL means the inode cluster doesn't exist anymore. - * This implies the filehandle is stale, so we should - * translate it here. + * EFSCORRUPTED means the metadata pointing to the inode cluster + * or the inode cluster itself is corrupt. This implies the + * filehandle is stale, so we should translate it here. * We don't use ESTALE directly down the chain to not * confuse applications using bulkstat that expect EINVAL. 
*/ - if (error == -EINVAL || error == -ENOENT) + switch (error) { + case -EINVAL: + case -ENOENT: + case -EFSCORRUPTED: error = -ESTALE; + break; + default: + break; + } return ERR_PTR(error); } diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h index 3272b6ae7a35..64471a3ddb04 100644 --- a/fs/xfs/xfs_export.h +++ b/fs/xfs/xfs_export.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_EXPORT_H__ #define __XFS_EXPORT_H__ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 13e3d1a69e76..0ed68379e551 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2010 David Chinner. * Copyright (c) 2011 Christoph Hellwig. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 60195ea1b84a..990ab3891971 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2010 David Chinner. * Copyright (c) 2011 Christoph Hellwig. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_EXTENT_BUSY_H__ #define __XFS_EXTENT_BUSY_H__ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index b5b1e567b9f4..d9da66c718bb 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -168,7 +156,7 @@ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_efi_release(EFI_ITEM(lip)); } @@ -402,7 +390,7 @@ xfs_efd_item_unlock( { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); } @@ -542,7 +530,7 @@ xfs_efi_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, &oinfo); + extp->ext_len, &oinfo, false); if (error) goto abort_error; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index a32c794a86b7..2a6a895ca73e 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_EXTFREE_ITEM_H__ #define __XFS_EXTFREE_ITEM_H__ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e70fb8ccecea..a3e7767a5715 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -312,7 +300,7 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); if (error) return error; @@ -414,6 +402,12 @@ xfs_dio_write_end_io( if (size <= 0) return size; + /* + * Capture amount written on completion as we can't reliably account + * for it on submission. + */ + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + if (flags & IOMAP_DIO_COW) { error = xfs_reflink_end_cow(ip, offset, size); if (error) @@ -599,7 +593,16 @@ xfs_file_dax_write( } out: xfs_iunlock(ip, iolock); - return error ? 
error : ret; + if (error) + return error; + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; } STATIC ssize_t @@ -669,6 +672,12 @@ write_retry: out: if (iolock) xfs_iunlock(ip, iolock); + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } return ret; } @@ -693,8 +702,9 @@ xfs_file_write_iter( return -EIO; if (IS_DAX(inode)) - ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) { + return xfs_file_dax_write(iocb, from); + + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -702,20 +712,74 @@ xfs_file_write_iter( * allow an operation to fall back to buffered mode. */ ret = xfs_file_dio_aio_write(iocb, from); - if (ret == -EREMCHG) - goto buffered; - } else { -buffered: - ret = xfs_file_buffered_aio_write(iocb, from); + if (ret != -EREMCHG) + return ret; } - if (ret > 0) { - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + return xfs_file_buffered_aio_write(iocb, from); +} - /* Handle various SYNC-type writes */ - ret = generic_write_sync(iocb, ret); - } - return ret; +static void +xfs_wait_dax_page( + struct inode *inode, + bool *did_unlock) +{ + struct xfs_inode *ip = XFS_I(inode); + + *did_unlock = true; + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + schedule(); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +} + +static int +xfs_break_dax_layouts( + struct inode *inode, + uint iolock, + bool *did_unlock) +{ + struct page *page; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL)); + + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, xfs_wait_dax_page(inode, did_unlock)); +} + +int +xfs_break_layouts( + 
struct inode *inode, + uint *iolock, + enum layout_break_reason reason) +{ + bool retry; + int error; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + do { + retry = false; + switch (reason) { + case BREAK_UNMAP: + error = xfs_break_dax_layouts(inode, *iolock, &retry); + if (error || retry) + break; + /* fall through */ + case BREAK_WRITE: + error = xfs_break_leased_layouts(inode, iolock, &retry); + break; + default: + WARN_ON_ONCE(1); + error = -EINVAL; + } + } while (error == 0 && retry); + + return error; } #define XFS_FALLOC_FL_SUPPORTED \ @@ -734,7 +798,7 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -744,13 +808,10 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1007,7 +1068,7 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ -static int +static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1015,7 +1076,7 @@ __xfs_filemap_fault( { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); - int ret; + vm_fault_t ret; trace_xfs_filemap_fault(ip, pe_size, write_fault); @@ -1044,7 +1105,7 @@ __xfs_filemap_fault( return ret; } -static int +static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) { @@ -1054,7 +1115,7 @@ xfs_filemap_fault( (vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) @@ -1067,7 +1128,7 @@ xfs_filemap_huge_fault( 
(vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { @@ -1079,7 +1140,7 @@ xfs_filemap_page_mkwrite( * on write faults. In reality, it needs to serialise against truncate and * prepare memory for writing so handle is as standard write fault. */ -static int +static vm_fault_t xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3f8722e51dbe..2d2c5ab9143c 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. * Copyright (c) 2014 Christoph Hellwig. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 2ef43406e53b..5cc7665e93c9 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_FILESTREAM_H__ #define __XFS_FILESTREAM_H__ diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 43cfc07996a4..c7157bc48bd1 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -465,10 +451,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); - - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; + irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); + irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; @@ -528,14 +513,17 @@ xfs_getfsmap_rtdev_rtbitmap_query( struct xfs_trans *tp, struct xfs_getfsmap_info *info) { - struct xfs_rtalloc_rec alow; - struct xfs_rtalloc_rec ahigh; + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; int error; xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); - alow.ar_startblock = info->low.rm_startblock; - ahigh.ar_startblock = info->high.rm_startblock; + alow.ar_startext = info->low.rm_startblock; + ahigh.ar_startext = info->high.rm_startblock; + do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index 0b9bf822595c..c6c57739b862 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_FSMAP_H__ #define __XFS_FSMAP_H__ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 523792768080..3f2bd6032cf8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -24,85 +12,42 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" -#include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_alloc.h" -#include "xfs_rmap_btree.h" -#include "xfs_ialloc.h" #include "xfs_fsops.h" -#include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_filestream.h" -#include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* - * File system operations + * growfs operations */ - -static struct xfs_buf * -xfs_growfs_get_hdr_buf( - struct xfs_mount *mp, - xfs_daddr_t blkno, - size_t numblks, - int flags, - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; - - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); - if (!bp) - return NULL; - - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_bn = blkno; - bp->b_maps[0].bm_bn = blkno; - bp->b_ops = ops; - - return bp; -} - static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { - xfs_agf_t *agf; - struct xfs_agfl *agfl; - xfs_agi_t *agi; - xfs_agnumber_t agno; - xfs_extlen_t agsize; - xfs_extlen_t tmpsize; - xfs_alloc_rec_t *arec; xfs_buf_t *bp; - int bucket; - int dpct; - int error, saved_error = 0; + int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; xfs_rfsblock_t new; - xfs_rfsblock_t nfree; xfs_agnumber_t oagcount; - int pct; xfs_trans_t *tp; + LIST_HEAD (buffer_list); + struct aghdr_init_data id = {}; nb = in->newblocks; - pct = in->imaxpct; - if 
(nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + if (nb < mp->m_sb.sb_dblocks) return -EINVAL; if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; - dpct = pct - mp->m_sb.sb_imax_pct; error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); @@ -135,376 +80,45 @@ xfs_growfs_data_private( return error; /* - * Write new AG headers to disk. Non-transactional, but written - * synchronously so they are completed prior to the growfs transaction - * being logged. + * Write new AG headers to disk. Non-transactional, but need to be + * written and completed prior to the growfs transaction being logged. + * To do this, we use a delayed write buffer list and wait for + * submission and IO completion of the list as a whole. This allows the + * IO subsystem to merge all the AG headers in a single AG into a single + * IO and hide most of the latency of the IO from us. + * + * This also means that if we get an error whilst building the buffer + * list to write, we can cancel the entire list without having written + * anything. 
*/ - nfree = 0; - for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { - __be32 *agfl_bno; - - /* - * AG freespace header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agf_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agf = XFS_BUF_TO_AGF(bp); - agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); - agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(agno); - if (agno == nagcount - 1) - agsize = - nb - - (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + INIT_LIST_HEAD(&id.buffer_list); + for (id.agno = nagcount - 1; + id.agno >= oagcount; + id.agno--, new -= id.agsize) { + + if (id.agno == nagcount - 1) + id.agsize = nb - + (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); else - agsize = mp->m_sb.sb_agblocks; - agf->agf_length = cpu_to_be32(agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); - agf->agf_rmap_blocks = cpu_to_be32(1); - } - - agf->agf_flfirst = cpu_to_be32(1); - agf->agf_fllast = 0; - agf->agf_flcount = 0; - tmpsize = agsize - mp->m_ag_prealloc_blocks; - agf->agf_freeblks = cpu_to_be32(tmpsize); - agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - agf->agf_refcount_root = cpu_to_be32( - xfs_refc_block(mp)); - agf->agf_refcount_level = cpu_to_be32(1); - agf->agf_refcount_blocks = cpu_to_be32(1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG freelist header block - */ - bp = 
xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agfl_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agfl = XFS_BUF_TO_AGFL(bp); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); - } - - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); - for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) - agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG inode header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agi_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agi = XFS_BUF_TO_AGI(bp); - agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); - agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(agno); - agi->agi_length = cpu_to_be32(agsize); - agi->agi_count = 0; - agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); - agi->agi_level = cpu_to_be32(1); - agi->agi_freecount = 0; - agi->agi_newino = cpu_to_be32(NULLAGINO); - agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); - agi->agi_free_level = cpu_to_be32(1); - } - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) - agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * BNO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 
0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * CNT btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - nfree += be32_to_cpu(arec->ar_blockcount); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* RMAP btree root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - struct xfs_rmap_rec *rrec; - struct xfs_btree_block *block; - - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_rmapbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, - agno, 0); - block = XFS_BUF_TO_BLOCK(bp); - - - /* - * mark the AG header regions as static metadata The BNO - * btree block is the first block after the headers, so - * it's location defines the size of region the static - * metadata consumes. 
- * - * Note: unlike mkfs, we never have to account for log - * space when growing the data regions - */ - rrec = XFS_RMAP_REC_ADDR(block, 1); - rrec->rm_startblock = 0; - rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account freespace btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 2); - rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(2); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account inode btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 3); - rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - - XFS_IBT_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for rmap btree root */ - rrec = XFS_RMAP_REC_ADDR(block, 4); - rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - rrec = XFS_RMAP_REC_ADDR(block, 5); - rrec->rm_startblock = cpu_to_be32( - xfs_refc_block(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * INO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); - - error = xfs_bwrite(bp); - 
xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * FINO btree root block - */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * refcount btree root block - */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_refcountbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - } - xfs_trans_agblocks_delta(tp, nfree); - /* - * There are new blocks in the old last a.g. - */ - if (new) { - struct xfs_owner_info oinfo; + id.agsize = mp->m_sb.sb_agblocks; - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) { - goto error0; - } - ASSERT(bp); - agi = XFS_BUF_TO_AGI(bp); - be32_add_cpu(&agi->agi_length, new); - ASSERT(nagcount == oagcount || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); - xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); - /* - * Change agf length. 
- */ - error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + error = xfs_ag_init_headers(mp, &id); if (error) { - goto error0; + xfs_buf_delwri_cancel(&id.buffer_list); + goto out_trans_cancel; } - ASSERT(bp); - agf = XFS_BUF_TO_AGF(bp); - be32_add_cpu(&agf->agf_length, new); - ASSERT(be32_to_cpu(agf->agf_length) == - be32_to_cpu(agi->agi_length)); + } + error = xfs_buf_delwri_submit(&id.buffer_list); + if (error) + goto out_trans_cancel; - xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + xfs_trans_agblocks_delta(tp, id.nfree); - /* - * Free the new space. - * - * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that - * this doesn't actually exist in the rmap btree. - */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); - error = xfs_rmap_free(tp, bp, agno, - be32_to_cpu(agf->agf_length) - new, - new, &oinfo); - if (error) - goto error0; - error = xfs_free_extent(tp, - XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agf->agf_length) - new), - new, &oinfo, XFS_AG_RESV_NONE); + /* If there are new blocks in the old last AG, extend it. 
*/ + if (new) { + error = xfs_ag_extend_space(mp, tp, &id, new); if (error) - goto error0; + goto out_trans_cancel; } /* @@ -517,10 +131,8 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks); - if (nfree) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); - if (dpct) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + if (id.nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -529,12 +141,6 @@ xfs_growfs_data_private( /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; - if (mp->m_sb.sb_imax_pct) { - uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; - do_div(icount, 100); - mp->m_maxicount = icount << mp->m_sb.sb_inopblog; - } else - mp->m_maxicount = 0; xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); @@ -545,73 +151,24 @@ xfs_growfs_data_private( if (new) { struct xfs_perag *pag; - pag = xfs_perag_get(mp, agno); + pag = xfs_perag_get(mp, id.agno); error = xfs_ag_resv_free(pag); xfs_perag_put(pag); if (error) - goto out; + return error; } - /* Reserve AG metadata blocks. */ + /* + * Reserve AG metadata blocks. ENOSPC here does not mean there was a + * growfs failure, just that there still isn't space for new user data + * after the grow has been run. + */ error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - goto out; - - /* update secondary superblocks. */ - for (agno = 1; agno < nagcount; agno++) { + if (error == -ENOSPC) error = 0; - /* - * new secondary superblocks need to be zeroed, not read from - * disk as the contents of the new area we are growing into is - * completely unknown. 
- */ - if (agno < oagcount) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, - &xfs_sb_buf_ops); - } else { - bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (bp) { - bp->b_ops = &xfs_sb_buf_ops; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - } else - error = -ENOMEM; - } - - /* - * If we get an error reading or writing alternate superblocks, - * continue. xfs_repair chooses the "best" superblock based - * on most matches; if we break early, we'll leave more - * superblocks un-updated than updated, and xfs_repair may - * pick them over the properly-updated primary. - */ - if (error) { - xfs_warn(mp, - "error %d reading secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) { - xfs_warn(mp, - "write error %d updating secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - } - - out: - return saved_error ? 
saved_error : error; + return error; - error0: +out_trans_cancel: xfs_trans_cancel(tp); return error; } @@ -638,25 +195,71 @@ xfs_growfs_log_private( return -ENOSYS; } +static int +xfs_growfs_imaxpct( + struct xfs_mount *mp, + __u32 imaxpct) +{ + struct xfs_trans *tp; + int dpct; + int error; + + if (imaxpct > 100) + return -EINVAL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + dpct = imaxpct - mp->m_sb.sb_imax_pct; + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} + /* * protected versions of growfs function acquire and release locks on the mount * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, * XFS_IOC_FSGROWFSRT */ - - int xfs_growfs_data( - xfs_mount_t *mp, - xfs_growfs_data_t *in) + struct xfs_mount *mp, + struct xfs_growfs_data *in) { - int error; + int error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; - error = xfs_growfs_data_private(mp, in); + + /* update imaxpct separately to the physical grow of the filesystem */ + if (in->imaxpct != mp->m_sb.sb_imax_pct) { + error = xfs_growfs_imaxpct(mp, in->imaxpct); + if (error) + goto out_error; + } + + if (in->newblocks != mp->m_sb.sb_dblocks) { + error = xfs_growfs_data_private(mp, in); + if (error) + goto out_error; + } + + /* Post growfs calculations needed to reflect new state in operations */ + if (mp->m_sb.sb_imax_pct) { + uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); + +out_error: /* * Increment the generation unconditionally, the error could be from * updating the secondary superblocks, in which case the new size @@ -784,7 
+387,7 @@ xfs_reserve_blocks( do { free = percpu_counter_sum(&mp->m_fdblocks) - mp->m_alloc_set_aside; - if (!free) + if (free <= 0) break; delta = request - mp->m_resblks; diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 20484ed5e919..d023db0862c2 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 3e1cc3001bcb..5169e84ae382 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_sysctl.h" @@ -47,6 +35,7 @@ xfs_param_t xfs_params = { struct xfs_globals xfs_globals = { .log_recovery_delay = 0, /* no delay by default */ + .mount_delay = 0, /* no delay by default */ #ifdef XFS_ASSERT_FATAL .bug_on_assert = true, /* assert failures BUG() */ #else diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9a18f69f6e96..47f417d20a30 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -107,7 +95,8 @@ xfs_inode_free_callback( xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, + &ip->i_itemp->ili_item.li_flags)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -309,6 +298,46 @@ xfs_reinit_inode( } /* + * If we are allocating a new inode, then check what was returned is + * actually a free, empty inode. If we are not allocating an inode, + * then check we didn't find a free inode. 
+ * + * Returns: + * 0 if the inode free state matches the lookup context + * -ENOENT if the inode is free and we are not allocating + * -EFSCORRUPTED if there is any state mismatch at all + */ +static int +xfs_iget_check_free_state( + struct xfs_inode *ip, + int flags) +{ + if (flags & XFS_IGET_CREATE) { + /* should be a free inode */ + if (VFS_I(ip)->i_mode != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", + ip->i_ino, VFS_I(ip)->i_mode); + return -EFSCORRUPTED; + } + + if (ip->i_d.di_nblocks != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx has blocks allocated!", + ip->i_ino); + return -EFSCORRUPTED; + } + return 0; + } + + /* should be an allocated inode */ + if (VFS_I(ip)->i_mode == 0) + return -ENOENT; + + return 0; +} + +/* * Check the validity of the inode we just found it the cache */ static int @@ -357,12 +386,12 @@ xfs_iget_cache_hit( } /* - * If lookup is racing with unlink return an error immediately. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_error; - } /* * If IRECLAIMABLE is set, we've torn down the VFS inode already. @@ -485,29 +514,12 @@ xfs_iget_cache_miss( /* - * If we are allocating a new inode, then check what was returned is - * actually a free, empty inode. If we are not allocating an inode, - * the check we didn't find a free inode. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (flags & XFS_IGET_CREATE) { - if (VFS_I(ip)->i_mode != 0) { - xfs_warn(mp, -"Corruption detected! Free inode 0x%llx not marked free on disk", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - if (ip->i_d.di_nblocks != 0) { - xfs_warn(mp, -"Corruption detected! 
Free inode 0x%llx has blocks allocated!", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - } else if (VFS_I(ip)->i_mode == 0) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_destroy; - } /* * Preload the radix tree so we can insert safely under the @@ -1802,3 +1814,21 @@ xfs_inode_clear_cowblocks_tag( return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } + +/* Disable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_disable_reclaim( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); +} + +/* Enable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_enable_reclaim( + struct xfs_mount *mp) +{ + xfs_queue_eofblocks(mp); + xfs_queue_cowblocks(mp); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d4a77588eca1..26c0626f1f75 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef XFS_SYNC_H #define XFS_SYNC_H 1 @@ -131,4 +119,7 @@ xfs_fs_eofblocks_from_user( int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); +void xfs_icache_disable_reclaim(struct xfs_mount *mp); +void xfs_icache_enable_reclaim(struct xfs_mount *mp); + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 865ad1373e5e..8381d34cb102 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008-2010, 2013 Dave Chinner * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -91,7 +79,7 @@ xfs_icreate_item_unlock( { struct xfs_icreate_item *icp = ICR_ITEM(lip); - if (icp->ic_item.li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) kmem_zone_free(xfs_icreate_zone, icp); return; } @@ -184,5 +172,5 @@ xfs_icreate_log( xfs_trans_add_item(tp, &icp->ic_item); tp->t_flags |= XFS_TRANS_DIRTY; - icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h index 59e89f87c09b..a50d0b01e15a 100644 --- a/fs/xfs/xfs_icreate_item.h +++ b/fs/xfs/xfs_icreate_item.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008-2010, Dave Chinner * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef XFS_ICREATE_ITEM_H #define XFS_ICREATE_ITEM_H 1 diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2b70c8b4cee2..5df4de666cc1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/log2.h> #include <linux/iversion.h> @@ -498,7 +486,7 @@ again: if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) try_lock++; } } @@ -598,7 +586,7 @@ xfs_lock_two_inodes( * and try again. */ lp = (xfs_log_item_t *)ip0->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { xfs_iunlock(ip0, ip0_mode); if ((++attempts % 5) == 0) @@ -773,7 +761,7 @@ xfs_ialloc( xfs_inode_t *ip; uint flags; int error; - struct timespec tv; + struct timespec64 tv; struct inode *inode; /* @@ -791,6 +779,18 @@ xfs_ialloc( ASSERT(*ialloc_context == NULL); /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + return -EFSCORRUPTED; + } + + /* * Get the in-core inode with the lock held exclusively. 
* This is because we're setting fields here we need * to prevent others from looking at until we're done. @@ -1196,6 +1196,7 @@ xfs_create( unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Reserve disk quota and the inode. @@ -1411,11 +1412,11 @@ xfs_link( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(sip, 0); + error = xfs_qm_dqattach(sip); if (error) goto std_return; - error = xfs_qm_dqattach(tdp, 0); + error = xfs_qm_dqattach(tdp); if (error) goto std_return; @@ -1451,6 +1452,7 @@ xfs_link( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Handle initial link state of O_TMPFILE inode @@ -1534,11 +1536,12 @@ xfs_itruncate_clear_reflink_flags( * dirty on error so that transactions can be easily aborted if possible. */ int -xfs_itruncate_extents( +xfs_itruncate_extents_flags( struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, - xfs_fsize_t new_size) + xfs_fsize_t new_size, + int flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; @@ -1561,6 +1564,8 @@ xfs_itruncate_extents( trace_xfs_itruncate_extents_start(ip, new_size); + flags |= xfs_bmapi_aflag(whichfork); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1579,12 +1584,9 @@ xfs_itruncate_extents( unmap_len = last_block - first_unmap_block + 1; while (!done) { xfs_defer_init(&dfops, &first_block); - error = xfs_bunmapi(tp, ip, - first_unmap_block, unmap_len, - xfs_bmapi_aflag(whichfork), - XFS_ITRUNC_MAX_EXTENTS, - &first_block, &dfops, - &done); + error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS, &first_block, + &dfops, &done); if (error) goto out_bmap_cancel; @@ -1811,6 +1813,7 @@ xfs_inactive_ifree( xfs_trans_ijoin(tp, ip, 0); xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_ifree(tp, ip, &dfops); if (error) { /* @@ -1911,7 +1914,7 
@@ xfs_inactive( ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return; @@ -2075,10 +2078,15 @@ xfs_iunlink_remove( * list this inode will go on. */ agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); + if (!xfs_verify_agino(mp, agno, agino)) + return -EFSCORRUPTED; bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); - ASSERT(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, + be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + agi, sizeof(*agi)); + return -EFSCORRUPTED; + } if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { /* @@ -2156,8 +2164,12 @@ xfs_iunlink_remove( last_offset = imap.im_boffset; next_agino = be32_to_cpu(last_dip->di_next_unlinked); - ASSERT(next_agino != NULLAGINO); - ASSERT(next_agino != 0); + if (!xfs_verify_agino(mp, agno, next_agino)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + last_dip, sizeof(*last_dip)); + return -EFSCORRUPTED; + } } /* @@ -2246,7 +2258,7 @@ xfs_ifree_cluster( */ ioffset = inum - xic->first_ino; if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { - ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + ASSERT(ioffset % inodes_per_cluster == 0); continue; } @@ -2574,11 +2586,11 @@ xfs_remove( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) goto std_return; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) goto std_return; @@ -2647,6 +2659,7 @@ xfs_remove( goto out_trans_cancel; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &dfops, resblks); if (error) { @@ -3014,6 +3027,7 @@ xfs_rename( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* RENAME_EXCHANGE is 
unique from here on. */ if (flags & RENAME_EXCHANGE) @@ -3222,7 +3236,6 @@ xfs_iflush_cluster( struct xfs_inode *cip; int nr_found; int clcount = 0; - int bufwasdelwri; int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); @@ -3346,37 +3359,22 @@ cluster_corrupt_out: * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - /* - * Clean up the buffer. If it was delwri, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); - if (bufwasdelwri) - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (bp->b_iodone) { - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - } else { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - } - } - /* - * Unlocks the flush lock + * We'll always have an inode attached to the buffer for completion + * process by the time we are called from xfs_iflush(). Hence we have + * always need to do IO completion processing to abort the inodes + * attached to the buffer. handle them just like the shutdown case in + * xfs_buf_submit(). */ + ASSERT(bp->b_iodone); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); xfs_perag_put(pag); @@ -3472,12 +3470,17 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: - * see if other inodes can be gathered into this write + * inode clustering: try to gather other inodes into this write + * + * Note: Any error during clustering will result in the filesystem + * being shut down and completion callbacks run on the cluster buffer. 
+ * As we have already flushed and attached this inode to the buffer, + * it has already been aborted and released by xfs_iflush_cluster() and + * so we have no further error handling to do here. */ error = xfs_iflush_cluster(ip, bp); if (error) - goto cluster_corrupt_out; + return error; *bpp = bp; return 0; @@ -3486,12 +3489,8 @@ corrupt_out: if (bp) xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -cluster_corrupt_out: - error = -EFSCORRUPTED; abort_out: - /* - * Unlocks the flush lock - */ + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(ip, false); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1eebc53df7d7..2ed63a49e890 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ @@ -379,6 +367,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) >> XFS_ILOCK_SHIFT) /* + * Layouts are broken in the BREAK_WRITE case to ensure that + * layout-holders do not collide with local writes. Additionally, + * layouts are broken in the BREAK_UNMAP case to make sure the + * layout-holder has a consistent view of the file's extent map. 
While + * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases, + * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to + * go idle. + */ +enum layout_break_reason { + BREAK_WRITE, + BREAK_UNMAP, +}; + +/* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. @@ -415,8 +417,8 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *); uint xfs_ip2xflags(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_defer_ops *); -int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, - int, xfs_fsize_t); +int xfs_itruncate_extents_flags(struct xfs_trans **, + struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); @@ -433,6 +435,16 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, struct xfs_inode **); +static inline int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); +} + /* from xfs_file.c */ enum xfs_prealloc_flags { XFS_PREALLOC_SET = (1 << 1), @@ -443,6 +455,8 @@ enum xfs_prealloc_flags { int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); +int xfs_break_layouts(struct inode *inode, uint *iolock, + enum layout_break_reason reason); /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 34b91b789702..2389c34c172d 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -518,7 +506,7 @@ xfs_inode_item_push( * The buffer containing this item failed to be written back * previously. Resubmit the buffer for IO. */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -729,14 +717,14 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - (blip->li_flags & XFS_LI_FAILED)) + test_bit(XFS_LI_FAILED, &blip->li_flags)) need_ail++; } /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; /* @@ -803,7 +791,7 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { xfs_trans_ail_remove(&iip->ili_item, stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index b72373a33cd9..27081eba220c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. 
* All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_INODE_ITEM_H__ #define __XFS_INODE_ITEM_H__ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 89fb1eb80aae..0ef5ece5634c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -39,7 +27,6 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_trans.h" -#include "xfs_pnfs.h" #include "xfs_acl.h" #include "xfs_btree.h" #include <linux/fsmap.h> @@ -614,7 +601,7 @@ xfs_ioc_space( struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; /* @@ -644,13 +631,10 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1098,12 +1082,15 @@ xfs_ioctl_setattr_dax_invalidate( /* * It is only valid to set the DAX flag on regular files and * directories on filesystems where the block size is equal to the page - * size. On directories it serves as an inherit hint. + * size. On directories it serves as an inherited hint so we don't + * have to check the device for dax support or flush pagecache. 
*/ if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (bdev_dax_supported(sb, sb->s_blocksize) < 0) + if (S_ISREG(inode->i_mode) && + !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + sb->s_blocksize)) return -EINVAL; } @@ -1113,6 +1100,9 @@ xfs_ioctl_setattr_dax_invalidate( if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode)) return 0; + if (S_ISDIR(inode->i_mode)) + return 0; + /* lock, flush and invalidate mapping in preparation for flag change */ xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL); error = filemap_write_and_wait(inode->i_mapping); @@ -1811,6 +1801,88 @@ xfs_ioc_swapext( return error; } +static int +xfs_ioc_getlabel( + struct xfs_mount *mp, + char __user *user_label) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + + /* Paranoia */ + BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); + + /* 1 larger than sb_fname, so this ensures a trailing NUL char */ + memset(label, 0, sizeof(label)); + spin_lock(&mp->m_sb_lock); + strncpy(label, sbp->sb_fname, XFSLABEL_MAX); + spin_unlock(&mp->m_sb_lock); + + if (copy_to_user(user_label, label, sizeof(label))) + return -EFAULT; + return 0; +} + +static int +xfs_ioc_setlabel( + struct file *filp, + struct xfs_mount *mp, + char __user *newlabel) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + size_t len; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much + * smaller, at 12 bytes. We copy one more to be sure we find the + * (required) NULL character to test the incoming label length. + * NB: The on disk label doesn't need to be null terminated. 
+ */ + if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1)) + return -EFAULT; + len = strnlen(label, XFSLABEL_MAX + 1); + if (len > sizeof(sbp->sb_fname)) + return -EINVAL; + + error = mnt_want_write_file(filp); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname)); + memcpy(sbp->sb_fname, label, len); + spin_unlock(&mp->m_sb_lock); + + /* + * Now we do several things to satisfy userspace. + * In addition to normal logging of the primary superblock, we also + * immediately write these changes to sector zero for the primary, then + * update all backup supers (as xfs_db does for a label change), then + * invalidate the block device page cache. This is so that any prior + * buffered reads from userspace (i.e. from blkid) are invalidated, + * and userspace will see the newly-written label. + */ + error = xfs_sync_sb_buf(mp); + if (error) + goto out; + /* + * growfs also updates backup supers so lock against that. + */ + mutex_lock(&mp->m_growlock); + error = xfs_update_secondary_sbs(mp); + mutex_unlock(&mp->m_growlock); + + invalidate_bdev(mp->m_ddev_targp->bt_bdev); + +out: + mnt_drop_write_file(filp); + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. @@ -1834,6 +1906,10 @@ xfs_file_ioctl( switch (cmd) { case FITRIM: return xfs_ioc_trim(mp, arg); + case FS_IOC_GETFSLABEL: + return xfs_ioc_getlabel(mp, arg); + case FS_IOC_SETFSLABEL: + return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 8de879f0c7d5..4b17f67c888a 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IOCTL_H__ #define __XFS_IOCTL_H__ diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 10fbde359649..fba115f4103a 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2004-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/compat.h> #include <linux/ioctl.h> diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 5492bcf6f442..d28fa824284a 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2004-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IOCTL32_H__ #define __XFS_IOCTL32_H__ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 046469fcc1b8..55876dd02f0c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2016 Christoph Hellwig. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/iomap.h> #include "xfs.h" @@ -200,7 +188,7 @@ xfs_iomap_write_direct( goto out_unlock; } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) - last_fsb = MIN(last_fsb, (xfs_fileoff_t) + last_fsb = min(last_fsb, (xfs_fileoff_t) imap->br_blockcount + imap->br_startoff); } @@ -224,7 +212,7 @@ xfs_iomap_write_direct( * necessary and move on to transaction setup. */ xfs_iunlock(ip, lockmode); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -488,8 +476,8 @@ xfs_iomap_prealloc_size( * The shift throttle value is set to the maximum value as determined by * the global low free space values and per-quota low free space values. */ - alloc_blocks = MIN(alloc_blocks, qblocks); - shift = MAX(shift, qshift); + alloc_blocks = min(alloc_blocks, qblocks); + shift = max(shift, qshift); if (shift) alloc_blocks >>= shift; @@ -576,7 +564,7 @@ xfs_file_iomap_begin_delay( goto done; } - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -692,7 +680,7 @@ xfs_iomap_write_allocate( /* * Make sure that the dquots are there. 
*/ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -946,8 +934,11 @@ error_on_bmapi_transaction: return error; } -static inline bool imap_needs_alloc(struct inode *inode, - struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +imap_needs_alloc( + struct inode *inode, + struct xfs_bmbt_irec *imap, + int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || @@ -955,31 +946,71 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } -static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +needs_cow_for_zeroing( + struct xfs_bmbt_irec *imap, + int nimaps) { return nimaps && imap->br_startblock != HOLESTARTBLOCK && imap->br_state != XFS_EXT_UNWRITTEN; } -static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +static int +xfs_ilock_for_iomap( + struct xfs_inode *ip, + unsigned flags, + unsigned *lockmode) { + unsigned mode = XFS_ILOCK_SHARED; + bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) - return true; + if (xfs_is_reflink_inode(ip) && is_write) { + /* + * FIXME: It could still overwrite on unshared extents and not + * need allocation. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } /* - * Extents not yet cached requires exclusive access, don't block. - * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * Extents not yet cached requires exclusive access, don't block. This + * is an opencoded xfs_ilock_data_map_shared() call but with * non-blocking behaviour. 
*/ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - !(ip->i_df.if_flags & XFS_IFEXTENTS)) - return true; - return false; + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } + +relock: + if (flags & IOMAP_NOWAIT) { + if (!xfs_ilock_nowait(ip, mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, mode); + } + + /* + * The reflink iflag could have changed since the earlier unlocked + * check, so if we got ILOCK_SHARED for a write and but we're now a + * reflink inode we have to switch to ILOCK_EXCL and relock. + */ + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, mode); + mode = XFS_ILOCK_EXCL; + goto relock; + } + + *lockmode = mode; + return 0; } static int @@ -1007,19 +1038,15 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) - lockmode = XFS_ILOCK_EXCL; - else - lockmode = XFS_ILOCK_SHARED; - - if (flags & IOMAP_NOWAIT) { - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, lockmode)) - return -EAGAIN; - } else { - xfs_ilock(ip, lockmode); - } + /* + * Lock the inode in the manner required for the specified operation and + * check for as many conditions that would result in blocking as + * possible. This removes most of the non-blocking checks from the + * mapping code below. + */ + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; ASSERT(offset <= mp->m_super->s_maxbytes); if (offset > mp->m_super->s_maxbytes - length) @@ -1040,19 +1067,21 @@ xfs_file_iomap_begin( goto out_unlock; } - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { + /* Non-modifying mapping requested, so we are done */ + if (!(flags & (IOMAP_WRITE | IOMAP_ZERO))) + goto out_found; + + /* + * Break shared extents if necessary. 
Checks for non-blocking IO have + * been done up front, so we don't need to do them here. + */ + if (xfs_is_reflink_inode(ip)) { + /* if zeroing doesn't need COW allocation, then we are done. */ + if ((flags & IOMAP_ZERO) && + !needs_cow_for_zeroing(&imap, nimaps)) + goto out_found; + if (flags & IOMAP_DIRECT) { - /* - * A reflinked inode will result in CoW alloc. - * FIXME: It could still overwrite on unshared extents - * and not need allocation. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1068,46 +1097,45 @@ xfs_file_iomap_begin( length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { - /* - * If nowait is set bail since we are going to make - * allocations. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES - * pages to keep the chunks of work done where somewhat symmetric - * with the work writeback does. This is a completely arbitrary - * number pulled out of thin air as a best guess for initial - * testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - length = min_t(loff_t, length, 1024 * PAGE_SIZE); - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - if (error) - return error; + /* Don't need to allocate over holes when doing zeroing operations. 
*/ + if (flags & IOMAP_ZERO) + goto out_found; - iomap->flags = IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); - } else { - ASSERT(nimaps); + if (!imap_needs_alloc(inode, &imap, nimaps)) + goto out_found; - xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + /* If nowait is set bail since we are going to make allocations. */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; } + /* + * We cap the maximum length we map to a sane size to keep the chunks + * of work done where somewhat symmetric with the work writeback does. + * This is a completely arbitrary number pulled out of thin air as a + * best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until the + * lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + + /* + * xfs_iomap_write_direct() expects the shared lock. It is unlocked on + * return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + if (error) + return error; + + iomap->flags = IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + +out_finish: if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; @@ -1117,6 +1145,13 @@ xfs_file_iomap_begin( if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; + +out_found: + ASSERT(nimaps); + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + goto out_finish; + out_unlock: xfs_iunlock(ip, lockmode); return error; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index ee535065c5d0..83474c9cede9 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ @@ -42,10 +30,10 @@ xfs_aligned_fsb_count( if (extsz) { xfs_extlen_t align; - align = do_mod(offset_fsb, extsz); + div_u64_rem(offset_fsb, extsz, &align); if (align) count_fsb += align; - align = do_mod(count_fsb, extsz); + div_u64_rem(count_fsb, extsz, &align); if (align) count_fsb += extsz - align; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a3ed3c811dfa..0fa29f39d658 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -37,7 +25,6 @@ #include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_trans_space.h" -#include "xfs_pnfs.h" #include "xfs_iomap.h" #include <linux/capability.h> @@ -260,6 +247,7 @@ xfs_vn_lookup( struct dentry *dentry, unsigned int flags) { + struct inode *inode; struct xfs_inode *cip; struct xfs_name name; int error; @@ -269,14 +257,13 @@ xfs_vn_lookup( xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); - if (unlikely(error)) { - if (unlikely(error != -ENOENT)) - return ERR_PTR(error); - d_add(dentry, NULL); - return NULL; - } - - return d_splice_alias(VFS_I(cip), dentry); + if (likely(!error)) + inode = VFS_I(cip); + else if (likely(error == -ENOENT)) + inode = NULL; + else + inode = ERR_PTR(error); + return d_splice_alias(inode, dentry); } STATIC struct dentry * @@ -855,7 +842,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. 
*/ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1030,14 +1017,19 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint iolock = XFS_IOLOCK_EXCL; + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + uint iolock; - error = xfs_break_layouts(d_inode(dentry), &iolock); - if (error) + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); + if (error) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; + } - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); error = xfs_vn_setattr_size(dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { @@ -1050,7 +1042,7 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec *now, + struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); @@ -1195,6 +1187,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +/* Figure out if this file actually supports DAX. */ +static bool +xfs_inode_supports_dax( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Only supported on non-reflinked files. */ + if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + return false; + + /* DAX mount option or DAX iflag must be set. */ + if (!(mp->m_flags & XFS_MOUNT_DAX) && + !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + return false; + + /* Block size must match page size */ + if (mp->m_sb.sb_blocksize != PAGE_SIZE) + return false; + + /* Device has to support DAX too. 
*/ + return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL; +} + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1213,11 +1229,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (S_ISREG(inode->i_mode) && - ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && - !xfs_is_reflink_inode(ip) && - (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + if (xfs_inode_supports_dax(ip)) inode->i_flags |= S_DAX; } @@ -1250,6 +1262,14 @@ xfs_setup_inode( xfs_diflags_to_iflags(inode, ip); if (S_ISDIR(inode->i_mode)) { + /* + * We set the i_rwsem class here to avoid potential races with + * lockdep_annotate_inode_mutex_key() reinitialising the lock + * after a filehandle lookup has already found the inode in + * cache before it has been unlocked via unlock_new_inode(). + */ + lockdep_set_class(&inode->i_rwsem, + &inode->i_sb->s_type->i_mutex_dir_key); lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); ip->d_ops = ip->i_mount->m_dir_inode_ops; } else { diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 0259a383721a..4d24ff309f59 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_IOPS_H__ #define __XFS_IOPS_H__ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index d58310514423..24f4f1c555b5 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -571,7 +559,7 @@ xfs_inumbers( *lastino != XFS_AGINO_TO_INO(mp, agno, agino)) return error; - bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); + bcount = min(left, (int)(PAGE_SIZE / sizeof(*buffer))); buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP); do { struct xfs_inobt_rec_incore r; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 6ea8b3912fa4..8a822285b671 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ITABLE_H__ #define __XFS_ITABLE_H__ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index bee51a14a906..edbd5a210df2 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LINUX__ #define __XFS_LINUX__ @@ -38,6 +26,7 @@ typedef __u32 xfs_nlink_t; #include <linux/semaphore.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> @@ -151,8 +140,6 @@ typedef __u32 xfs_nlink_t; #define XFS_PROJID_DEFAULT 0 -#define MIN(a,b) (min(a,b)) -#define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) static inline void delay(long ticks) @@ -220,25 +207,6 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() -/* Side effect free 64 bit mod operation */ -static inline __u32 xfs_do_mod(void *a, __u32 b, int n) -{ - switch (n) { - case 4: - return *(__u32 *)a % b; - case 8: - { - __u64 c = *(__u64 *)a; - return do_div(c, b); - } - } - - /* NOTREACHED */ - return 0; -} - -#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) - static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2fcd9ed5d075..5e56f3b93d4b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -1047,6 +1035,7 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_ail); INIT_LIST_HEAD(&item->li_cil); INIT_LIST_HEAD(&item->li_bio_list); + INIT_LIST_HEAD(&item->li_trans); } /* @@ -1640,8 +1629,8 @@ xlog_grant_push_ail( * log, and 256 blocks. */ free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); + free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) return; @@ -2110,10 +2099,10 @@ xlog_print_tic_res( */ void xlog_print_trans( - struct xfs_trans *tp) + struct xfs_trans *tp) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_log_item_desc *lidp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip; /* dump core transaction and ticket info */ xfs_warn(mp, "transaction summary:"); @@ -2124,15 +2113,14 @@ xlog_print_trans( xlog_print_tic_res(mp, tp->t_ticket); /* dump each log item */ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv = lip->li_lv; struct xfs_log_iovec *vec; int i; xfs_warn(mp, "log item: "); xfs_warn(mp, " type = 0x%x", lip->li_type); - xfs_warn(mp, " flags = 0x%x", lip->li_flags); + xfs_warn(mp, " flags = 0x%lx", lip->li_flags); if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index fa8ad31d587f..3c1f6a8b4b70 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4668403b1741..d3884e08b43c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" @@ -141,10 +129,9 @@ xlog_cil_alloc_shadow_bufs( struct xlog *log, struct xfs_trans *tp) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; @@ -152,7 +139,7 @@ xlog_cil_alloc_shadow_bufs( bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* get number of vecs and size of data to be stored */ @@ -317,7 +304,7 @@ xlog_cil_insert_format_items( int *diff_len, int *diff_iovecs) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; /* Bail out if we didn't find a log item. */ @@ -326,15 +313,14 @@ xlog_cil_insert_format_items( return; } - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; struct xfs_log_vec *old_lv = NULL; struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -406,7 +392,7 @@ xlog_cil_insert_items( { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int len = 0; int diff_iovecs = 0; int iclog_space; @@ -479,11 +465,10 @@ xlog_cil_insert_items( * We do this here so we only need to take the CIL lock once during * the transaction commit. 
*/ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -1013,6 +998,7 @@ xfs_log_commit_cil( *commit_lsn = xc_commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, regrant); + tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 129975970d99..b5f82cb36202 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2b2383f1895e..b181b5f57a19 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -1248,6 +1236,25 @@ xlog_verify_head( } /* + * We need to make sure we handle log wrapping properly, so we can't use the + * calculated logbno directly. Make sure it wraps to the correct bno inside the + * log. + * + * The log is limited to 32 bit sizes, so we use the appropriate modulus + * operation here and cast it back to a 64 bit daddr on return. + */ +static inline xfs_daddr_t +xlog_wrap_logbno( + struct xlog *log, + xfs_daddr_t bno) +{ + int mod; + + div_s64_rem(bno, log->l_logBBsize, &mod); + return mod; +} + +/* * Check whether the head of the log points to an unmount record. In other * words, determine whether the log is clean. If so, update the in-core state * appropriately. @@ -1295,12 +1302,13 @@ xlog_check_unmount_rec( } else { hblks = 1; } - after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); - after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); + + after_umount_blk = xlog_wrap_logbno(log, + rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len))); + if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = rhead_blk + hblks; - umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); + umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks); error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) return error; @@ -1816,7 +1824,7 @@ xlog_clear_stale_blocks( * we don't waste all day writing from the head to the tail * for no reason. 
*/ - max_distance = MIN(max_distance, tail_distance); + max_distance = min(max_distance, tail_distance); if ((head_block + max_distance) <= log->l_logBBsize) { /* @@ -2702,7 +2710,7 @@ xlog_recover_do_reg_buffer( goto next; } fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0, 0); + -1, 0); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -2884,14 +2892,14 @@ xlog_recover_buffer_pass2( * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size) * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize, (uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); @@ -3115,7 +3123,8 @@ xlog_recover_inode_pass2( if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip); + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr "PTR_FMT", " "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", @@ -3128,7 +3137,8 @@ xlog_recover_inode_pass2( (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip); + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr "PTR_FMT", " "ino ptr = 
"PTR_FMT", ino bp = "PTR_FMT", ino %Ld", @@ -3139,7 +3149,8 @@ xlog_recover_inode_pass2( } if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip); + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", @@ -3151,7 +3162,8 @@ xlog_recover_inode_pass2( } if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip); + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, @@ -3162,7 +3174,8 @@ xlog_recover_inode_pass2( isize = xfs_log_dinode_size(ldip->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip); + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr "PTR_FMT, __func__, item->ri_buf[1].i_len, item); @@ -3348,7 +3361,7 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); @@ -5466,9 +5479,7 @@ xlog_do_recovery_pass( */ if (blk_no + bblks <= log->l_logBBsize || blk_no >= log->l_logBBsize) { - /* mod blk_no in case the header wrapped and - * pushed it beyond the end of the log */ - rblk_no = do_mod(blk_no, log->l_logBBsize); + rblk_no = xlog_wrap_logbno(log, blk_no); error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index e68bd1050eab..576c375ce12a 100644 --- a/fs/xfs/xfs_message.c +++ 
b/fs/xfs/xfs_message.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a901b86772f8..a3378252baa1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -874,9 +862,12 @@ xfs_mountfs( * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. 
*/ - error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); + error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED, + XFS_ILOCK_EXCL, &rip); if (error) { - xfs_warn(mp, "failed to read root inode"); + xfs_warn(mp, + "Failed to read root inode 0x%llx, error %d", + sbp->sb_rootino, -error); goto out_log_dealloc; } @@ -1072,9 +1063,7 @@ xfs_unmountfs( uint64_t resblks; int error; - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); - + xfs_icache_disable_reclaim(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 10b90bbc5162..245349d1e23f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ @@ -283,7 +271,7 @@ xfs_preferred_iosize(xfs_mount_t *mp) return (mp->m_swidth ? (mp->m_swidth << mp->m_sb.sb_blocklog) : ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ? 
- (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) : + (1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) : PAGE_SIZE)); } diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 70eea7ae2876..74738813f60d 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_mru_cache.h" diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index b3f3fbdfcc47..f1fde1ecf730 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_MRU_CACHE_H__ #define __XFS_MRU_CACHE_H__ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0492436a053f..d3e04d20d8d4 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Oracle. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_ONDISK_H #define __XFS_ONDISK_H diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index aa6c5c193f45..f44c3599527d 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,19 +31,20 @@ * rules in the page fault path we don't bother. 
*/ int -xfs_break_layouts( +xfs_break_leased_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool *did_unlock) { struct xfs_inode *ip = XFS_I(inode); int error; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + *did_unlock = true; error = break_layout(inode, true); - *iolock = XFS_IOLOCK_EXCL; + *iolock &= ~XFS_IOLOCK_SHARED; + *iolock |= XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); } @@ -120,8 +121,8 @@ xfs_fs_map_blocks( * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more - * complicated. See the comment near xfs_break_layouts for a detailed - * explanation. + * complicated. See the comment near xfs_break_leased_layouts + * for a detailed explanation. */ xfs_ilock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index bf45951e28fe..940c6c2ad88c 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_leased_layouts(struct inode *inode, uint *iolock, + bool *did_unlock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock) +xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ec39ae274c78..9ceb85cce33a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -161,10 +149,7 @@ xfs_qm_dqpurge( * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); - } else { + if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } @@ -173,7 +158,7 @@ xfs_qm_dqpurge( ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); @@ -265,7 +250,7 @@ xfs_qm_dqattach_one( xfs_inode_t *ip, xfs_dqid_t id, uint type, - uint doalloc, + bool doalloc, xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -291,7 +276,7 @@ xfs_qm_dqattach_one( * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp); + error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); if (error) return error; @@ -326,14 +311,14 @@ xfs_qm_need_dqattach( /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. - * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * If @doalloc is true, the dquot(s) will be allocated if needed. 
* Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. */ int xfs_qm_dqattach_locked( xfs_inode_t *ip, - uint flags) + bool doalloc) { xfs_mount_t *mp = ip->i_mount; int error = 0; @@ -345,8 +330,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - flags & XFS_QMOPT_DQALLOC, - &ip->i_udquot); + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); @@ -354,8 +338,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - flags & XFS_QMOPT_DQALLOC, - &ip->i_gdquot); + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); @@ -363,8 +346,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, - flags & XFS_QMOPT_DQALLOC, - &ip->i_pdquot); + doalloc, &ip->i_pdquot); if (error) goto done; ASSERT(ip->i_pdquot); @@ -381,8 +363,7 @@ done: int xfs_qm_dqattach( - struct xfs_inode *ip, - uint flags) + struct xfs_inode *ip) { int error; @@ -390,7 +371,7 @@ xfs_qm_dqattach( return 0; xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_qm_dqattach_locked(ip, flags); + error = xfs_qm_dqattach_locked(ip, false); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -479,11 +460,8 @@ xfs_qm_dquot_isolate( spin_unlock(lru_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); + if (error) goto out_unlock_dirty; - } xfs_buf_delwri_queue(bp, &isol->buffers); xfs_buf_relse(bp); @@ -571,27 +549,88 @@ xfs_qm_set_defquota( { xfs_dquot_t *dqp; struct xfs_def_quota *defq; + struct xfs_disk_dquot *ddqp; int error; - error = xfs_qm_dqread(mp, 0, type, 0, &dqp); + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; + + ddqp = &dqp->q_core; + defq = xfs_get_defquota(dqp, qinf); + + /* + * Timers 
and warnings have been already set, let's just set the + * default limits for this quota type + */ + defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + xfs_qm_dqdestroy(dqp); +} - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; +/* Initialize quota time limits from the root dquot. */ +static void +xfs_qm_init_timelimits( + struct xfs_mount *mp, + struct xfs_quotainfo *qinf) +{ + struct xfs_disk_dquot *ddqp; + struct xfs_dquot *dqp; + uint type; + int error; - defq = xfs_get_defquota(dqp, qinf); + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; - /* - * Timers and warnings have been already set, let's just set the - * default limits for this quota type - */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - xfs_qm_dqdestroy(dqp); - } + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + * + * Timers and warnings are globally set by the first timer found in + * user/group/proj quota types, otherwise a default value is used. 
+ * This should be split into different fields per quota type. + */ + if (XFS_IS_UQUOTA_RUNNING(mp)) + type = XFS_DQ_USER; + else if (XFS_IS_GQUOTA_RUNNING(mp)) + type = XFS_DQ_GROUP; + else + type = XFS_DQ_PROJ; + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; + + ddqp = &dqp->q_core; + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + if (ddqp->d_btimer) + qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); + if (ddqp->d_itimer) + qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); + if (ddqp->d_rtbtimer) + qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + if (ddqp->d_bwarns) + qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); + if (ddqp->d_iwarns) + qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); + if (ddqp->d_rtbwarns) + qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + + xfs_qm_dqdestroy(dqp); } /* @@ -600,11 +639,10 @@ xfs_qm_set_defquota( */ STATIC int xfs_qm_init_quotainfo( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_quotainfo_t *qinf; - int error; - xfs_dquot_t *dqp; + struct xfs_quotainfo *qinf; + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -636,52 +674,7 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - /* - * We try to get the limits from the superuser's limits fields. - * This is quite hacky, but it is standard quota practice. - * - * Since we may not have done a quotacheck by this point, just read - * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. - */ - error = xfs_qm_dqread(mp, 0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? 
XFS_DQ_GROUP : - XFS_DQ_PROJ), - 0, &dqp); - - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; - - /* - * The warnings and timers set the grace period given to - * a user or group before he or she can not perform any - * more writing. If it is zero, a default is used. - */ - qinf->qi_btimelimit = ddqp->d_btimer ? - be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = ddqp->d_itimer ? - be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? - be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = ddqp->d_bwarns ? - be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = ddqp->d_iwarns ? - be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? - be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - xfs_qm_dqdestroy(dqp); - } else { - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; - } + xfs_qm_init_timelimits(mp, qinf); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); @@ -865,9 +858,9 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dquot_verify(mp, ddq, id + j, type, 0); + fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); if (fa) - xfs_dquot_repair(mp, ddq, id + j, type); + xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for @@ -893,7 +886,7 @@ xfs_qm_reset_dqcounts( } STATIC int -xfs_qm_dqiter_bufs( +xfs_qm_reset_dqcounts_all( struct xfs_mount *mp, xfs_dqid_t firstid, xfs_fsblock_t bno, @@ -961,11 +954,11 @@ xfs_qm_dqiter_bufs( } /* - * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a - * caller supplied function for every chunk of dquots that we find. 
+ * Iterate over all allocated dquot blocks in this quota inode, zeroing all + * counters for every chunk of dquots that we find. */ STATIC int -xfs_qm_dqiterate( +xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, uint flags, @@ -1041,7 +1034,7 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - error = xfs_qm_dqiter_bufs(mp, firstid, + error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, flags, buffer_list); @@ -1066,16 +1059,17 @@ out: STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, - xfs_dqid_t id, uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; + xfs_dqid_t id; int error; - error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp); + id = xfs_qm_id_for_quotatype(ip, type); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. @@ -1148,13 +1142,10 @@ xfs_qm_dqusage_adjust( } /* - * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget - * interface expects the inode to be exclusively locked because that's - * the case in all other instances. It's OK that we do this because - * quotacheck is done only at mount time. + * We don't _need_ to take the ilock EXCL here because quotacheck runs + * at mount time and therefore nobody will be racing chown/chproj. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL, - &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; @@ -1189,33 +1180,31 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). 
*/ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, - XFS_DQ_USER, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, - XFS_DQ_GROUP, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), - XFS_DQ_PROJ, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + rtblks); if (error) goto error0; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_DIDONE; return 0; error0: - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_GIVEUP; return error; @@ -1247,9 +1236,8 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen); - bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -EINVAL; goto out_unlock; @@ -1307,7 +1295,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); if (error) goto error_return; @@ -1315,7 +1303,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, &buffer_list); if (error) goto error_return; @@ -1323,7 +1311,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; @@ -1675,7 +1663,7 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. 
*/ if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + error = xfs_qm_dqattach_locked(ip, true); if (error) { xfs_iunlock(ip, lockflags); return error; @@ -1694,10 +1682,7 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, uid, - XFS_DQ_USER, - XFS_QMOPT_DQALLOC, - &uq); + error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1720,10 +1705,7 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, gid, - XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC, - &gq); + error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1739,10 +1721,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, - XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC, - &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, + true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1933,7 +1913,7 @@ xfs_qm_vop_rename_dqattach( */ if (i == 0 || ip != i_tab[i-1]) { if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 2975a822e9f0..3ccf0fbc9071 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_QM_H__ #define __XFS_QM_H__ @@ -170,8 +158,10 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *, - uint, struct qc_dqblk *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct qc_dqblk *); +extern int xfs_qm_scall_getquota_next(struct xfs_mount *, + xfs_dqid_t *, uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 2be6d2735ca9..73a1d77ec187 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -72,7 +60,7 @@ xfs_qm_statvfs( xfs_mount_t *mp = ip->i_mount; xfs_dquot_t *dqp; - if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9cb5c381b01c..abc8a21e3a82 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/capability.h> @@ -425,7 +413,7 @@ xfs_qm_scall_setqlim( * a reference to the dquot, so it's safe to do this unlock/lock without * it being reclaimed in the mean time. */ - error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { ASSERT(error != -ENOENT); goto out_unlock; @@ -622,39 +610,14 @@ out: return error; } - -int -xfs_qm_scall_getquota( +/* Fill out the quota context. 
*/ +static void +xfs_qm_scall_getquota_fill_qc( struct xfs_mount *mp, - xfs_dqid_t *id, uint type, - struct qc_dqblk *dst, - uint dqget_flags) + const struct xfs_dquot *dqp, + struct qc_dqblk *dst) { - struct xfs_dquot *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp); - if (error) - return error; - - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - error = -ENOENT; - goto out_put; - } - - /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); - memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); @@ -696,7 +659,7 @@ xfs_qm_scall_getquota( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - *id != 0) { + dqp->q_core.d_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); @@ -707,11 +670,69 @@ xfs_qm_scall_getquota( } } #endif +} + +/* Return the quota information for the dquot matching id. */ +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so don't + * set doalloc. If it doesn't exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. 
+ */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = -ENOENT; + goto out_put; + } + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + out_put: xfs_qm_dqput(dqp); return error; } +/* + * Return the quota information for the first initialized dquot whose id + * is at least as high as id. + */ +int +xfs_qm_scall_getquota_next( + struct xfs_mount *mp, + xfs_dqid_t *id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget_next(mp, *id, type, &dqp); + if (error) + return error; + + /* Fill in the ID we actually read from disk */ + *id = be32_to_cpu(dqp->q_core.d_id); + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + + xfs_qm_dqput(dqp); + return error; +} STATIC int xfs_dqrele_inode( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index ce6506adab7b..55b798265ef7 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ @@ -48,6 +36,22 @@ struct xfs_trans; (XFS_IS_PQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) +static inline uint +xfs_quota_chkd_flag( + uint dqtype) +{ + switch (dqtype) { + case XFS_DQ_USER: + return XFS_UQUOTA_CHKD; + case XFS_DQ_GROUP: + return XFS_GQUOTA_CHKD; + case XFS_DQ_PROJ: + return XFS_PQUOTA_CHKD; + default: + return 0; + } +} + /* * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. @@ -90,8 +94,8 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, uint); -extern int xfs_qm_dqattach(struct xfs_inode *, uint); -extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *); +extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); @@ -132,7 +136,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) #define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) -#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a65108594a07..205fbb2a77e4 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * 
Copyright (c) 2008, Christoph Hellwig * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_format.h" @@ -239,8 +227,7 @@ xfs_fs_get_dqblk( return -ESRCH; id = from_kqid(&init_user_ns, qid); - return xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid.type), qdq, 0); + return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); } /* Return quota info for active quota >= this qid */ @@ -260,9 +247,8 @@ xfs_fs_get_nextdqblk( return -ESRCH; id = from_kqid(&init_user_ns, *qid); - ret = xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid->type), qdq, - XFS_QMOPT_DQNEXT); + ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), + qdq); if (ret) return ret; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 15c9393dd7a7..472a73e9d331 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -159,7 +145,7 @@ STATIC void xfs_cui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_cui_release(CUI_ITEM(lip)); } @@ -310,7 +296,7 @@ xfs_cud_item_unlock( { struct xfs_cud_log_item *cudp = CUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_cui_release(cudp->cud_cuip); kmem_zone_free(xfs_cud_zone, cudp); } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 0e5327349a13..dd830b69cd1e 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_REFCOUNT_ITEM_H__ #define __XFS_REFCOUNT_ITEM_H__ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cdbd342a5249..592fb2071a03 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -305,7 +291,7 @@ xfs_reflink_reserve_cow( * Fork all the shared blocks from our write offset until the end of * the extent. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) return error; @@ -431,7 +417,7 @@ retry: if (error) return error; - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out; goto retry; @@ -552,6 +538,9 @@ xfs_reflink_trim_irec_to_next_cow( * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. 
+ * + * Caller must have already joined the inode to the current transaction. The + * inode will be joined to the transaction returned to the caller. */ int xfs_reflink_cancel_cow_blocks( @@ -592,7 +581,6 @@ xfs_reflink_cancel_cow_blocks( if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { - xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ @@ -1359,7 +1347,7 @@ xfs_reflink_remap_range( goto out_unlock; /* Attach dquots to dest inode before changing block map */ - ret = xfs_qm_dqattach(dest, 0); + ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; @@ -1551,7 +1539,12 @@ next: return 0; } -/* Clear the inode reflink flag if there are no shared extents. */ +/* + * Clear the inode reflink flag if there are no shared extents. + * + * The caller is responsible for joining the inode to the transaction passed in. + * The inode will be joined to the transaction that is returned to the caller. + */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, @@ -1578,7 +1571,6 @@ xfs_reflink_clear_inode_flag( trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); - xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 701487bab468..1532827ba911 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 06a07846c9b3..127dc9c32a54 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include "xfs.h" #include "xfs_fs.h" @@ -158,7 +144,7 @@ STATIC void xfs_rui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_rui_release(RUI_ITEM(lip)); } @@ -331,7 +317,7 @@ xfs_rud_item_unlock( { struct xfs_rud_log_item *rudp = RUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_rui_release(rudp->rud_ruip); kmem_zone_free(xfs_rud_zone, rudp); } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 340c968e1f9c..7e482baa27f5 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __XFS_RMAP_ITEM_H__ #define __XFS_RMAP_ITEM_H__ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 488719d43ca8..329d4d26c13e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -313,8 +301,12 @@ xfs_rtallocate_extent_block( /* * If size should be a multiple of prod, make that so. */ - if (prod > 1 && (p = do_mod(bestlen, prod))) - bestlen -= p; + if (prod > 1) { + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; + } + /* * Allocate besti for bestlen & return that. */ @@ -1275,7 +1267,7 @@ xfs_rtpick_extent( b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> (log2 + 1); if (b >= mp->m_sb.sb_rextents) - b = do_mod(b, mp->m_sb.sb_rextents); + div64_u64_rem(b, mp->m_sb.sb_rextents, &b); if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index dfee3c991155..93e77b221355 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_RTALLOC_H__ #define __XFS_RTALLOC_H__ @@ -23,9 +11,14 @@ struct xfs_mount; struct xfs_trans; +/* + * XXX: Most of the realtime allocation functions deal in units of realtime + * extents, not realtime blocks. This looks funny when paired with the type + * name and screams for a larger cleanup. + */ struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startblock; - xfs_rtblock_t ar_blockcount; + xfs_rtblock_t ar_startext; + xfs_rtblock_t ar_extcount; }; typedef int (*xfs_rtalloc_query_range_fn)( diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 056e12b421eb..4e4423153071 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include <linux/proc_fs.h> @@ -113,6 +101,7 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) } } +#ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA static int xqm_proc_show(struct seq_file *m, void *v) @@ -124,18 +113,6 @@ static int xqm_proc_show(struct seq_file *m, void *v) return 0; } -static int xqm_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqm_proc_show, NULL); -} - -static const struct file_operations xqm_proc_fops = { - .open = xqm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* legacy quota stats interface no 2 */ static int xqmstat_proc_show(struct seq_file *m, void *v) { @@ -147,22 +124,8 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); return 0; } - -static int xqmstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqmstat_proc_show, NULL); -} - -static const struct file_operations xqmstat_proc_fops = { - .owner = THIS_MODULE, - .open = xqmstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_XFS_QUOTA */ -#ifdef CONFIG_PROC_FS int xfs_init_procfs(void) { @@ -174,11 +137,9 @@ xfs_init_procfs(void) goto out; #ifdef CONFIG_XFS_QUOTA - if (!proc_create("fs/xfs/xqmstat", 0, NULL, - &xqmstat_proc_fops)) + if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show)) goto out; - if (!proc_create("fs/xfs/xqm", 0, NULL, - &xqm_proc_fops)) + if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show)) goto out; #endif return 0; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index f64d0ae345c4..130db070e4d8 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -1,19 +1,7 @@ +// 
SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_STATS_H__ #define __XFS_STATS_H__ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d71424052917..9d791f158dfe 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" @@ -63,7 +51,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -struct bio_set *xfs_ioend_bioset; +struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1148,7 +1136,7 @@ xfs_fs_statfs( statp->f_bavail = statp->f_bfree; fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = MIN(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); + statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1372,7 +1360,6 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_queue_eofblocks(mp); /* Recover any CoW blocks that never got remapped. */ error = xfs_reflink_recover_cow(mp); @@ -1382,7 +1369,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_queue_cowblocks(mp); + xfs_icache_enable_reclaim(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1392,8 +1379,13 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + xfs_icache_disable_reclaim(mp); + /* Get rid of any leftover CoW reservations... */ - cancel_delayed_work_sync(&mp->m_cowblocks_work); error = xfs_icache_free_cowblocks(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -1416,12 +1408,6 @@ xfs_fs_remount( */ xfs_save_resvblks(mp); - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. 
- */ - cancel_delayed_work_sync(&mp->m_eofblocks_work); - xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1441,6 +1427,7 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); + xfs_icache_disable_reclaim(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return xfs_sync_sb(mp, true); @@ -1454,6 +1441,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_icache_enable_reclaim(mp); return 0; } @@ -1635,6 +1623,17 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; + /* + * Delay mount work if the debug hook is set. This is debug + * instrumention to coordinate simulation of xfs mount failures with + * VFS superblock operations + */ + if (xfs_globals.mount_delay) { + xfs_notice(mp, "Delaying mount for %d seconds.", + xfs_globals.mount_delay); + msleep(xfs_globals.mount_delay * 1000); + } + if (silent) flags |= XFS_MFSI_QUIET; @@ -1690,11 +1689,17 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (mp->m_flags & XFS_MOUNT_DAX) { + bool rtdev_is_dax = false, datadev_is_dax; + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - error = bdev_dax_supported(sb, sb->s_blocksize); - if (error) { + datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, + sb->s_blocksize); + if (mp->m_rtdev_targp) + rtdev_is_dax = bdev_dax_supported( + mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); + if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. 
Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; @@ -1761,6 +1766,7 @@ xfs_fs_fill_super( out_close_devices: xfs_close_devices(mp); out_free_fsname: + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); out: @@ -1778,6 +1784,10 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); + /* if ->fill_super failed, we have no mount to tear down */ + if (!sb->s_fs_info) + return; + xfs_notice(mp, "Unmounting Filesystem"); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1787,6 +1797,8 @@ xfs_fs_put_super( xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); + + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); } @@ -1806,6 +1818,9 @@ xfs_fs_nr_cached_objects( struct super_block *sb, struct shrink_control *sc) { + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1845,10 +1860,9 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE, offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS); - if (!xfs_ioend_bioset) + BIOSET_NEED_BVECS)) goto out; xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), @@ -1880,11 +1894,6 @@ xfs_init_zones(void) if (!xfs_trans_zone) goto out_destroy_ifork_zone; - xfs_log_item_desc_zone = - kmem_zone_init(sizeof(struct xfs_log_item_desc), - "xfs_log_item_desc"); - if (!xfs_log_item_desc_zone) - goto out_destroy_trans_zone; /* * The size of the zone allocated buf log item is the maximum @@ -1894,7 +1903,7 @@ xfs_init_zones(void) xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), "xfs_buf_item"); if (!xfs_buf_item_zone) - goto out_destroy_log_item_desc_zone; + goto out_destroy_trans_zone; xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * @@ -1982,8 +1991,6 @@ xfs_init_zones(void) 
kmem_zone_destroy(xfs_efd_zone); out_destroy_buf_item_zone: kmem_zone_destroy(xfs_buf_item_zone); - out_destroy_log_item_desc_zone: - kmem_zone_destroy(xfs_log_item_desc_zone); out_destroy_trans_zone: kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: @@ -1997,7 +2004,7 @@ xfs_init_zones(void) out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); out_free_ioend_bioset: - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2022,14 +2029,13 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efd_zone); kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_da_state_zone); kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 8cee8e8050e3..21cb49a43d7c 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SUPER_H__ #define __XFS_SUPER_H__ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5b66ac12913c..3783afcb68d2 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012-2013 Red Hat, Inc. * All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_shared.h" @@ -259,6 +247,7 @@ xfs_symlink( * bmapi or the directory create code. */ xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Allocate an inode for the symlink. @@ -488,16 +477,11 @@ xfs_inactive_symlink_rmt( error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* * Commit the transaction containing extent freeing and EFDs. 
*/ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index aeaee8923617..9743d8c9394b 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SYMLINK_H #define __XFS_SYMLINK_H 1 diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index afe1f66aaa69..0cc034dfb786 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2001-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include <linux/sysctl.h> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 82afee005140..168488130a19 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2001-2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SYSCTL_H__ #define __XFS_SYSCTL_H__ @@ -95,6 +83,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ + int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 8b2ccc234f36..cd6a994a7250 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" @@ -165,9 +153,40 @@ log_recovery_delay_show( } XFS_SYSFS_ATTR_RW(log_recovery_delay); +STATIC ssize_t +mount_delay_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.mount_delay = val; + + return count; +} + +STATIC ssize_t +mount_delay_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay); +} +XFS_SYSFS_ATTR_RW(mount_delay); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), + ATTR_LIST(mount_delay), NULL, }; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index d04637181ef2..e9f810fc6731 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_SYSFS_H__ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 35f3546b6af5..cb6489c22cad 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2009, Christoph Hellwig * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8955254b900e..972d45d28097 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2009, Christoph Hellwig * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs @@ -441,8 +429,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __field(unsigned, bli_recur) __field(int, bli_refcount) __field(unsigned, bli_flags) - __field(void *, li_desc) - __field(unsigned, li_flags) + __field(unsigned long, li_flags) ), TP_fast_assign( __entry->dev = bip->bli_buf->b_target->bt_dev; @@ -455,12 +442,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; - __entry->li_desc = bip->bli_item.li_desc; __entry->li_flags = bip->bli_item.li_flags; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " - "lidesc %p liflags %s", + "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, @@ -471,7 +457,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur, __entry->bli_refcount, __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), - __entry->li_desc, __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) @@ -1018,7 +1003,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, lsn) ), TP_fast_assign( @@ -1070,7 +1055,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, old_lsn) __field(xfs_lsn_t, new_lsn) ), @@ -1750,6 +1735,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) + __field(int, flags) __field(int, 
op_flags) ), TP_fast_assign( @@ -1760,10 +1746,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; + __entry->flags = args->flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x op_flags %s", + "hashval 0x%x flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1771,6 +1758,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -2243,30 +2231,35 @@ struct xfs_defer_pending; struct xfs_defer_ops; DECLARE_EVENT_CLASS(xfs_defer_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), - TP_ARGS(mp, dop), + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, + unsigned long caller_ip), + TP_ARGS(mp, dop, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) __field(char, committed) __field(char, low) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; __entry->dop = dop; __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ops %p committed %d low %d", + TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, - __entry->low) + __entry->low, + (char *)__entry->caller_ip) ) #define DEFINE_DEFER_EVENT(name) \ DEFINE_EVENT(xfs_defer_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \ - TP_ARGS(mp, dop)) + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \ + unsigned long caller_ip), \ + TP_ARGS(mp, dop, caller_ip)) DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), @@ -2433,6 +2426,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, @@ -3346,6 +3341,43 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logflags) ); +DECLARE_EVENT_CLASS(xfs_trans_class, + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), + TP_ARGS(tp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, tid) + __field(uint32_t, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->tid = 0; + if (tp->t_ticket) + __entry->tid = tp->t_ticket->t_tid; + __entry->flags = tp->t_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d trans %x flags 0x%x caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_TRANS_EVENT(name) \ 
+DEFINE_EVENT(xfs_trans_class, name, \ + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \ + TP_ARGS(tp, caller_ip)) +DEFINE_TRANS_EVENT(xfs_trans_alloc); +DEFINE_TRANS_EVENT(xfs_trans_cancel); +DEFINE_TRANS_EVENT(xfs_trans_commit); +DEFINE_TRANS_EVENT(xfs_trans_dup); +DEFINE_TRANS_EVENT(xfs_trans_free); +DEFINE_TRANS_EVENT(xfs_trans_roll); +DEFINE_TRANS_EVENT(xfs_trans_add_item); +DEFINE_TRANS_EVENT(xfs_trans_free_items); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d6d8f9d129a7..524f543c5b82 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -31,9 +19,9 @@ #include "xfs_log.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_defer.h" kmem_zone_t *xfs_trans_zone; -kmem_zone_t *xfs_log_item_desc_zone; #if defined(CONFIG_TRACEPOINTS) static void @@ -79,6 +67,7 @@ xfs_trans_free( xfs_extent_busy_sort(&tp->t_busy); xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + trace_xfs_trans_free(tp, _RET_IP_); atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); @@ -94,11 +83,13 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -STATIC xfs_trans_t * +STATIC struct xfs_trans * xfs_trans_dup( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_trans_t *ntp; + struct xfs_trans *ntp; + + trace_xfs_trans_dup(tp, _RET_IP_); ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); @@ -127,6 +118,7 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; + ntp->t_agfl_dfops = tp->t_agfl_dfops; xfs_trans_dup_dqinfo(tp, ntp); @@ -266,7 +258,12 @@ xfs_trans_alloc( if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + /* + * Zero-reservation ("empty") transactions can't modify anything, so + * they're allowed to run while we're frozen. 
+ */ + WARN_ON(resp->tr_logres > 0 && + mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, @@ -283,6 +280,8 @@ xfs_trans_alloc( return error; } + trace_xfs_trans_alloc(tp, _RET_IP_); + *tpp = tp; return 0; } @@ -727,73 +726,52 @@ out: return; } -/* - * Add the given log item to the transaction's list of log items. - * - * The log item will now point to its new descriptor with its li_desc field. - */ +/* Add the given log item to the transaction's list of log items. */ void xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp == tp->t_mountp); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); - lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); - - lidp->lid_item = lip; - lidp->lid_flags = 0; - list_add_tail(&lidp->lid_trans, &tp->t_items); - - lip->li_desc = lidp; -} - -STATIC void -xfs_trans_free_item_desc( - struct xfs_log_item_desc *lidp) -{ - list_del_init(&lidp->lid_trans); - kmem_zone_free(xfs_log_item_desc_zone, lidp); + list_add_tail(&lip->li_trans, &tp->t_items); + trace_xfs_trans_add_item(tp, _RET_IP_); } /* - * Unlink and free the given descriptor. + * Unlink the log item from the transaction. the log item is no longer + * considered dirty in this transaction, as the linked transaction has + * finished, either by abort or commit completion. */ void xfs_trans_del_item( struct xfs_log_item *lip) { - xfs_trans_free_item_desc(lip->li_desc); - lip->li_desc = NULL; + clear_bit(XFS_LI_DIRTY, &lip->li_flags); + list_del_init(&lip->li_trans); } -/* - * Unlock all of the items of a transaction and free all the descriptors - * of that transaction. 
- */ +/* Detach and unlock all of the items in a transaction */ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, bool abort) { - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_item *lip, *next; - lip->li_desc = NULL; + trace_xfs_trans_free_items(tp, _RET_IP_); + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + xfs_trans_del_item(lip); if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); if (abort) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); lip->li_ops->iop_unlock(lip); - - xfs_trans_free_item_desc(lidp); } } @@ -861,7 +839,7 @@ xfs_trans_committed_bulk( xfs_lsn_t item_lsn; if (aborted) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); /* item_lsn of -1 means the item needs no further processing */ @@ -936,6 +914,11 @@ __xfs_trans_commit( int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; + ASSERT(!tp->t_agfl_dfops || + !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant); + + trace_xfs_trans_commit(tp, _RET_IP_); + /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the @@ -991,6 +974,7 @@ out_unreserve: commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; + tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); @@ -1022,6 +1006,8 @@ xfs_trans_cancel( struct xfs_mount *mp = tp->t_mountp; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); + trace_xfs_trans_cancel(tp, _RET_IP_); + /* * See if the caller is relying on us to shut down the * filesystem. 
This happens in paths where we detect @@ -1033,17 +1019,19 @@ xfs_trans_cancel( } #ifdef DEBUG if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) - ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + list_for_each_entry(lip, &tp->t_items, li_trans) + ASSERT(!(lip->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) + if (tp->t_ticket) { xfs_log_done(mp, tp->t_ticket, NULL, false); + tp->t_ticket = NULL; + } /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1067,6 +1055,8 @@ xfs_trans_roll( struct xfs_trans_res tres; int error; + trace_xfs_trans_roll(trans, _RET_IP_); + /* * Copy the critical parameters from one trans to the next. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9d542dfe0052..6526314f0b8f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_TRANS_H__ #define __XFS_TRANS_H__ @@ -27,7 +15,6 @@ struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; struct xfs_log_iovec; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_trans_res; @@ -43,12 +30,12 @@ struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ + struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ - uint li_flags; /* misc flags */ + unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, @@ -64,14 +51,21 @@ typedef struct xfs_log_item { xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; -#define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 -#define XFS_LI_FAILED 0x4 +/* + * li_flags use the (set/test/clear)_bit atomic interfaces because updates can + * race with each other and we don't want to have to use the AIL lock to + * serialise all updates. 
+ */ +#define XFS_LI_IN_AIL 0 +#define XFS_LI_ABORTED 1 +#define XFS_LI_FAILED 2 +#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ #define XFS_LI_FLAGS \ - { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" }, \ - { XFS_LI_FAILED, "FAILED" } + { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1 << XFS_LI_ABORTED), "ABORTED" }, \ + { (1 << XFS_LI_FAILED), "FAILED" }, \ + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -111,6 +105,7 @@ typedef struct xfs_trans { struct xlog_ticket *t_ticket; /* log mgr ticket */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + struct xfs_defer_ops *t_agfl_dfops; /* optional agfl fixup dfops */ unsigned int t_flags; /* misc flags */ int64_t t_icount_delta; /* superblock icount change */ int64_t t_ifree_delta; /* superblock ifree change */ @@ -228,7 +223,8 @@ struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, struct xfs_owner_info *); + xfs_extlen_t, struct xfs_owner_info *, + bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); @@ -242,7 +238,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); extern kmem_zone_t *xfs_trans_zone; -extern kmem_zone_t *xfs_log_item_desc_zone; /* rmap updates */ enum xfs_rmap_intent_type; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d4a2445215e6..55326f971cb3 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2008 Dave Chinner * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -32,30 +20,51 @@ #ifdef DEBUG /* * Check that the list is sorted as it should be. + * + * Called with the ail lock held, but we don't want to assert fail with it + * held otherwise we'll lock everything up and won't be able to debug the + * cause. Hence we sample and check the state under the AIL lock and return if + * everything is fine, otherwise we drop the lock and run the ASSERT checks. + * Asserts may not be fatal, so pick the lock back up and continue onwards. */ STATIC void xfs_ail_check( - struct xfs_ail *ailp, - xfs_log_item_t *lip) + struct xfs_ail *ailp, + struct xfs_log_item *lip) { - xfs_log_item_t *prev_lip; + struct xfs_log_item *prev_lip; + struct xfs_log_item *next_lip; + xfs_lsn_t prev_lsn = NULLCOMMITLSN; + xfs_lsn_t next_lsn = NULLCOMMITLSN; + xfs_lsn_t lsn; + bool in_ail; + if (list_empty(&ailp->ail_head)) return; /* - * Check the next and previous entries are valid. + * Sample then check the next and previous entries are valid. 
*/ - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - - prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags); + prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail); if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + prev_lsn = prev_lip->li_lsn; + next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail); + if (&next_lip->li_ail != &ailp->ail_head) + next_lsn = next_lip->li_lsn; + lsn = lip->li_lsn; + if (in_ail && + (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) && + (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0)) + return; + spin_unlock(&ailp->ail_lock); + ASSERT(in_ail); + ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0); + ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0); + spin_lock(&ailp->ail_lock); } #else /* !DEBUG */ #define xfs_ail_check(a,l) @@ -684,7 +693,7 @@ xfs_trans_ail_update_bulk( for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; - if (lip->li_flags & XFS_LI_IN_AIL) { + if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) { /* check if we really need to move the item */ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; @@ -694,7 +703,6 @@ xfs_trans_ail_update_bulk( if (mlip == lip) mlip_changed = 1; } else { - lip->li_flags |= XFS_LI_IN_AIL; trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; @@ -725,7 +733,7 @@ xfs_ail_delete_one( trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); xfs_clear_li_failed(lip); - lip->li_flags &= ~XFS_LI_IN_AIL; + clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; return mlip == lip; @@ -761,7 +769,7 @@ xfs_trans_ail_delete( struct xfs_mount *mp = ailp->ail_mount; bool 
mlip_changed; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 14543d93cd4b..a15a5cd867f9 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -79,7 +65,7 @@ xfs_trans_log_finish_bmap_update( * 2.) 
shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; } @@ -158,7 +144,7 @@ xfs_bmap_update_log_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); tp->t_flags |= XFS_TRANS_DIRTY; - buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index a5d9dfc45d98..15919f67a88f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -40,7 +28,7 @@ xfs_trans_buf_item_match( struct xfs_buf_map *map, int nmaps) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; @@ -48,8 +36,8 @@ xfs_trans_buf_item_match( for (i = 0; i < nmaps; i++) len += map[i].bm_len; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - blip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { + blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && @@ -100,14 +88,10 @@ _xfs_trans_bjoin( atomic_inc(&bip->bli_refcount); /* - * Get a log_item_desc to point at the new item. + * Attach the item to the transaction so we can find it in + * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ bp->b_transp = tp; } @@ -391,7 +375,7 @@ xfs_trans_brelse( * If the buffer is dirty within this transaction, we can't * release it until we commit. 
*/ - if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; /* @@ -442,7 +426,7 @@ xfs_trans_brelse( ASSERT(bp->b_pincount == 0); ***/ ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); xfs_buf_item_relse(bp); } @@ -542,7 +526,7 @@ xfs_trans_dirty_buf( bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* @@ -626,7 +610,7 @@ xfs_trans_binval( ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } @@ -642,7 +626,7 @@ xfs_trans_binval( memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c3d547211d16..c23257a26c2b 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -77,7 +65,7 @@ xfs_trans_log_dquot( ASSERT(XFS_DQ_IS_LOCKED(dqp)); tp->t_flags |= XFS_TRANS_DIRTY; - dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } /* @@ -879,7 +867,7 @@ xfs_trans_log_quotaoff_item( xfs_qoff_logitem_t *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; - qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); } STATIC void diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index ab438647592a..bd66c76f55e6 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -68,7 +56,8 @@ xfs_trans_free_extent( struct xfs_efd_log_item *efdp, xfs_fsblock_t start_block, xfs_extlen_t ext_len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; uint next_extent; @@ -79,9 +68,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo, - XFS_AG_RESV_NONE); - + error = __xfs_free_extent(tp, start_block, ext_len, + oinfo, XFS_AG_RESV_NONE, skip_discard); /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -90,7 +78,7 @@ xfs_trans_free_extent( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); @@ -155,7 +143,7 @@ xfs_extent_free_log_item( free = container_of(item, struct xfs_extent_free_item, xefi_list); tp->t_flags |= XFS_TRANS_DIRTY; - efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); /* * atomic_inc_return gives us the value after the increment; @@ -195,7 +183,7 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, done_item, free->xefi_startblock, free->xefi_blockcount, - &free->xefi_oinfo); + &free->xefi_oinfo, free->xefi_skip_discard); kmem_free(free); return error; } @@ -231,9 +219,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = { .cancel_item = xfs_extent_free_cancel_item, }; +/* + * AGFL blocks are accounted differently in the reserve pools and are not + * inserted into the 
busy extent list. + */ +STATIC int +xfs_agfl_free_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = done_item; + struct xfs_extent_free_item *free; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + uint next_extent; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(free->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (!error) + error = xfs_free_agfl_block(tp, agno, agbno, agbp, + &free->xefi_oinfo); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) 
shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; + efdp->efd_next_extent++; + + kmem_free(free); + return error; +} + + +/* sub-type with special handling for AGFL deferred frees */ +static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .type = XFS_DEFER_OPS_TYPE_AGFL_FREE, + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + /* Register the deferred op type. */ void xfs_extent_free_init_defer_op(void) { xfs_defer_init_op_type(&xfs_extent_free_defer_type); + xfs_defer_init_op_type(&xfs_agfl_free_defer_type); } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 07cea592dc01..542927321a61 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" @@ -70,7 +58,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - struct timespec tv; + struct timespec64 tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -133,14 +121,13 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && IS_I_VERSION(VFS_I(ip))) { if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; - ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index be24b0c8a332..091eae9f4e74 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -1,25 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_TRANS_PRIV_H__ #define __XFS_TRANS_PRIV_H__ struct xfs_log_item; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_ail; @@ -119,7 +106,7 @@ xfs_trans_ail_remove( spin_lock(&ailp->ail_lock); /* xfs_trans_ail_delete() drops the AIL lock */ - if (lip->li_flags & XFS_LI_IN_AIL) + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) xfs_trans_ail_delete(ailp, lip, shutdown_type); else spin_unlock(&ailp->ail_lock); @@ -171,11 +158,10 @@ xfs_clear_li_failed( { struct xfs_buf *bp = lip->li_buf; - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); lockdep_assert_held(&lip->li_ailp->ail_lock); - if (lip->li_flags & XFS_LI_FAILED) { - lip->li_flags &= ~XFS_LI_FAILED; + if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { lip->li_buf = NULL; xfs_buf_rele(bp); } @@ -188,9 +174,8 @@ xfs_set_li_failed( { lockdep_assert_held(&lip->li_ailp->ail_lock); - if (!(lip->li_flags & XFS_LI_FAILED)) { + if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { xfs_buf_hold(bp); - lip->li_flags |= XFS_LI_FAILED; lip->li_buf = bp; } } diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 94c1877af834..46dd4fca8aa7 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -77,7 +63,7 @@ xfs_trans_log_finish_refcount_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; } @@ -154,7 +140,7 @@ xfs_refcount_update_log_item( refc = container_of(item, struct xfs_refcount_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 9b577beb43d7..726d8e2c0558 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2016 Oracle. All Rights Reserved. - * * Author: Darrick J. Wong <darrick.wong@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "xfs.h" #include "xfs_fs.h" @@ -117,7 +103,7 @@ xfs_trans_log_finish_rmap_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; } @@ -175,7 +161,7 @@ xfs_rmap_update_log_item( rmap = container_of(item, struct xfs_rmap_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0594db435972..63ee1d5bf1d7 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Christoph Hellwig. * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" |