From 53ce9a3364de0723b27d861de93bfc882f7db050 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 17 Jul 2013 14:53:53 +0200 Subject: fuse: readdirplus: fix dentry leak In case d_lookup() returns a dentry with d_inode == NULL, the dentry is not returned with dput(). This results in triggering a BUG() in shrink_dcache_for_umount_subtree(): BUG: Dentry ...{i=0,n=...} still in use (1) [unmount of fuse fuse] [SzM: need to d_drop() as well] Reported-by: Justin Clift Signed-off-by: Niels de Vos Signed-off-by: Miklos Szeredi Tested-by: Brian Foster Tested-by: Niels de Vos CC: stable@vger.kernel.org --- fs/fuse/dir.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0eda52738ec4..2ae5308174e2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1227,9 +1227,15 @@ static int fuse_direntplus_link(struct file *file, name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); - if (dentry && dentry->d_inode) { + if (dentry) { inode = dentry->d_inode; - if (get_node_id(inode) == o->nodeid) { + if (!inode) { + d_drop(dentry); + } else if (get_node_id(inode) != o->nodeid) { + err = d_invalidate(dentry); + if (err) + goto out; + } else { struct fuse_inode *fi; fi = get_fuse_inode(inode); spin_lock(&fc->lock); @@ -1242,9 +1248,6 @@ static int fuse_direntplus_link(struct file *file, */ goto found; } - err = d_invalidate(dentry); - if (err) - goto out; dput(dentry); dentry = NULL; } -- cgit v1.2.3 From a28ef45cbb1e7fadd5159deb17b02de15c6e4aaf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 17 Jul 2013 14:53:53 +0200 Subject: fuse: readdirplus: sanity checks Add sanity checks before adding or updating an entry with data received from readdirplus. Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org --- fs/fuse/dir.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2ae5308174e2..bb6829720dd6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1223,6 +1223,12 @@ static int fuse_direntplus_link(struct file *file, if (name.name[1] == '.' && name.len == 2) return 0; } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + fc = get_fuse_conn(dir); name.hash = full_name_hash(name.name, name.len); @@ -1231,10 +1237,14 @@ static int fuse_direntplus_link(struct file *file, inode = dentry->d_inode; if (!inode) { d_drop(dentry); - } else if (get_node_id(inode) != o->nodeid) { + } else if (get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { err = d_invalidate(dentry); if (err) goto out; + } else if (is_bad_inode(inode)) { + err = -EIO; + goto out; } else { struct fuse_inode *fi; fi = get_fuse_inode(inode); -- cgit v1.2.3 From 2914941e3178d84a216fc4eb85292dfef3b6d628 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 17 Jul 2013 14:53:53 +0200 Subject: fuse: readdirplus: fix instantiate Fuse does instantiation slightly differently from NFS/CIFS which use d_materialise_unique(). Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org --- fs/fuse/dir.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb6829720dd6..5dfbb5439e4e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1272,10 +1272,19 @@ static int fuse_direntplus_link(struct file *file, if (!inode) goto out; - alias = d_materialise_unique(dentry, inode); - err = PTR_ERR(alias); - if (IS_ERR(alias)) - goto out; + if (S_ISDIR(inode->i_mode)) { + mutex_lock(&fc->inst_mutex); + alias = fuse_d_add_directory(dentry, inode); + mutex_unlock(&fc->inst_mutex); + err = PTR_ERR(alias); + if (IS_ERR(alias)) { + iput(inode); + goto out; + } + } else { + alias = d_splice_alias(inode, dentry); + } + if (alias) { dput(dentry); dentry = alias; -- cgit v1.2.3 From fa2b7213600f8110ebac64acebc78a885b0594a0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 17 Jul 2013 14:53:53 +0200 Subject: fuse: readdirplus: change attributes once If we got the inode through fuse_iget() then the attributes are already up-to-date. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5dfbb5439e4e..37d85e05b1f5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1252,6 +1252,10 @@ static int fuse_direntplus_link(struct file *file, fi->nlookup++; spin_unlock(&fc->lock); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* * The other branch to 'found' comes via fuse_iget() * which bumps nlookup inside @@ -1291,9 +1295,6 @@ static int fuse_direntplus_link(struct file *file, } found: - fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), - attr_version); - fuse_change_entry_timeout(dentry, o); err = 0; -- cgit v1.2.3 From c7263bcdc4d150e3c718b711a5f6fad496d9f662 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 17 Jul 2013 14:53:54 +0200 Subject: fuse: readdirplus: cleanup Niels noted that we don't need the 'dentry = NULL' line. Signed-off-by: Miklos Szeredi CC: Niels de Vos --- fs/fuse/dir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 37d85e05b1f5..72a5d5b04494 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1263,7 +1263,6 @@ static int fuse_direntplus_link(struct file *file, goto found; } dput(dentry); - dentry = NULL; } dentry = d_alloc(parent, &name); @@ -1299,8 +1298,7 @@ found: err = 0; out: - if (dentry) - dput(dentry); + dput(dentry); return err; } -- cgit v1.2.3 From e4daf1ffbe6cc3b12aab4d604e627829e93e9914 Mon Sep 17 00:00:00 2001 From: Harshula Jayasuriya Date: Tue, 23 Jul 2013 14:21:35 +1000 Subject: nfsd: nfsd_open: when dentry_open returns an error do not propagate as struct file The following call chain: ------------------------------------------------------------ nfs4_get_vfs_file - nfsd_open - dentry_open - do_dentry_open - __get_file_write_access - get_write_access - return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; ------------------------------------------------------------ can result in the following state: ------------------------------------------------------------ struct nfs4_file { ... fi_fds = {0xffff880c1fa65c80, 0xffffffffffffffe6, 0x0}, fi_access = {{ counter = 0x1 }, { counter = 0x0 }}, ... ------------------------------------------------------------ 1) First time around, in nfs4_get_vfs_file() fp->fi_fds[O_WRONLY] is NULL, hence nfsd_open() is called where we get status set to an error and fp->fi_fds[O_WRONLY] to -ETXTBSY. Thus we do not reach nfs4_file_get_access() and fi_access[O_WRONLY] is not incremented. 2) Second time around, in nfs4_get_vfs_file() fp->fi_fds[O_WRONLY] is NOT NULL (-ETXTBSY), so nfsd_open() is NOT called, but nfs4_file_get_access() IS called and fi_access[O_WRONLY] is incremented. Thus we leave a landmine in the form of the nfs4_file data structure in an incorrect state. 3) Eventually, when __nfs4_file_put_access() is called it finds fi_access[O_WRONLY] being non-zero, it decrements it and calls nfs4_file_put_fd() which tries to fput -ETXTBSY. ------------------------------------------------------------ ... [exception RIP: fput+0x9] RIP: ffffffff81177fa9 RSP: ffff88062e365c90 RFLAGS: 00010282 RAX: ffff880c2b3d99cc RBX: ffff880c2b3d9978 RCX: 0000000000000002 RDX: dead000000100101 RSI: 0000000000000001 RDI: ffffffffffffffe6 RBP: ffff88062e365c90 R8: ffff88041fe797d8 R9: ffff88062e365d58 R10: 0000000000000008 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000007 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #9 [ffff88062e365c98] __nfs4_file_put_access at ffffffffa0562334 [nfsd] #10 [ffff88062e365cc8] nfs4_file_put_access at ffffffffa05623ab [nfsd] #11 [ffff88062e365ce8] free_generic_stateid at ffffffffa056634d [nfsd] #12 [ffff88062e365d18] release_open_stateid at ffffffffa0566e4b [nfsd] #13 [ffff88062e365d38] nfsd4_close at ffffffffa0567401 [nfsd] #14 [ffff88062e365d88] nfsd4_proc_compound at ffffffffa0557f28 [nfsd] #15 [ffff88062e365dd8] nfsd_dispatch at ffffffffa054543e [nfsd] #16 [ffff88062e365e18] svc_process_common at ffffffffa04ba5a4 [sunrpc] #17 [ffff88062e365e98] svc_process at ffffffffa04babe0 [sunrpc] #18 [ffff88062e365eb8] nfsd at ffffffffa0545b62 [nfsd] #19 [ffff88062e365ee8] kthread at ffffffff81090886 #20 [ffff88062e365f48] kernel_thread at ffffffff8100c14a ------------------------------------------------------------ Cc: stable@vger.kernel.org Signed-off-by: Harshula Jayasuriya Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ff6a0019b0b..c827acb0e943 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -830,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, flags = O_WRONLY|O_LARGEFILE; } *filp = dentry_open(&path, flags, current_cred()); - if (IS_ERR(*filp)) + if (IS_ERR(*filp)) { host_err = PTR_ERR(*filp); - else { + *filp = NULL; + } else { host_err = ima_file_check(*filp, may_flags); if (may_flags & NFSD_MAY_64BIT_COOKIE) -- cgit v1.2.3 From 4f3cc4809a98a165a9708b72b47de71643797bbd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Jul 2013 12:53:39 -0400 Subject: NFSv4: Fix brainfart in attribute length calculation The calculation of the attribute length was 4 bytes off. Signed-off-by: Trond Myklebust Tested-by: Andre Heider Reported-and-tested-by: Henrik Rydberg Signed-off-by: Linus Torvalds --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c74d6168db99..3850b018815f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1118,11 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, len, ((char *)p - (char *)q) + 4); BUG(); } - len = (char *)p - (char *)q - (bmval_len << 2); *q++ = htonl(bmval0); *q++ = htonl(bmval1); if (bmval_len == 3) *q++ = htonl(bmval2); + len = (char *)p - (char *)(q + 1); *q = htonl(len); /* out: */ -- cgit v1.2.3 From e1b4271ac261b290fdab51446996fb13e68a57be Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 24 Jul 2013 15:47:30 +1000 Subject: xfs: di_flushiter considered harmful When we made all inode updates transactional, we no longer needed the log recovery detection for inodes being newer on disk than the transaction being replayed - it was redundant as replay of the log would always result in the latest version of the inode would be on disk. It was redundant, but left in place because it wasn't considered to be a problem. However, with the new "don't read inodes on create" optimisation, flushiter has come back to bite us. Essentially, the optimisation made always initialises flushiter to zero in the create transaction, and so if we then crash and run recovery and the inode already on disk has a non-zero flushiter it will skip recovery of that inode. As a result, log recovery does the wrong thing and we end up with a corrupt filesystem. Because we have to support old kernel to new kernel upgrades, we can't just get rid of the flushiter support in log recovery as we might be upgrading from a kernel that doesn't have fully transactional inode updates. Unfortunately, for v4 superblocks there is no way to guarantee that log recovery knows about this fact. We cannot add a new inode format flag to say it's a "special inode create" because it won't be understood by older kernels and so recovery could do the wrong thing on downgrade. We cannot specially detect the combination of zero mode/non-zero flushiter on disk to non-zero mode, zero flushiter in the log item during recovery because wrapping of the flushiter can result in false detection. Hence that makes this "don't use flushiter" optimisation limited to a disk format that guarantees that we don't need it. And that means the only fix here is to limit the "no read IO on create" optimisation to version 5 superblocks.... Reported-by: Markus Trippelsdorf Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit e60896d8f2b81412421953e14d3feb14177edb56) --- fs/xfs/xfs_dinode.h | 3 +++ fs/xfs/xfs_inode.c | 31 ++++++++++++++++++++++--------- fs/xfs/xfs_log_recover.c | 13 +++++++++++-- 3 files changed, 36 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 07d735a80a0f..e5869b50dc41 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -39,6 +39,9 @@ typedef struct xfs_timestamp { * There is a very similar struct icdinode in xfs_inode which matches the * layout of the first 96 bytes of this structure, but is kept in native * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. */ typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b78481f99d9d..bb262c25c8de 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -896,7 +896,6 @@ xfs_dinode_to_disk( to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = cpu_to_be16(from->di_flushiter); to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); @@ -924,6 +923,9 @@ xfs_dinode_to_disk( to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -1029,10 +1031,14 @@ xfs_dinode_calc_crc( /* * Read the disk inode attributes into the in-core inode structure. * - * If we are initialising a new inode and we are not utilising the - * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core - * with a random generation number. If we are keeping inodes around, we need to - * read the inode cluster to get the existing generation number off disk. + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. */ int xfs_iread( @@ -1054,6 +1060,7 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); @@ -2882,12 +2889,18 @@ xfs_iflush_int( __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. This is redundant as we now - * log every change and hence this can't happen. Still, it doesn't hurt. + * Inode item log recovery for v1/v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6fcc910a50b9..7681b19aa5dc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2592,8 +2592,16 @@ xlog_recover_inode_pass2( goto error; } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers @@ -2608,6 +2616,7 @@ xlog_recover_inode_pass2( goto error; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; -- cgit v1.2.3