From 72d42504bd7faa6de1c1644b5e46652933e040a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2017 13:34:29 +0200 Subject: ovl: select EXPORTFS We get a link error when EXPORTFS is not enabled: ERROR: "exportfs_encode_fh" [fs/overlayfs/overlay.ko] undefined! ERROR: "exportfs_decode_fh" [fs/overlayfs/overlay.ko] undefined! This adds a Kconfig 'select' statement for overlayfs, the same way that it is done for the other users of exportfs. Fixes: 3a1e819b4e80 ("ovl: store file handle of lower inode on copy up") Signed-off-by: Arnd Bergmann Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 0daac5112f7a..c0c9683934b7 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,5 +1,6 @@ config OVERLAY_FS tristate "Overlay filesystem support" + select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem and a 'lower' filesystem. When a name exists in both filesystems, the -- cgit v1.2.3 From 4751832da990a927c37526ae67b9226ea01eb99e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 7 Apr 2017 10:43:15 +0800 Subject: btrfs: fiemap: Cache and merge fiemap extent before submit it to user [BUG] Cycle mount btrfs can cause fiemap to return different result. Like: # mount /dev/vdb5 /mnt/btrfs # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file # xfs_io -c "fiemap -v" /mnt/btrfs/file /mnt/test/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 25088..25215 128 0x1 # umount /mnt/btrfs # mount /dev/vdb5 /mnt/btrfs # xfs_io -c "fiemap -v" /mnt/btrfs/file /mnt/test/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..31]: 25088..25119 32 0x0 1: [32..63]: 25120..25151 32 0x0 2: [64..95]: 25152..25183 32 0x0 3: [96..127]: 25184..25215 32 0x1 But after above fiemap, we get correct merged result if we call fiemap again. # xfs_io -c "fiemap -v" /mnt/btrfs/file /mnt/test/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 25088..25215 128 0x1 [REASON] Btrfs will try to merge extent map when inserting new extent map. btrfs_fiemap(start=0 len=(u64)-1) |- extent_fiemap(start=0 len=(u64)-1) |- get_extent_skip_holes(start=0 len=64k) | |- btrfs_get_extent_fiemap(start=0 len=64k) | |- btrfs_get_extent(start=0 len=64k) | | Found on-disk (ino, EXTENT_DATA, 0) | |- add_extent_mapping() | |- Return (em->start=0, len=16k) | |- fiemap_fill_next_extent(logic=0 phys=X len=16k) | |- get_extent_skip_holes(start=0 len=64k) | |- btrfs_get_extent_fiemap(start=0 len=64k) | |- btrfs_get_extent(start=16k len=48k) | | Found on-disk (ino, EXTENT_DATA, 16k) | |- add_extent_mapping() | | |- try_merge_map() | | Merge with previous em start=0 len=16k | | resulting em start=0 len=32k | |- Return (em->start=0, len=32K) << Merged result |- Stripe off the unrelated range (0~16K) of return em |- fiemap_fill_next_extent(logic=16K phys=X+16K len=16K) ^^^ Causing split fiemap extent. And since in add_extent_mapping(), em is already merged, in next fiemap() call, we will get merged result. [FIX] Here we introduce a new structure, fiemap_cache, which records previous fiemap extent. And will always try to merge current fiemap_cache result before calling fiemap_fill_next_extent(). Only when we failed to merge current fiemap extent with cached one, we will call fiemap_fill_next_extent() to submit cached one. So by this method, we can merge all fiemap extents. 
It can also be done in fs/ioctl.c, however the problem is if fieinfo->fi_extents_max == 0, we have no space to cache previous fiemap extent. So I choose to merge it in btrfs. Signed-off-by: Qu Wenruo Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d8da3edf2ac3..4317ed3ad830 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4377,6 +4377,123 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } +/* + * To cache previous fiemap extent + * + * Will be used for merging fiemap extent + */ +struct fiemap_cache { + u64 offset; + u64 phys; + u64 len; + u32 flags; + bool cached; +}; + +/* + * Helper to submit fiemap extent. + * + * Will try to merge current fiemap extent specified by @offset, @phys, + * @len and @flags with cached one. + * And only when we fails to merge, cached one will be submitted as + * fiemap extent. + * + * Return value is the same as fiemap_fill_next_extent(). + */ +static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + u64 offset, u64 phys, u64 len, u32 flags) +{ + int ret = 0; + + if (!cache->cached) + goto assign; + + /* + * Sanity check, extent_fiemap() should have ensured that new + * fiemap extent won't overlap with cahced one. + * Not recoverable. + * + * NOTE: Physical address can overlap, due to compression + */ + if (cache->offset + cache->len > offset) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Only merges fiemap extents if + * 1) Their logical addresses are continuous + * + * 2) Their physical addresses are continuous + * So truly compressed (physical size smaller than logical size) + * extents won't get merged with each other + * + * 3) Share same flags except FIEMAP_EXTENT_LAST + * So regular extent won't get merged with prealloc extent + */ + if (cache->offset + cache->len == offset && + cache->phys + cache->len == phys && + (cache->flags & ~FIEMAP_EXTENT_LAST) == + (flags & ~FIEMAP_EXTENT_LAST)) { + cache->len += len; + cache->flags |= flags; + goto try_submit_last; + } + + /* Not mergeable, need to submit cached one */ + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret) + return ret; +assign: + cache->cached = true; + cache->offset = offset; + cache->phys = phys; + cache->len = len; + cache->flags = flags; +try_submit_last: + if (cache->flags & FIEMAP_EXTENT_LAST) { + ret = fiemap_fill_next_extent(fieinfo, cache->offset, + cache->phys, cache->len, cache->flags); + cache->cached = false; + } + return ret; +} + +/* + * Sanity check for fiemap cache + * + * All fiemap cache should be submitted by emit_fiemap_extent() + * Iteration should be terminated either by last fiemap extent or + * fieinfo->fi_extents_max. + * So no cached fiemap should exist. 
+ */ +static int check_fiemap_cache(struct btrfs_fs_info *fs_info, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + int ret; + + if (!cache->cached) + return 0; + + /* Small and recoverbale problem, only to info developer */ +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); +#endif + btrfs_warn(fs_info, + "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x", + cache->offset, cache->phys, cache->len, cache->flags); + ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, + cache->len, cache->flags); + cache->cached = false; + if (ret > 0) + ret = 0; + return ret; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4394,6 +4511,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; + struct fiemap_cache cache = { 0 }; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4573,8 +4691,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); + ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, + em_len, flags); if (ret) { if (ret == 1) ret = 0; @@ -4582,6 +4700,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } } out_free: + if (!ret) + ret = check_fiemap_cache(root->fs_info, fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); -- cgit v1.2.3 From 8d91012528b3c95c159e6b6779e367421df9cbb5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 2 May 2017 17:03:50 +0200 Subject: btrfs: Make flush bios explicitely sync Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as synchronous" removed REQ_SYNC flag from WRITE_{FUA|PREFLUSH|...} definitions. generic_make_request_checks() however strips REQ_FUA and REQ_PREFLUSH flags from a bio when the storage doesn't report volatile write cache and thus write effectively becomes asynchronous which can lead to performance regressions Fix the problem by making sure all bios which are synchronous are properly marked with REQ_SYNC. CC: David Sterba CC: linux-btrfs@vger.kernel.org Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3 Signed-off-by: Jan Kara Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 683194242deb..b3579d11fb26 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3488,10 +3488,12 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. 
*/ - if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); - else + if (i == 0) { + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_FUA, bh); + } else { ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + } if (ret) errors++; } @@ -3556,7 +3558,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; -- cgit v1.2.3 From bff5baf8aa37a97293725a16c03f49872249c07e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 9 May 2017 18:14:01 +0100 Subject: btrfs: fix incorrect error return ret being passed to mapping_set_error The setting of return code ret should be based on the error code passed into function end_extent_writepage and not on ret. Thanks to Liu Bo for spotting this mistake in the original fix I submitted. Detected by CoverityScan, CID#1414312 ("Logically dead code") Fixes: 5dca6eea91653e ("Btrfs: mark mapping with error flag to report errors to userspace") Signed-off-by: Colin Ian King Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4317ed3ad830..d3619e010005 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2458,7 +2458,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); - ret = ret < 0 ? ret : -EIO; + ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } } -- cgit v1.2.3 From 9512a16b0e1217bbef73d276a67c28b5fbb46512 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 16 May 2017 15:57:42 -0400 Subject: nfsd: Revert "nfsd: check for oversized NFSv2/v3 arguments" This reverts commit 51f567777799 "nfsd: check for oversized NFSv2/v3 arguments", which breaks support for NFSv3 ACLs. That patch was actually an earlier draft of a fix for the problem that was eventually fixed by e6838a29ecb "nfsd: check for oversized NFSv2/v3 arguments". But somehow I accidentally left this earlier draft in the branch that was part of my 2.12 pull request. Reported-by: Eryu Guan Cc: stable@vger.kernel.org Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs3xdr.c | 23 ++++++----------------- fs/nfsd/nfsxdr.c | 13 +++---------- include/linux/sunrpc/svc.h | 3 ++- 3 files changed, 11 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 12feac6ee2fd..452334694a5d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -334,11 +334,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - - if (!xdr_argsize_check(rqstp, p)) - return 0; + args->count = ntohl(*p++); len = min(args->count, max_blocksize); /* set up the kvec */ @@ -352,7 +349,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return 1; + return xdr_argsize_check(rqstp, p); } int @@ -544,11 +541,9 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, p = decode_fh(p, &args->fh); if (!p) return 0; - if (!xdr_argsize_check(rqstp, p)) - return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return 1; + return xdr_argsize_check(rqstp, p); } int @@ -574,14 +569,10 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - - if (!xdr_argsize_check(rqstp, p)) - return 0; - args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); - return 1; + return xdr_argsize_check(rqstp, p); } int @@ -599,9 +590,6 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); - if (!xdr_argsize_check(rqstp, p)) - return 0; - len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); @@ -609,7 +597,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->buffer = page_address(p); len -= PAGE_SIZE; } - return 1; + + return xdr_argsize_check(rqstp, p); } int diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 6a4947a3f4fa..de07ff625777 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,9 +257,6 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ - if (!xdr_argsize_check(rqstp, p)) - return 0; - len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response. 
@@ -275,7 +272,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return 1; + return xdr_argsize_check(rqstp, p); } int @@ -365,11 +362,9 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli p = decode_fh(p, &args->fh); if (!p) return 0; - if (!xdr_argsize_check(rqstp, p)) - return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return 1; + return xdr_argsize_check(rqstp, p); } int @@ -407,11 +402,9 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->cookie = ntohl(*p++); args->count = ntohl(*p++); args->count = min_t(u32, args->count, PAGE_SIZE); - if (!xdr_argsize_check(rqstp, p)) - return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return 1; + return xdr_argsize_check(rqstp, p); } /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 94631026f79c..11cef5a7bc87 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,7 +336,8 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) { char *cp = (char *)p; struct kvec *vec = &rqstp->rq_arg.head[0]; - return cp == (char *)vec->iov_base + vec->iov_len; + return cp >= (char*)vec->iov_base + && cp <= (char*)vec->iov_base + vec->iov_len; } static inline int -- cgit v1.2.3 From 8137ae26d25303e7b5cfb418fd28b976461e5b6e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 16 May 2017 08:45:46 +0300 Subject: ovl: fix creds leak in copy up error path Fixes: 42f269b92540 ("ovl: rearrange code in ovl_copy_up_locked()") Cc: # v4.11 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9008ab9fbd2e..061a8448e6c4 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -343,12 +343,13 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, temp = ovl_do_tmpfile(upperdir, stat->mode); else temp = ovl_lookup_temp(workdir, dentry); - err = PTR_ERR(temp); - if (IS_ERR(temp)) - goto out1; - err = 0; - if (!tmpfile) + if (IS_ERR(temp)) { + err = PTR_ERR(temp); + temp = NULL; + } + + if (!err && !tmpfile) err = ovl_create_real(wdir, temp, &cattr, NULL, true); if (new_creds) { -- cgit v1.2.3 From 82b749b2c65e9d108c1c5598dc0a5f436b525f42 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 17 May 2017 00:12:40 +0300 Subject: ovl: check on mount time if upper fs supports setting xattr xattr are needed by overlayfs for setting opaque dir, redirect dir and copy up origin. Check at mount time by trying to set the overlay.opaque xattr on the workdir and if that fails issue a warning message. 
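As an aside, the mount-time probe described above can be reproduced from user space. The following is a minimal sketch only, not part of the patch: the xattr name mirrors the one used by the check, while the program name and directory argument are assumptions for the demo (run it as root against the upper/work directory).

/*
 * probe_trusted_xattr.c - illustrative sketch only: try to set, then remove,
 * a trusted.* xattr on a directory, the same probe the mount code performs.
 */
#include <stdio.h>
#include <errno.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <workdir>\n", argv[0]);
                return 1;
        }
        if (setxattr(argv[1], "trusted.overlay.opaque", "0", 1, 0) == 0) {
                /* Supported: remove the probe value again. */
                removexattr(argv[1], "trusted.overlay.opaque");
                printf("%s: trusted xattrs supported\n", argv[1]);
        } else if (errno == ENOTSUP) {
                printf("%s: trusted xattrs not supported\n", argv[1]);
        } else {
                perror("setxattr");
                return 1;
        }
        return 0;
}

If the filesystem lacks trusted xattr support, setxattr(2) fails with ENOTSUP, which is the same condition that makes the mount code set ufs->noxattr and print the warning.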
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 3 +++ fs/overlayfs/ovl_entry.h | 1 + fs/overlayfs/super.c | 13 +++++++++++++ fs/overlayfs/util.c | 21 +++++++++++++++++++++ 4 files changed, 38 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index caa36cb9c46d..ce7c3aba61e4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -279,3 +279,6 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index b2023ddb8532..ad86c0a302eb 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -28,6 +28,7 @@ struct ovl_fs { /* creds of process who forced instantiation of super block */ const struct cred *creator_cred; bool tmpfile; + bool noxattr; wait_queue_head_t copyup_wq; /* sb common to all layers */ struct super_block *same_sb; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9828b7de8999..f1647626a882 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -891,6 +891,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) dput(temp); else pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* + * xattr + */ + err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, + "0", 1, 0); + if (err) { + ufs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); + } } } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index cfdea47313a1..b5a0dc36ee96 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -303,3 +303,24 @@ void ovl_copy_up_end(struct dentry *dentry) wake_up_locked(&ofs->copyup_wq); spin_unlock(&ofs->copyup_wq.lock); } + +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr) +{ + int err; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (ofs->noxattr) + return xerr; + + err = ovl_do_setxattr(upperdentry, name, value, size, 0); + + if (err == -EOPNOTSUPP) { + pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + ofs->noxattr = true; + return xerr; + } + + return err; +} -- cgit v1.2.3 From 6266d465bde044a105f6c2d4e244680f951a2d70 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 18 May 2017 16:11:24 +0200 Subject: ovl: don't fail copy-up if upper doesn't support xattr Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 061a8448e6c4..f92ab35d43a6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -300,7 +300,11 @@ static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, return PTR_ERR(fh); } - err = ovl_do_setxattr(upper, OVL_XATTR_ORIGIN, fh, fh ? fh->len : 0, 0); + /* + * Do not fail when upper doesn't support xattrs. + */ + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, + fh ? 
fh->len : 0, 0); kfree(fh); return err; -- cgit v1.2.3 From 21a228781104ae6fed7e720137ab024575071feb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 17 May 2017 00:12:41 +0300 Subject: ovl: handle rename when upper doesn't support xattr On failure to set opaque/redirect xattr on rename, skip setting xattr and return -EXDEV. On failure to set opaque xattr when creating a new directory, -EIO is returned instead of -EOPNOTSUPP. Any failure to set those xattr will be recorded in super block and then setting any xattr on upper won't be attempted again. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 29 +++++++++++++++++++---------- fs/overlayfs/overlayfs.h | 1 - fs/overlayfs/util.c | 9 +-------- 3 files changed, 20 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 723b98b90698..80e0e202a346 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -127,17 +127,28 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, return err; } -static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) +static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, + int xerr) { int err; - err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); if (!err) ovl_dentry_set_opaque(dentry); return err; } +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) +{ + /* + * Fail with -EIO when trying to create opaque dir and upper doesn't + * support xattrs. ovl_rename() calls ovl_set_opaque_xerr(-EXDEV) to + * return a specific error for noxattr case. + */ + return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); +} + /* Common operations required to be done after creation of file on upper */ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) @@ -846,18 +857,16 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) if (IS_ERR(redirect)) return PTR_ERR(redirect); - err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, - redirect, strlen(redirect), 0); + err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + OVL_XATTR_REDIRECT, + redirect, strlen(redirect), -EXDEV); if (!err) { spin_lock(&dentry->d_lock); ovl_dentry_set_redirect(dentry, redirect); spin_unlock(&dentry->d_lock); } else { kfree(redirect); - if (err == -EOPNOTSUPP) - ovl_clear_redirect_dir(dentry->d_sb); - else - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; } @@ -992,7 +1001,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (!old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque(old, olddentry); + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); if (err) goto out_dput; } @@ -1000,7 +1009,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); else if (!new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque(new, newdentry); + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); if (err) goto out_dput; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ce7c3aba61e4..505b18b56330 100644 --- a/fs/overlayfs/overlayfs.h +++ 
b/fs/overlayfs/overlayfs.h @@ -206,7 +206,6 @@ bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_redirect_dir(struct super_block *sb); -void ovl_clear_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b5a0dc36ee96..4d541a8d0834 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -191,14 +191,7 @@ bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->config.redirect_dir; -} - -void ovl_clear_redirect_dir(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - ofs->config.redirect_dir = false; + return ofs->config.redirect_dir && !ofs->noxattr; } const char *ovl_dentry_get_redirect(struct dentry *dentry) -- cgit v1.2.3 From 3d27573ce32b47ba54e6680c77c26a700d67cc16 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 May 2017 09:33:49 +0200 Subject: ovl: remove unused arg from ovl_lookup_temp() Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/dir.c | 8 ++++---- fs/overlayfs/overlayfs.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f92ab35d43a6..843ed2a2d7db 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -346,7 +346,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (tmpfile) temp = ovl_do_tmpfile(upperdir, stat->mode); else - temp = ovl_lookup_temp(workdir, dentry); + temp = ovl_lookup_temp(workdir); err = 0; if (IS_ERR(temp)) { err = PTR_ERR(temp); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 80e0e202a346..369ee7c4cdc0 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -41,7 +41,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) } } -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -68,7 +68,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, struct dentry *whiteout; struct inode *wdir = workdir->d_inode; - whiteout = ovl_lookup_temp(workdir, dentry); + whiteout = ovl_lookup_temp(workdir); if (IS_ERR(whiteout)) return whiteout; @@ -261,7 +261,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_lookup_temp(workdir, dentry); + opaquedir = ovl_lookup_temp(workdir); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; @@ -393,7 +393,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - newdentry = ovl_lookup_temp(workdir, dentry); + newdentry = ovl_lookup_temp(workdir); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 505b18b56330..7c56932bfc2f 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -262,7 +262,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +struct dentry 
*ovl_lookup_temp(struct dentry *workdir); struct cattr { dev_t rdev; umode_t mode; -- cgit v1.2.3 From ee1d6d37b6b884383b501089be93ce94f2153028 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 11 May 2017 16:42:26 +0300 Subject: ovl: mark upper dir with type origin entries "impure" When moving a merge dir or non-dir with copy up origin into a non-merge upper dir (a.k.a pure upper dir), we are marking the target parent dir "impure". ovl_iterate() iterates pure upper dirs directly, because there is no need to filter out whiteouts and merge dir content with lower dir. But for the case of an "impure" upper dir, ovl_iterate() will not be able to iterate the real upper dir directly, because it will need to lookup the origin inode and use it to fill d_ino. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/namei.c | 18 ++++++++++++++++-- fs/overlayfs/overlayfs.h | 3 +++ fs/overlayfs/ovl_entry.h | 1 + fs/overlayfs/util.c | 14 ++++++++++++++ 5 files changed, 79 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 369ee7c4cdc0..f2a118ba00e4 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -149,6 +149,22 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } +static int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + int err; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, + "y", 1, 0); + if (!err) + ovl_dentry_set_impure(dentry); + + return err; +} + /* Common operations required to be done after creation of file on upper */ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) @@ -173,6 +189,11 @@ static bool ovl_type_merge(struct dentry *dentry) return OVL_TYPE_MERGE(ovl_path_type(dentry)); } +static bool ovl_type_origin(struct dentry *dentry) +{ + return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -952,6 +973,30 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_upperdir = ovl_dentry_upper(old->d_parent); new_upperdir = ovl_dentry_upper(new->d_parent); + if (!samedir) { + /* + * When moving a merge dir or non-dir with copy up origin into + * a non-merge upper dir (a.k.a pure upper dir), we are making + * the target parent dir "impure". ovl_iterate() iterates pure + * upper dirs directly, because there is no need to filter out + * whiteouts and merge dir content with lower dir. But for the + * case of an "impure" upper dir, ovl_iterate() cannot iterate + * the real directory directly, because it looks for the inode + * numbers to fill d_ino in the entries origin inode. 
+ */ + if (ovl_type_origin(old) && !ovl_type_merge(new->d_parent)) { + err = ovl_set_impure(new->d_parent, new_upperdir); + if (err) + goto out_revert_creds; + } + if (!overwrite && ovl_type_origin(new) && + !ovl_type_merge(old->d_parent)) { + err = ovl_set_impure(old->d_parent, old_upperdir); + if (err) + goto out_revert_creds; + } + } + trap = lock_rename(new_upperdir, old_upperdir); olddentry = lookup_one_len(old->d_name.name, old_upperdir, diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index bad0f665a635..0c72a5909db2 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -167,7 +167,7 @@ invalid: goto out; } -static bool ovl_is_opaquedir(struct dentry *dentry) +static bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) { int res; char val; @@ -175,13 +175,23 @@ static bool ovl_is_opaquedir(struct dentry *dentry) if (!d_is_dir(dentry)) return false; - res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); + res = vfs_getxattr(dentry, name, &val, 1); if (res == 1 && val == 'y') return true; return false; } +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); +} + +static bool ovl_is_impuredir(struct dentry *dentry) +{ + return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); +} + static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, @@ -351,6 +361,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; + bool upperimpure = false; char *upperredirect = NULL; struct dentry *this; unsigned int i; @@ -395,6 +406,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, poe = roe; } upperopaque = d.opaque; + if (upperdentry && d.is_dir) + upperimpure = ovl_is_impuredir(upperdentry); } if (!d.stop && poe->numlower) { @@ -463,6 +476,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, revert_creds(old_cred); oe->opaque = upperopaque; + oe->impure = upperimpure; oe->redirect = upperredirect; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 7c56932bfc2f..a9fb958fd5d4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -24,6 +24,7 @@ enum ovl_path_type { #define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" #define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" +#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, @@ -203,8 +204,10 @@ struct dentry *ovl_dentry_real(struct dentry *dentry); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); +bool ovl_dentry_is_impure(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +void ovl_dentry_set_impure(struct dentry *dentry); bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index ad86c0a302eb..34bc4a9f5c61 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -43,6 +43,7 @@ 
struct ovl_entry { u64 version; const char *redirect; bool opaque; + bool impure; bool copying; }; struct rcu_head rcu; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 4d541a8d0834..e0dfb07d5457 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -175,6 +175,13 @@ bool ovl_dentry_is_opaque(struct dentry *dentry) return oe->opaque; } +bool ovl_dentry_is_impure(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->impure; +} + bool ovl_dentry_is_whiteout(struct dentry *dentry) { return !dentry->d_inode && ovl_dentry_is_opaque(dentry); @@ -187,6 +194,13 @@ void ovl_dentry_set_opaque(struct dentry *dentry) oe->opaque = true; } +void ovl_dentry_set_impure(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->impure = true; +} + bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; -- cgit v1.2.3 From 964edf66bf9ab70cb387b27946c0aef7b94c4d1b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 21 May 2017 22:31:23 -0400 Subject: ext4: clear lockdep subtype for quota files on quota off Quota files have special ranking of i_data_sem lock. We inform lockdep about it when turning on quotas however when turning quotas off, we don't clear the lockdep subclass from i_data_sem lock and thus when the inode gets later reused for a normal file or directory, lockdep gets confused and complains about possible deadlocks. Fix the problem by resetting lockdep subclass of i_data_sem on quota off. Cc: stable@vger.kernel.org Fixes: daf647d2dd58cec59570d7698a45b98e580f2076 Reported-and-tested-by: Ross Zwisler Reviewed-by: Andreas Dilger Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0b177da9ea82..8f05c72bc4ec 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -848,14 +848,9 @@ static inline void ext4_quota_off_umount(struct super_block *sb) { int type; - if (ext4_has_feature_quota(sb)) { - dquot_disable(sb, -1, - DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - } else { - /* Use our quota_off function to clear inode flags etc. */ - for (type = 0; type < EXT4_MAXQUOTAS; type++) - ext4_quota_off(sb, type); - } + /* Use our quota_off function to clear inode flags etc. */ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); } #else static inline void ext4_quota_off_umount(struct super_block *sb) @@ -5485,7 +5480,7 @@ static int ext4_quota_off(struct super_block *sb, int type) goto out; err = dquot_quota_off(sb, type); - if (err) + if (err || ext4_has_feature_quota(sb)) goto out_put; inode_lock(inode); @@ -5505,6 +5500,7 @@ static int ext4_quota_off(struct super_block *sb, int type) out_unlock: inode_unlock(inode); out_put: + lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: -- cgit v1.2.3 From b4709067ac0944e4a5b94eabdc26155c6f2efbd7 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 21 May 2017 22:32:23 -0400 Subject: jbd2: preserve original nofs flag during journal restart When a transaction starts, start_this_handle() saves current PF_MEMALLOC_NOFS value so that it can be restored at journal stop time. Journal restart is a special case that calls start_this_handle() without stopping the transaction. start_this_handle() isn't aware that the original value is already stored so it overwrites it with current value. 
For instance, a call sequence like below leaves PF_MEMALLOC_NOFS flag set at the end: jbd2_journal_start() jbd2__journal_restart() jbd2_journal_stop() Make jbd2__journal_restart() restore the original value before calling start_this_handle(). Fixes: 81378da64de6 ("jbd2: mark the transaction context with the scope GFP_NOFS context") Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd2/transaction.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9ee4832b6f8b..2d30a6da7013 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -680,6 +680,12 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); handle->h_buffer_credits = nblocks; + /* + * Restore the original nofs context because the journal restart + * is basically the same thing as journal stop and start. + * start_this_handle will start a new nofs context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); ret = start_this_handle(journal, handle, gfp_mask); return ret; } -- cgit v1.2.3 From 7d95eddf313c88b24f99d4ca9c2411a4b82fef33 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 21 May 2017 22:33:23 -0400 Subject: ext4: fix SEEK_HOLE Currently, SEEK_HOLE implementation in ext4 may both return that there's a hole at some offset although that offset already has data and skip some holes during a search for the next hole. The first problem is demostrated by: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (2.054 GiB/sec and 538461.5385 ops/sec) Whence Result HOLE 0 Where we can see that SEEK_HOLE wrongly returned offset 0 as containing a hole although we have written data there. The second problem can be demonstrated by: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (1.978 GiB/sec and 518518.5185 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (2 GiB/sec and 500000.0000 ops/sec) Whence Result HOLE 139264 Where we can see that hole at offsets 56k..128k has been ignored by the SEEK_HOLE call. The underlying problem is in the ext4_find_unwritten_pgoff() which is just buggy. In some cases it fails to update returned offset when it finds a hole (when no pages are found or when the first found page has higher index than expected), in some cases conditions for detecting hole are just missing (we fail to detect a situation where indices of returned pages are not contiguous). Fix ext4_find_unwritten_pgoff() to properly detect non-contiguous page indices and also handle all cases where we got less pages then expected in one place and handle it properly there. 
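For reference, the xfs_io "seek -h 0" steps in the examples above boil down to an lseek(2) call with SEEK_HOLE. A minimal sketch, not part of the patch (the file path is a demo argument):

/* seek_hole.c - reproduce the "seek -h 0" query with plain lseek(2). */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;
        off_t hole;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        hole = lseek(fd, 0, SEEK_HOLE);   /* first hole at or after offset 0 */
        if (hole < 0)
                perror("lseek");
        else
                printf("Whence\tResult\nHOLE\t%lld\n", (long long)hole);
        close(fd);
        return 0;
}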
CC: stable@vger.kernel.org Fixes: c8c0df241cc2719b1262e627f999638411934f60 CC: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 831fd6beebf0..bbea2dccd584 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -484,47 +484,27 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; - break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -564,20 +544,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. */ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; -- cgit v1.2.3 From 3f1d5bad3fae983da07be01cff2fde13293bb7b9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 21 May 2017 22:34:23 -0400 Subject: ext4: fix off-by-in in loop termination in ext4_find_unwritten_pgoff() There is an off-by-one error in loop termination conditions in ext4_find_unwritten_pgoff() since 'end' may index a page beyond end of desired range if 'endoff' is page aligned. It doesn't have any visible effects but still it is good to fix it. 
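A small worked example of that off-by-one, using assumed values (4 KiB pages, so PAGE_SHIFT is 12) and plain C stand-ins rather than kernel code:

/* Illustration only: a page-aligned endoff of 8192 covers pages 0..1. */
#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;           /* 4 KiB pages */
        unsigned long long endoff = 8192;       /* page-aligned end offset */

        printf("old: end = %llu\n", endoff >> page_shift);        /* 2: one page too far */
        printf("new: end = %llu\n", (endoff - 1) >> page_shift);  /* 1: last page in range */
        return 0;
}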
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bbea2dccd584..2b00bf84c05b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -474,7 +474,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_SHIFT; - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; pagevec_init(&pvec, 0); do { -- cgit v1.2.3 From 9651e6b2e20648d04d5e1fe6479a3056047e8781 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 21 May 2017 22:35:23 -0400 Subject: ext4: handle the rest of ext4_mb_load_buddy() ENOMEM errors I've got another report about breaking ext4 by ENOMEM error returned from ext4_mb_load_buddy() caused by memory shortage in memory cgroup. This time inside ext4_discard_preallocations(). This patch replaces ext4_error() with ext4_warning() where errors returned from ext4_mb_load_buddy() are not fatal and handled by caller: * ext4_mb_discard_group_preallocations() - called before generating ENOSPC, we'll try to discard other group or return ENOSPC into user-space. * ext4_trim_all_free() - just stop trimming and return ENOMEM from ioctl. Some callers cannot handle errors, thus __GFP_NOFAIL is used for them: * ext4_discard_preallocations() * ext4_mb_discard_lg_preallocations() Fixes: adb7ef600cc9 ("ext4: use __GFP_NOFAIL in ext4_free_blocks()") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5083bce20ac4..b7928cddd539 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3887,7 +3887,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, "Error loading buddy information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + err, group); put_bh(bitmap_bh); return 0; } @@ -4044,10 +4045,11 @@ repeat: BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_error(sb, "Error loading buddy information for %u", - group); + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } @@ -4303,11 +4305,14 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + int err; group = ext4_get_group_number(sb, pa->pa_pstart); - if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, "Error loading buddy information for %u", - group); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) { + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); continue; } ext4_lock_group(sb, group); @@ -5127,8 +5132,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); return ret; } bitmap = e4b.bd_bitmap; -- cgit v1.2.3 From 887a9730614727c4fff7cb756711b190593fc1df Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 21 May 
2017 22:36:23 -0400 Subject: ext4: keep existing extra fields when inode expands ext4_expand_extra_isize() should clear only space between old and new size. Fixes: 6dd4ee7cab7e # v2.6.23 Cc: stable@vger.kernel.org Signed-off-by: Konstantin Khlebnikov Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1bd0bfa547f6..7cd99de8c4ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5637,8 +5637,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } -- cgit v1.2.3 From 9a307403d374b993061f5992a6e260c944920d0b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 23 May 2017 12:24:40 -0400 Subject: nfsd4: fix null dereference on replay if we receive a compound such that: - the sessionid, slot, and sequence number in the SEQUENCE op match a cached succesful reply with N ops, and - the Nth operation of the compound is a PUTFH, PUTPUBFH, PUTROOTFH, or RESTOREFH, then nfsd4_sequence will return 0 and set cstate->status to nfserr_replay_cache. The current filehandle will not be set. This will cause us to call check_nfsd_access with first argument NULL. To nfsd4_compound it looks like we just succesfully executed an operation that set a filehandle, but the current filehandle is not set. Fix this by moving the nfserr_replay_cache earlier. There was never any reason to have it after the encode_op label, since the only case where he hit that is when opdesc->op_func sets it. Note that there are two ways we could hit this case: - a client is resending a previously sent compound that ended with one of the four PUTFH-like operations, or - a client is sending a *new* compound that (incorrectly) shares sessionid, slot, and sequence number with a previously sent compound, and the length of the previously sent compound happens to match the position of a PUTFH-like operation in the new compound. The second is obviously incorrect client behavior. The first is also very strange--the only purpose of a PUTFH-like operation is to set the current filehandle to be used by the following operation, so there's no point in having it as the last in a compound. So it's likely this requires a buggy or malicious client to reproduce. Reported-by: Scott Mayhew Cc: stable@kernel.vger.org Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4proc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c453a1998e00..dadb3bf305b2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1769,6 +1769,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1779,14 +1785,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(&resp->xdr, op); -- cgit v1.2.3 From 0f0b9b63e14fc3f66e4d342df016c9b071c5abed Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 2 May 2017 13:14:13 +0200 Subject: gfs2: Make flush bios explicitely sync Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as synchronous" removed REQ_SYNC flag from WRITE_{FUA|PREFLUSH|...} definitions. generic_make_request_checks() however strips REQ_FUA and REQ_PREFLUSH flags from a bio when the storage doesn't report volatile write cache and thus write effectively becomes asynchronous which can lead to performance regressions Fix the problem by making sure all bios which are synchronous are properly marked with REQ_SYNC. Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3 CC: Steven Whitehouse CC: cluster-devel@redhat.com CC: stable@vger.kernel.org Acked-by: Bob Peterson Signed-off-by: Jan Kara --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f865b96374df..d2955daf17a4 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -659,7 +659,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); -- cgit v1.2.3 From d8747d642ec4ce96adf17ae35652a5e4015cfe02 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 2 May 2017 13:16:18 +0200 Subject: reiserfs: Make flush bios explicitely sync Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as synchronous" removed REQ_SYNC flag from WRITE_{FUA|PREFLUSH|...} definitions. generic_make_request_checks() however strips REQ_FUA and REQ_PREFLUSH flags from a bio when the storage doesn't report volatile write cache and thus write effectively becomes asynchronous which can lead to performance regressions Fix the problem by making sure all bios which are synchronous are properly marked with REQ_SYNC. 
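To illustrate why the missing REQ_SYNC matters for btrfs, gfs2 and reiserfs alike, here is a toy model; the flag values and helper functions below are invented for the illustration and are not the kernel's:

/* Toy model only: what stripping FUA/PREFLUSH does on cache-less storage. */
#include <stdio.h>

#define REQ_SYNC     (1u << 0)
#define REQ_FUA      (1u << 1)
#define REQ_PREFLUSH (1u << 2)

/* Mimics the stripping done when the device reports no volatile write cache. */
static unsigned int strip_flush_flags(unsigned int opf)
{
        return opf & ~(REQ_FUA | REQ_PREFLUSH);
}

/* Per the commit message, only REQ_SYNC still marks a write as synchronous. */
static int op_is_sync(unsigned int opf)
{
        return !!(opf & REQ_SYNC);
}

int main(void)
{
        unsigned int old_bio = REQ_PREFLUSH | REQ_FUA;             /* before the fix */
        unsigned int new_bio = REQ_SYNC | REQ_PREFLUSH | REQ_FUA;  /* after the fix */

        printf("old flush bio still sync? %d\n", op_is_sync(strip_flush_flags(old_bio)));  /* 0 */
        printf("new flush bio still sync? %d\n", op_is_sync(strip_flush_flags(new_bio)));  /* 1 */
        return 0;
}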
Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3 CC: reiserfs-devel@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/reiserfs/journal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index da01f497180a..39bb1e838d8d 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1112,7 +1112,7 @@ static int flush_commit_list(struct super_block *s, depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1271,7 +1271,7 @@ static int _update_journal_header_block(struct super_block *sb, if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); -- cgit v1.2.3 From 6d3b5d8d8dd1c14f991ccab84b40f8425f1ae91b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 16 May 2017 12:00:15 -0400 Subject: NFS fix COMMIT after COPY Fix a typo in the commit e092693443b995c8e3a565a73b5fdb05f1260f9b "NFS append COMMIT after synchronous COPY" Reported-by: Eryu Guan Fixes: e092693443b ("NFS append COMMIT after synchronous COPY") Signed-off-by: Olga Kornievskaia Tested-by: Eryu Guan Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 929d09a5310a..319a47db218d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -177,7 +177,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) goto out; - if (!nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, &res->commit_res.verf->verifier)) { status = -EAGAIN; goto out; -- cgit v1.2.3 From 662f9a105b4322b8559d448f86110e6ec24b8738 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 20 May 2017 00:31:12 +0300 Subject: pNFS/flexfiles: missing error code in ff_layout_alloc_lseg() If xdr_inline_decode() fails then we end up returning ERR_PTR(0). The caller treats NULL returns as -ENOMEM so it doesn't really hurt runtime, but obviously we intended to set an error code here. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f5714ee01000..23542dc44a25 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -454,6 +454,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; /* fh */ + rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; -- cgit v1.2.3 From 08cb5b0f058a325fcb5305e33f572ff6d6dfa289 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 22 May 2017 20:20:23 -0400 Subject: pnfs: Fix the check for requests in range of layout segment It's possible and acceptable for NFS to attempt to add requests beyond the range of the current pgio->pg_lseg, a case which should be caught and limited by the pg_test operation. However, the current handling of this case replaces pgio->pg_lseg with a new layout segment (after a WARN) within that pg_test operation. 
That will cause all the previously added requests to be submitted with this new layout segment, which may not be valid for those requests. Fix this problem by only returning zero for the number of bytes to coalesce from pg_test for this case which allows any previously added requests to complete on the current layout segment. The check for requests starting out of range of the layout segment moves to pg_init, so that the replacement of pgio->pg_lseg will be done when the next request is added. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 25 +++++++++++++++++-------- fs/nfs/pnfs.h | 10 ++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index adc6ec28d4b5..c383d0913b54 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2094,12 +2094,26 @@ pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); +/* + * Check for any intersection between the request and the pgio->pg_lseg, + * and if none, put this pgio->pg_lseg away. + */ +static void +pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } +} + void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2131,6 +2145,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -2191,16 +2206,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); - WARN_ON_ONCE(req_start >= seg_end); + /* start of request is past the last byte of this segment */ - if (req_start >= seg_end) { - /* reference the new lseg */ - if (pgio->pg_ops->pg_cleanup) - pgio->pg_ops->pg_cleanup(pgio); - if (pgio->pg_ops->pg_init) - pgio->pg_ops->pg_init(pgio, req); + if (req_start >= seg_end) return 0; - } /* adjust 'size' iff there are fewer bytes left in the * segment than what nfs_generic_pg_test returned */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2d05b756a8d6..99731e3e332f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -593,6 +593,16 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); } +static inline bool +pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req) +{ + u64 seg_last = pnfs_end_offset(lseg->pls_range.offset, lseg->pls_range.length); + u64 req_last = req_offset(req) + req->wb_bytes; + + return pnfs_is_range_intersecting(lseg->pls_range.offset, seg_last, + req_offset(req), req_last); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG -- cgit v1.2.3 From b49c15f97c936ef5a536821f97e4dd8568369802 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 May 2017 07:55:44 -0400 Subject: NFSv4.0: Fix a lock leak in nfs40_walk_client_list Xiaolong Ye's kernel test robot detected the following 
Oops: [ 299.158991] BUG: scheduling while atomic: mount.nfs/9387/0x00000002 [ 299.169587] 2 locks held by mount.nfs/9387: [ 299.176165] #0: (nfs_clid_init_mutex){......}, at: [] nfs4_discover_server_trunking+0x47/0x1fc [ 299.201802] #1: (&(&nn->nfs_client_lock)->rlock){......}, at: [] nfs40_walk_client_list+0x2e9/0x338 [ 299.221979] CPU: 0 PID: 9387 Comm: mount.nfs Not tainted 4.11.0-rc7-00021-g14d1bbb #45 [ 299.235584] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014 [ 299.251176] Call Trace: [ 299.255192] dump_stack+0x61/0x7e [ 299.260416] __schedule_bug+0x65/0x74 [ 299.266208] __schedule+0x5d/0x87c [ 299.271883] schedule+0x89/0x9a [ 299.276937] schedule_timeout+0x232/0x289 [ 299.283223] ? detach_if_pending+0x10b/0x10b [ 299.289935] schedule_timeout_uninterruptible+0x2a/0x2c [ 299.298266] ? put_rpccred+0x3e/0x115 [ 299.304327] ? schedule_timeout_uninterruptible+0x2a/0x2c [ 299.312851] msleep+0x1e/0x22 [ 299.317612] nfs4_discover_server_trunking+0x102/0x1fc [ 299.325644] nfs4_init_client+0x13f/0x194 It looks as if we recently added a spin_lock() leak to nfs40_walk_client_list() when cleaning up the code. Reported-by: kernel test robot Fixes: 14d1bbb0ca42 ("NFS: Create a common nfs4_match_client() function") Cc: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 692a7a8bfc7a..66776f022111 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -582,7 +582,6 @@ int nfs40_walk_client_list(struct nfs_client *new, */ nfs4_schedule_path_down_recovery(pos); default: - spin_lock(&nn->nfs_client_lock); goto out; } -- cgit v1.2.3 From 624327f8794704c5066b11a52f9da6a09dce7f9a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Wed, 24 May 2017 18:02:20 -0400 Subject: ext4: fix off-by-one on max nr_pages in ext4_find_unwritten_pgoff() ext4_find_unwritten_pgoff() is used to search for offset of hole or data in page range [index, end] (both inclusive), and the max number of pages to search should be at least one, if end == index. Otherwise the only page is missed and no hole or data is found, which is not correct. When block size is smaller than page size, this can be demonstrated by preallocating a file with size smaller than page size and writing data to the last block. E.g. run this xfs_io command on a 1k block size ext4 on x86_64 host. # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ -c "seek -d 0" /mnt/ext4/testfile wrote 1024/1024 bytes at offset 2048 1 KiB, 1 ops; 0.0000 sec (42.459 MiB/sec and 43478.2609 ops/sec) Whence Result DATA EOF Data at offset 2k was missed, and lseek(2) returned ENXIO. This is unconvered by generic/285 subtest 07 and 08 on ppc64 host, where pagesize is 64k. Because a recent change to generic/285 reduced the preallocated file size to smaller than 64k. 
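To make the off-by-one concrete, here is a minimal user-space sketch of the old and new formulas (an illustration, not part of the patch; PAGEVEC_SIZE and min_t() are modelled locally, and the value 14 is only assumed to match the kernel of that era):

  #include <stdio.h>

  typedef unsigned long pgoff_t;

  #define PAGEVEC_SIZE 14UL  /* assumed value, for illustration only */
  #define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

  int main(void)
  {
          pgoff_t index = 100, end = 100;  /* search range [index, end], both inclusive */
          pgoff_t old_num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
          pgoff_t new_num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;

          /* old formula asks pagevec_lookup() for 0 pages, so the only page
           * in the range is never examined; the new one asks for 1 */
          printf("old num = %lu, new num = %lu\n", old_num, new_num);
          return 0;
  }

In other words the new expression computes min(end - index + 1, PAGEVEC_SIZE), the inclusive page count of [index, end] capped at one pagevec.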
Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2b00bf84c05b..02ce7e7bbdf5 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -481,7 +481,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, int i, num; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); if (nr_pages == 0) -- cgit v1.2.3 From e5465795cac48233c9b606a7a419f4190a91851e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 24 May 2017 18:05:29 -0400 Subject: ext4: fix off-by-one error when writing back pages before dio read The 'lend' argument of filemap_write_and_wait_range() is inclusive, so we need to subtract 1 from pos + count. Note that 'count' is guaranteed to be nonzero since ext4_file_read_iter() returns early when given a 0 count. Fixes: 16c54688592c ("ext4: Allow parallel DIO reads") Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7cd99de8c4ba..90b264928742 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3713,7 +3713,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) */ inode_lock_shared(inode); ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); + iocb->ki_pos + count - 1); if (ret) goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, -- cgit v1.2.3 From d6b975504e7e71fc29fcd14530433e816d7f5aac Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 24 May 2017 18:10:49 -0400 Subject: ext4: remove unused d_name argument from ext4_search_dir() et al. Now that we are passing a struct ext4_filename, we do not need to pass around the original struct qstr too. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 2 -- fs/ext4/inline.c | 5 ++--- fs/ext4/namei.c | 13 +++++-------- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8e8046104f4d..32191548abed 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2523,7 +2523,6 @@ extern int ext4_search_dir(struct buffer_head *bh, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, @@ -3007,7 +3006,6 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d5dea4c293ef..8d141c0c8ff9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1627,7 +1627,6 @@ out: struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { @@ -1649,7 +1648,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1662,7 +1661,7 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, - dir, fname, d_name, 0, res_dir); + dir, fname, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b81f7d46f344..404256caf9cf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1155,12 +1155,11 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - fname, d_name, offset, res_dir); + fname, offset, res_dir); } /* @@ -1262,7 +1261,6 @@ static inline bool ext4_match(const struct ext4_filename *fname, */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, - const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; @@ -1355,7 +1353,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) @@ -1447,7 +1445,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, &fname, d_name, + i = search_dirblock(bh, dir, &fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1488,7 +1486,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, { struct super_block * sb = dir->i_sb; struct dx_frame frames[2], *frame; - const struct qstr *d_name = 
fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1505,7 +1502,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, fname, d_name, + retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1530,7 +1527,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, bh = NULL; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; -- cgit v1.2.3 From c41d342b39f15acbdc61948bab0cb3c567ec992a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 24 May 2017 18:20:31 -0400 Subject: ext4: remove redundant check for encrypted file on dio write path Currently we don't allow direct I/O on encrypted regular files, so in such cases we return 0 early in ext4_direct_IO(). There was also an additional BUG_ON() check in ext4_direct_IO_write(), but it can never be hit because of the earlier check for the exact same condition in ext4_direct_IO(). There was also no matching check on the read path, which made the write path specific check seem very ad-hoc. Just remove the unnecessary BUG_ON(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: David Gstir Reviewed-by: Jan Kara --- fs/ext4/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 90b264928742..074aeba78259 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3629,9 +3629,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION - BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); -#endif ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); -- cgit v1.2.3 From b8cb5a545c3dd8b975aad19ea020eabe0a888e8d Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 24 May 2017 18:24:07 -0400 Subject: ext4: fix quota charging for shared xattr blocks ext4_xattr_block_set() calls dquot_alloc_block() to charge for an xattr block when new references are made. However if dquot_initialize() hasn't been called on an inode, request for charging is effectively ignored because ext4_inode_info->i_dquot is not initialized yet. Add dquot_initialize() to call paths that lead to ext4_xattr_block_set(). 
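For illustration only, a toy user-space model of why the charge is silently lost when quota was never initialized for the inode (the names and one-field structs below are stand-ins, not the kernel's real quota code):

  #include <stdio.h>

  struct toy_dquot { long long used_blocks; };
  struct toy_inode { struct toy_dquot *i_dquot; };

  /* loosely modelled on dquot_alloc_block(): charging is a no-op when the
   * per-inode dquot pointer was never set up */
  static void toy_charge_block(struct toy_inode *inode, long long nr)
  {
          if (!inode->i_dquot)
                  return;                 /* charge silently dropped */
          inode->i_dquot->used_blocks += nr;
  }

  int main(void)
  {
          struct toy_dquot dq = { 0 };
          struct toy_inode not_initialized = { NULL };
          struct toy_inode initialized = { &dq };

          toy_charge_block(&not_initialized, 1);  /* the bug: xattr block never billed */
          toy_charge_block(&initialized, 1);      /* after dquot_initialize(): billed */
          printf("blocks charged: %lld\n", dq.used_blocks);
          return 0;
  }

The hunks below follow the same pattern: call dquot_initialize() (and bail out on error) before starting the journal handle on each path that can reach ext4_xattr_block_set().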
Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/acl.c | 4 ++++ fs/ext4/super.c | 3 +++ fs/ext4/xattr.c | 8 ++++++++ fs/quota/dquot.c | 16 ++++++++++++++++ include/linux/quotaops.h | 6 ++++++ 5 files changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fd389935ecd1..3ec0e46de95f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -4,6 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, */ +#include #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -232,6 +233,9 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) handle_t *handle; int error, retries = 0; + error = dquot_initialize(inode); + if (error) + return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f05c72bc4ec..d37c81f327e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1174,6 +1174,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, return res; } + res = dquot_initialize(inode); + if (res) + return res; retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8fb7ce14e6eb..5d3c2536641c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -888,6 +888,8 @@ inserted: else { u32 ref; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -954,6 +956,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal, block; + WARN_ON_ONCE(dquot_initialize_needed(inode)); + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); @@ -1166,6 +1170,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, return -EINVAL; if (strlen(name) > 255) return -ERANGE; + ext4_write_lock_xattr(inode, &no_expand); error = ext4_reserve_inode_write(handle, inode, &is.iloc); @@ -1267,6 +1272,9 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, int error, retries = 0; int credits = ext4_jbd2_credits_xattr(inode); + error = dquot_initialize(inode); + if (error) + return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ebf80c7739e1..48813aeaab80 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1512,6 +1512,22 @@ int dquot_initialize(struct inode *inode) } EXPORT_SYMBOL(dquot_initialize); +bool dquot_initialize_needed(struct inode *inode) +{ + struct dquot **dquots; + int i; + + if (!dquot_active(inode)) + return false; + + dquots = i_dquot(inode); + for (i = 0; i < MAXQUOTAS; i++) + if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) + return true; + return false; +} +EXPORT_SYMBOL(dquot_initialize_needed); + /* * Release all quotas referenced by inode. 
* diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 9c6f768b7d32..dda22f45fc1b 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -44,6 +44,7 @@ void inode_sub_rsv_space(struct inode *inode, qsize_t number); void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); int dquot_initialize(struct inode *inode); +bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) @@ -207,6 +208,11 @@ static inline int dquot_initialize(struct inode *inode) return 0; } +static inline bool dquot_initialize_needed(struct inode *inode) +{ + return false; +} + static inline void dquot_drop(struct inode *inode) { } -- cgit v1.2.3 From 4f8caa60a5a13a78f26198618f21774bd6aa6498 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 26 May 2017 17:40:52 -0400 Subject: ext4: fix data corruption with EXT4_GET_BLOCKS_ZERO When ext4_map_blocks() is called with EXT4_GET_BLOCKS_ZERO to zero-out allocated blocks and these blocks are actually converted from unwritten extent the following race can happen: CPU0 CPU1 page fault page fault ... ... ext4_map_blocks() ext4_ext_map_blocks() ext4_ext_handle_unwritten_extents() ext4_ext_convert_to_initialized() - zero out converted extent ext4_zeroout_es() - inserts extent as initialized in status tree ext4_map_blocks() ext4_es_lookup_extent() - finds initialized extent write data ext4_issue_zeroout() - zeroes out new extent overwriting data This problem can be reproduced by generic/340 for the fallocated case for the last block in the file. Fix the problem by avoiding zeroing out the area we are mapping with ext4_map_blocks() in ext4_ext_convert_to_initialized(). It is pointless to zero out this area in the first place as the caller asked us to convert the area to initialized because he is just going to write data there before the transaction finishes. To achieve this we delete the special case of zeroing out full extent as that will be handled by the cases below zeroing only the part of the extent that needs it. We also instruct ext4_split_extent() that the middle of extent being split contains data so that ext4_split_extent_at() cannot zero out full extent in case of ENOSPC. 
CC: stable@vger.kernel.org Fixes: 12735f881952c32b31bc4e433768f18489f79ec9 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 80 +++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2a97dff87b96..83a513efc824 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; + struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; - int split_flag = 0; + int split_flag = EXT4_EXT_DATA_VALID2; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, @@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - zero_ex.ee_len = 0; + zero_ex1.ee_len = 0; + zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (ext4_encrypted_inode(inode)) max_zeroout = 0; - /* If extent is less than s_max_zeroout_kb, zeroout directly */ - if (max_zeroout && (ee_len <= max_zeroout)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); - ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(handle, inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + path->p_depth); - goto out; - } - /* - * four cases: + * five cases: * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. + * 2. split the extent into two extents, zeroout the head of the first + * extent. + * 3. split the extent into two extents, zeroout the tail of the second + * extent. * 4. split the extent into two extents with out zeroout. + * 5. no splitting needed, just possibly zeroout the head and / or the + * tail of the extent. 
*/ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > map->m_len)) { + if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); + /* case 3 or 5 */ + zero_ex1.ee_block = + cpu_to_le32(split_map.m_lblk + + split_map.m_len); + zero_ex1.ee_len = + cpu_to_le16(allocated - split_map.m_len); + ext4_ext_store_pblock(&zero_ex1, + ext4_ext_pblock(ex) + split_map.m_lblk + + split_map.m_len - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto out; - split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; - } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - + } + if (split_map.m_lblk - ee_block + split_map.m_len < + max_zeroout) { + /* case 2 or 5 */ + if (split_map.m_lblk != ee_block) { + zero_ex2.ee_block = ex->ee_block; + zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); - ext4_ext_store_pblock(&zero_ex, + ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); + err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto out; } + split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } @@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = 0; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) - err = ext4_zeroout_es(inode, &zero_ex); + if (!err) { + err = ext4_zeroout_es(inode, &zero_ex1); + if (!err) + err = ext4_zeroout_es(inode, &zero_ex2); + } return err ? err : allocated; } -- cgit v1.2.3 From a056bdaae7a181f7dcc876cfab2f94538e508709 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 26 May 2017 17:45:45 -0400 Subject: ext4: fix data corruption for mmap writes mpage_submit_page() can race with another process growing i_size and writing data via mmap to the written-back page. As mpage_submit_page() samples i_size too early, it may happen that ext4_bio_write_page() zeroes out too large tail of the page and thus corrupts user data. Fix the problem by sampling i_size only after the page has been write-protected in page tables by clear_page_dirty_for_io() call. Reported-by: Michael Zimmer CC: stable@vger.kernel.org Fixes: cb20d5188366f04d96d2e07b1240cc92170ade40 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 074aeba78259..657a98fa80b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2124,15 +2124,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); + clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. 
clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - clear_page_dirty_for_io(page); err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; -- cgit v1.2.3 From fe3b81b446d4ecb954f1b9dd191164a78fd278ad Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 4 Apr 2017 17:08:42 -0700 Subject: NFS: Use ERR_CAST() to avoid cross-structure cast When the call to nfs_devname() fails, the error path attempts to retain the error via the mnt variable, but this requires a cast across very different types (char * to struct vfsmount *), which the upcoming structure layout randomization plugin flags as being potentially dangerous in the face of randomization. This is a false positive, but what this code actually wants to do is retain the error value, so this patch explicitly sets it, instead of using what seems to be an unexpected cast. Signed-off-by: Kees Cook Acked-by: Trond Myklebust Reviewed-by: Christoph Hellwig --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 1a224a33a6c2..e5686be67be8 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -246,7 +246,7 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, devname = nfs_devname(dentry, page, PAGE_SIZE); if (IS_ERR(devname)) - mnt = (struct vfsmount *)devname; + mnt = ERR_CAST(devname); else mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); -- cgit v1.2.3 From fee2aa753823860f2b8dfe58d98cafe8e4840855 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 May 2017 14:45:26 -0700 Subject: ntfs: Use ERR_CAST() to avoid cross-structure cast When trying to propagate an error result, the error return path attempts to retain the error, but does this with an open cast across very different types, which the upcoming structure layout randomization plugin flags as being potentially dangerous in the face of randomization. This is a false positive, but what this code actually wants to do is use ERR_CAST() to retain the error value. Cc: Anton Altaparmakov Cc: Andrew Morton Signed-off-by: Kees Cook --- fs/ntfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 358258364616..4690cd75d8d7 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -159,7 +159,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, PTR_ERR(dent_inode)); kfree(name); /* Return the error code. */ - return (struct dentry *)dent_inode; + return ERR_CAST(dent_inode); } /* It is guaranteed that @name is no longer allocated at this point. 
*/ if (MREF_ERR(mref) == -ENOENT) { -- cgit v1.2.3 From 7585d12f6555fdf4faaefec34ac58b28555b27d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 May 2017 14:49:27 -0700 Subject: ocfs2: Use ERR_CAST() to avoid cross-structure cast When trying to propagate an error result, the error return path attempts to retain the error, but does this with an open cast across very different types, which the upcoming structure layout randomization plugin flags as being potentially dangerous in the face of randomization. This is a false positive, but what this code actually wants to do is use ERR_CAST() to retain the error value. Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Kees Cook --- fs/ocfs2/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 827fc9809bc2..9f88188060db 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -119,7 +119,7 @@ check_err: if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); - result = (void *)inode; + result = ERR_CAST(inode); goto bail; } -- cgit v1.2.3 From f3a1568582cc207663a4d5e37da790334372855b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 24 May 2017 15:29:33 +0300 Subject: ovl: mark upper merge dir with type origin entries "impure" An upper dir is marked "impure" to let ovl_iterate() know that this directory may contain non pure upper entries whose d_ino may need to be read from the origin inode. We already mark a non-merge dir "impure" when moving a non-pure child entry inside it, to let ovl_iterate() know not to iterate the non-merge dir directly. Mark also a merge dir "impure" when moving a non-pure child entry inside it and when copying up a child entry inside it. This can be used to optimize ovl_iterate() to perform a "pure merge" of upper and lower directories, merging the content of the directories, without having to read d_ino from origin inodes. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 5 +++++ fs/overlayfs/dir.c | 31 +++++-------------------------- fs/overlayfs/namei.c | 20 -------------------- fs/overlayfs/overlayfs.h | 15 +++++++++++---- fs/overlayfs/super.c | 5 ++++- fs/overlayfs/util.c | 42 +++++++++++++++++++++++++++++++++++------- 6 files changed, 60 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 843ed2a2d7db..7a44533f4bbf 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -459,6 +459,11 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; + /* Mark parent "impure" because it may now contain non-pure upper */ + err = ovl_set_impure(parent, upperdir); + if (err) + return err; + err = vfs_getattr(&parentpath, &pstat, STATX_ATIME | STATX_MTIME, AT_STATX_SYNC_AS_STAT); if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f2a118ba00e4..a63a71656e9b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -149,22 +149,6 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } -static int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) -{ - int err; - - /* - * Do not fail when upper doesn't support xattrs. - * Upper inodes won't have origin nor redirect xattr anyway. 
- */ - err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, - "y", 1, 0); - if (!err) - ovl_dentry_set_impure(dentry); - - return err; -} - /* Common operations required to be done after creation of file on upper */ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) @@ -976,21 +960,16 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (!samedir) { /* * When moving a merge dir or non-dir with copy up origin into - * a non-merge upper dir (a.k.a pure upper dir), we are making - * the target parent dir "impure". ovl_iterate() iterates pure - * upper dirs directly, because there is no need to filter out - * whiteouts and merge dir content with lower dir. But for the - * case of an "impure" upper dir, ovl_iterate() cannot iterate - * the real directory directly, because it looks for the inode - * numbers to fill d_ino in the entries origin inode. + * a new parent, we are marking the new parent dir "impure". + * When ovl_iterate() iterates an "impure" upper dir, it will + * lookup the origin inodes of the entries to fill d_ino. */ - if (ovl_type_origin(old) && !ovl_type_merge(new->d_parent)) { + if (ovl_type_origin(old)) { err = ovl_set_impure(new->d_parent, new_upperdir); if (err) goto out_revert_creds; } - if (!overwrite && ovl_type_origin(new) && - !ovl_type_merge(old->d_parent)) { + if (!overwrite && ovl_type_origin(new)) { err = ovl_set_impure(old->d_parent, old_upperdir); if (err) goto out_revert_creds; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0c72a5909db2..f3136c31e72a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -167,31 +167,11 @@ invalid: goto out; } -static bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) -{ - int res; - char val; - - if (!d_is_dir(dentry)) - return false; - - res = vfs_getxattr(dentry, name, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; -} - static bool ovl_is_opaquedir(struct dentry *dentry) { return ovl_check_dir_xattr(dentry, OVL_XATTR_OPAQUE); } -static bool ovl_is_impuredir(struct dentry *dentry) -{ - return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); -} - static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a9fb958fd5d4..0623cebeefff 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -207,7 +207,6 @@ bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_impure(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); -void ovl_dentry_set_impure(struct dentry *dentry); bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); @@ -221,6 +220,17 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); +int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, + const char *name, const void *value, size_t size, + int xerr); +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); + +static inline bool ovl_is_impuredir(struct dentry *dentry) +{ + return 
ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); +} + /* namei.c */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -281,6 +291,3 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, - const char *name, const void *value, size_t size, - int xerr); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f1647626a882..4882ffb37bae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -974,7 +974,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) path_put(&workpath); kfree(lowertmp); - oe->__upperdentry = upperpath.dentry; + if (upperpath.dentry) { + oe->__upperdentry = upperpath.dentry; + oe->impure = ovl_is_impuredir(upperpath.dentry); + } for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = stack[i].dentry; oe->lowerstack[i].mnt = ufs->lower_mnt[i]; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index e0dfb07d5457..809048913889 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -194,13 +194,6 @@ void ovl_dentry_set_opaque(struct dentry *dentry) oe->opaque = true; } -void ovl_dentry_set_impure(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - oe->impure = true; -} - bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; @@ -311,6 +304,21 @@ void ovl_copy_up_end(struct dentry *dentry) spin_unlock(&ofs->copyup_wq.lock); } +bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, name, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, const char *name, const void *value, size_t size, int xerr) @@ -331,3 +339,23 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, return err; } + +int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) +{ + int err; + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->impure) + return 0; + + /* + * Do not fail when upper doesn't support xattrs. + * Upper inodes won't have origin nor redirect xattr anyway. + */ + err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, + "y", 1, 0); + if (!err) + oe->impure = true; + + return err; +} -- cgit v1.2.3 From a082c6f680da298cf075886ff032f32ccb7c5e1a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 29 May 2017 15:15:27 +0200 Subject: ovl: filter trusted xattr for non-admin Filesystems filter out extended attributes in the "trusted." domain for unprivlieged callers. Overlay calls underlying filesystem's method with elevated privs, so need to do the filtering in overlayfs too. 
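A minimal user-space sketch of the filtering rule (illustration only; the real check, ovl_can_list() in the hunk below, additionally keeps the private trusted.overlay.* names hidden even from the superuser):

  #include <stdio.h>
  #include <string.h>
  #include <stdbool.h>

  #define TRUSTED_PREFIX "trusted."

  /* keep non-"trusted." names for everyone; "trusted." names only for a
   * privileged caller (the capability check is faked with a flag here) */
  static bool can_list(const char *name, bool privileged)
  {
          if (strncmp(name, TRUSTED_PREFIX, strlen(TRUSTED_PREFIX)) != 0)
                  return true;
          return privileged;
  }

  int main(void)
  {
          const char *names[] = { "user.comment", "security.selinux", "trusted.foo" };

          for (int i = 0; i < 3; i++)
                  printf("%-20s %s\n", names[i],
                         can_list(names[i], false) ? "listed" : "hidden");
          return 0;
  }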
Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ad9547f82da5..d613e2c41242 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -240,6 +240,16 @@ int ovl_xattr_get(struct dentry *dentry, const char *name, return res; } +static bool ovl_can_list(const char *s) +{ + /* List all non-trusted xatts */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ + return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); +} + ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); @@ -263,7 +273,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return -EIO; len -= slen; - if (ovl_is_private_xattr(s)) { + if (!ovl_can_list(s)) { res -= slen; memmove(s, s + slen, len); } else { -- cgit v1.2.3 From 67a7d5f561f469ad2fa5154d2888258ab8e6df7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 May 2017 13:24:55 -0400 Subject: ext4: fix fdatasync(2) after extent manipulation operations Currently, extent manipulation operations such as hole punch, range zeroing, or extent shifting do not record the fact that file data has changed and thus fdatasync(2) has a work to do. As a result if we crash e.g. after a punch hole and fdatasync, user can still possibly see the punched out data after journal replay. Test generic/392 fails due to these problems. Fix the problem by properly marking that file data has changed in these operations. CC: stable@vger.kernel.org Fixes: a4bb6b64e39abc0e41ca077725f2a72c868e7622 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +++++ fs/ext4/inode.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 83a513efc824..3e36508610b7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4877,6 +4877,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -5563,6 +5565,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); @@ -5736,6 +5739,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 657a98fa80b6..5cf82d03968c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4218,6 +4218,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: -- cgit v1.2.3 From f511c0b17b081562dca8ac5061dfa86db4c66cc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 30 May 2017 12:38:59 -0700 
Subject: "Yes, people use FOLL_FORCE ;)" This effectively reverts commit 8ee74a91ac30 ("proc: try to remove use of FOLL_FORCE entirely") It turns out that people do depend on FOLL_FORCE for the /proc//mem case, and we're talking not just debuggers. Talking to the affected people, the use-cases are: Keno Fischer: "We used these semantics as a hardening mechanism in the julia JIT. By opening /proc/self/mem and using these semantics, we could avoid needing RWX pages, or a dual mapping approach. We do have fallbacks to these other methods (though getting EIO here actually causes an assert in released versions - we'll updated that to make sure to take the fall back in that case). Nevertheless the /proc/self/mem approach was our favored approach because it a) Required an attacker to be able to execute syscalls which is a taller order than getting memory write and b) didn't double the virtual address space requirements (as a dual mapping approach would). I think in general this feature is very useful for anybody who needs to precisely control the execution of some other process. Various debuggers (gdb/lldb/rr) certainly fall into that category, but there's another class of such processes (wine, various emulators) which may want to do that kind of thing. Now, I suspect most of these will have the other process under ptrace control, so maybe allowing (same_mm || ptraced) would be ok, but at least for the sandbox/remote-jit use case, it would be perfectly reasonable to not have the jit server be a ptracer" Robert O'Callahan: "We write to readonly code and data mappings via /proc/.../mem in lots of different situations, particularly when we're adjusting program state during replay to match the recorded execution. Like Julia, we can add workarounds, but they could be expensive." so not only do people use FOLL_FORCE for both reads and writes, but they use it for both the local mm and remote mm. With these comments in mind, we likely also cannot add the "are we actively ptracing" check either, so this keeps the new code organization and does not do a real revert that would add back the original comment about "Maybe we should limit FOLL_FORCE to actual ptrace users?" Reported-by: Keno Fischer Reported-by: Robert O'Callahan Cc: Kees Cook Cc: Andy Lutomirski Cc: Eric Biederman Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 45f6bf68fff3..f1e1927ccd48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -821,7 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - flags = write ? FOLL_WRITE : 0; + flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); -- cgit v1.2.3 From 63db7c815bc0997c29e484d2409684fdd9fcd93b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 31 May 2017 08:22:52 -0700 Subject: xfs: use ->b_state to fix buffer I/O accounting release race We've had user reports of unmount hangs in xfs_wait_buftarg() that analysis shows is due to btp->bt_io_count == -1. bt_io_count represents the count of in-flight asynchronous buffers and thus should always be >= 0. xfs_wait_buftarg() waits for this value to stabilize to zero in order to ensure that all untracked (with respect to the lru) buffers have completed I/O processing before unmount proceeds to tear down in-core data structures. The value of -1 implies an I/O accounting decrement race. 
Indeed, the fact that xfs_buf_ioacct_dec() is called from xfs_buf_rele() (where the buffer lock is no longer held) means that bp->b_flags can be updated from an unsafe context. While a user-level reproducer is currently not available, some intrusive hacks to run racing buffer lookups/ioacct/releases from multiple threads was used to successfully manufacture this problem. Existing callers do not expect to acquire the buffer lock from xfs_buf_rele(). Therefore, we can not safely update ->b_flags from this context. It turns out that we already have separate buffer state bits and associated serialization for dealing with buffer LRU state in the form of ->b_state and ->b_lock. Therefore, replace the _XBF_IN_FLIGHT flag with a ->b_state variant, update the I/O accounting wrappers appropriately and make sure they are used with the correct locking. This ensures that buffer in-flight state can be modified at buffer release time without racing with modifications from a buffer lock holder. Fixes: 9c7504aa72b6 ("xfs: track and serialize in-flight async buffers against unmount") Cc: # v4.8+ Signed-off-by: Brian Foster Reviewed-by: Nikolay Borisov Tested-by: Libor Pechacek Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 38 ++++++++++++++++++++++++++------------ fs/xfs/xfs_buf.h | 5 ++--- 2 files changed, 28 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 62fa39276a24..07b77b73b024 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -97,12 +97,16 @@ static inline void xfs_buf_ioacct_inc( struct xfs_buf *bp) { - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + if (bp->b_flags & XBF_NO_IOACCT) return; ASSERT(bp->b_flags & XBF_ASYNC); - bp->b_flags |= _XBF_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); + spin_lock(&bp->b_lock); + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { + bp->b_state |= XFS_BSTATE_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); + } + spin_unlock(&bp->b_lock); } /* @@ -110,14 +114,24 @@ xfs_buf_ioacct_inc( * freed and unaccount from the buftarg. */ static inline void -xfs_buf_ioacct_dec( +__xfs_buf_ioacct_dec( struct xfs_buf *bp) { - if (!(bp->b_flags & _XBF_IN_FLIGHT)) - return; + ASSERT(spin_is_locked(&bp->b_lock)); - bp->b_flags &= ~_XBF_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); + } +} + +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + spin_unlock(&bp->b_lock); } /* @@ -149,9 +163,9 @@ xfs_buf_stale( * unaccounted (released to LRU) before that occurs. Drop in-flight * status now to preserve accounting consistency. */ - xfs_buf_ioacct_dec(bp); - spin_lock(&bp->b_lock); + __xfs_buf_ioacct_dec(bp); + atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) @@ -979,12 +993,12 @@ xfs_buf_rele( * ensures the decrement occurs only once per-buf. */ if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); goto out_unlock; } /* the last reference has been dropped ... 
*/ - xfs_buf_ioacct_dec(bp); + __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU take a new reference to the diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8d1d44f87ce9..1508121f29f2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,7 +63,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -84,14 +83,14 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_IN_FLIGHT, "IN_FLIGHT" } + { _XBF_COMPOUND, "COMPOUND" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ /* * The xfs_buftarg contains 2 notions of "sector size" - -- cgit v1.2.3
From cc2b702c52094b637a351d7491ac5200331d0445 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 12 May 2017 01:03:52 +0200 Subject: btrfs: use correct types for page indices in btrfs_page_exists_in_range
Variables start_idx and end_idx are supposed to hold a page index derived from the file offsets. The int type is not the right one though, offsets larger than 1 << 44 will get silently trimmed off the high bits. (1 << 44 is 16TiB)
What can go wrong, if start is below the boundary and end gets trimmed: - if there's a page after start, we'll find it (radix_tree_gang_lookup_slot) - the final check "if (page->index <= end_idx)" will unexpectedly fail
The function will return false, i.e. "there's no page in the range", although there is at least one.
btrfs_page_exists_in_range is used to prevent races in: * hole punching, where we make sure there are no pages in the truncated range, otherwise we'll wait for them to finish and redo truncation, but we're going to replace the pages with holes anyway so the only problem is the intermediate state * lock_extent_direct: we want to make sure there are no pages before we lock and start DIO, to prevent stale data reads
For a practical occurrence of the bug, there are several constraints. The file must be quite large, the affected range must cross the 16TiB boundary, and the internal state of the file pages and pending operations must match. Also, we must not have started any ordered data in the range, otherwise we don't even reach the buggy function check. DIO locking tries hard in several places to avoid deadlocks with buffered IO and avoids waiting for ranges. The worst consequence seems to be a stale data read.
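The truncation is easy to reproduce in user space (illustration only; assumes an LP64 host where int is 32 bits and a PAGE_SHIFT of 12):

  #include <stdio.h>

  #define PAGE_SHIFT 12

  int main(void)
  {
          unsigned long long end = (1ULL << 44) + (1ULL << 20);  /* just past 16TiB */

          /* out-of-range conversion to int; common compilers keep only the
           * low 32 bits, here 256 */
          int old_end_idx = end >> PAGE_SHIFT;
          unsigned long new_end_idx = end >> PAGE_SHIFT;  /* 4294967552, fits */

          printf("int: %d, unsigned long: %lu\n", old_end_idx, new_end_idx);
          return 0;
  }

With end_idx truncated to 256, the "page->index <= end_idx" check fails for every page above offset 1MiB even though the range really extends past 16TiB.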
CC: Liu Bo CC: stable@vger.kernel.org # 3.16+ Fixes: fc4adbff823f7 ("btrfs: Drop EXTENT_UPTODATE check in hole punching and direct locking") Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17cbe9306faf..3fc3e10594a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7483,8 +7483,8 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_SHIFT; -- cgit v1.2.3 From 896533a7da929136d0432713f02a3edffece2826 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 May 2017 09:49:37 -0400 Subject: btrfs: fix memory leak in update_space_info failure path If we fail to add the space_info kobject, we'll leak the memory for the percpu counter. Fixes: 6ab0a2029c (btrfs: publish allocation data in sysfs) Cc: # v3.14+ Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e390451c72e6..27ca5b81ed22 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3993,6 +3993,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, info->space_info_kobj, "%s", alloc_name(found->flags)); if (ret) { + percpu_counter_destroy(&found->total_bytes_pinned); kfree(found); return ret; } -- cgit v1.2.3 From a9b3311ef36b670909ea4443f306c8318082c8f0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 May 2017 11:38:34 -0400 Subject: btrfs: fix race with relocation recovery and fs_root setup If we have to recover relocation during mount, we'll ultimately have to evict the orphan inode. That goes through the reservation dance, where priority_reclaim_metadata_space and flush_space expect fs_info->fs_root to be valid. That's the next thing to be set up during mount, so we crash, almost always in flush_space trying to join the transaction but priority_reclaim_metadata_space is possible as well. This call path has been problematic in the past WRT whether ->fs_root is valid yet. Commit 957780eb278 (Btrfs: introduce ticketed enospc infrastructure) added new users that are called in the direct path instead of the async path that had already been worked around. The thing is that we don't actually need the fs_root, specifically, for anything. We either use it to determine whether the root is the chunk_root for use in choosing an allocation profile or as a root to pass btrfs_join_transaction before immediately committing it. Anything that isn't the chunk root works in the former case and any root works in the latter. A simple fix is to use a root we know will always be there: the extent_root. 
Cc: # v4.8+ Fixes: 957780eb278 (Btrfs: introduce ticketed enospc infrastructure) Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 27ca5b81ed22..33d979e9ea2a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4845,7 +4845,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_rsv->lock); commit: - trans = btrfs_join_transaction(fs_info->fs_root); + trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return -ENOSPC; @@ -4863,7 +4863,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, u64 orig_bytes, int state) { - struct btrfs_root *root = fs_info->fs_root; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; int nr; int ret = 0; @@ -5063,7 +5063,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state = FLUSH_DELAYED_ITEMS_NR; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root, space_info); if (!to_reclaim) { spin_unlock(&space_info->lock); -- cgit v1.2.3 From e2093926a098a8ccf0f1d10f6df8dad452cb28d3 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 2 Jun 2017 14:46:37 -0700 Subject: dax: fix race between colliding PMD & PTE entries We currently have two related PMD vs PTE races in the DAX code. These can both be easily triggered by having two threads reading and writing simultaneously to the same private mapping, with the key being that private mapping reads can be handled with PMDs but private mapping writes are always handled with PTEs so that we can COW. Here is the first race: CPU 0 CPU 1 (private mapping write) __handle_mm_fault() create_huge_pmd() - FALLBACK handle_pte_fault() passes check for pmd_devmap() (private mapping read) __handle_mm_fault() create_huge_pmd() dax_iomap_pmd_fault() inserts PMD dax_iomap_pte_fault() does a PTE fault, but we already have a DAX PMD installed in our page tables at this spot. Here's the second race: CPU 0 CPU 1 (private mapping read) __handle_mm_fault() passes check for pmd_none() create_huge_pmd() dax_iomap_pmd_fault() inserts PMD (private mapping write) __handle_mm_fault() create_huge_pmd() - FALLBACK (private mapping read) __handle_mm_fault() passes check for pmd_none() create_huge_pmd() handle_pte_fault() dax_iomap_pte_fault() inserts PTE dax_iomap_pmd_fault() inserts PMD, but we already have a PTE at this spot. The core of the issue is that while there is isolation between faults to the same range in the DAX fault handlers via our DAX entry locking, there is no isolation between faults in the code in mm/memory.c. This means for instance that this code in __handle_mm_fault() can run: if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); But by the time we actually get to run the fault handler called by create_huge_pmd(), the PMD is no longer pmd_none() because a racing PTE fault has installed a normal PMD here as a parent. This is the cause of the 2nd race. 
The first race is similar - there is the following check in handle_pte_fault(): } else { /* See comment in pte_alloc_one_map() */ if (pmd_devmap(*vmf->pmd) || pmd_trans_unstable(vmf->pmd)) return 0; So if a pmd_devmap() PMD (a DAX PMD) has been installed at vmf->pmd, we will bail and retry the fault. This is correct, but there is nothing preventing the PMD from being installed after this check but before we actually get to the DAX PTE fault handlers. In my testing these races result in the following types of errors: BUG: Bad rss-counter state mm:ffff8800a817d280 idx:1 val:1 BUG: non-zero nr_ptes on freeing mm: 15 Fix this issue by having the DAX fault handlers verify that it is safe to continue their fault after they have taken an entry lock to block other racing faults. [ross.zwisler@linux.intel.com: improve fix for colliding PMD & PTE entries] Link: http://lkml.kernel.org/r/20170526195932.32178-1-ross.zwisler@linux.intel.com Link: http://lkml.kernel.org/r/20170522215749.23516-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Pawel Lebioda Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Hansen Cc: Matthew Wilcox Cc: "Kirill A . Shutemov" Cc: Pawel Lebioda Cc: Dave Jiang Cc: Xiong Zhou Cc: Eryu Guan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index c22eaf162f95..2a6889b3585f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1154,6 +1154,17 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PMD fault that overlaps with + * the PTE we need to set up. If so just return and the fault will be + * retried. + */ + if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + vmf_ret = VM_FAULT_NOPAGE; + goto unlock_entry; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means @@ -1397,6 +1408,18 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if (IS_ERR(entry)) goto fallback; + /* + * It is possible, particularly with mixed reads & writes to private + * mappings, that we have raced with a PTE fault that overlaps with + * the PMD we need to set up. If so just return and the fault will be + * retried. + */ + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && + !pmd_devmap(*vmf->pmd)) { + result = 0; + goto unlock_entry; + } + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way -- cgit v1.2.3 From 4f253e1eb628f5adf7ca4f43aab4bbb1bfffa081 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 16 May 2017 12:18:11 +0200 Subject: nfs: Mark unnecessarily extern functions as static nfs_initialise_sb() and nfs_clone_super() are declared as extern even though they are used only in fs/nfs/super.c. Mark them as static. Also remove explicit 'inline' directive from nfs_initialise_sb() and leave it upto compiler to decide whether inlining is worth it. 
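For illustration, a toy example of the linkage change (made-up names, not NFS code): a helper used by a single .c file can be defined static in that file and dropped from the shared header, and without an explicit 'inline' the compiler is still free to inline it:

  #include <stdio.h>

  static void fill_defaults(int *value)   /* previously: extern-declared in a header */
  {
          *value = 42;
  }

  int main(void)
  {
          int v;

          fill_defaults(&v);
          printf("%d\n", v);
          return 0;
  }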
Signed-off-by: Jan Kara Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 -- fs/nfs/super.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e9b4c3320e37..3e24392f2caa 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -398,7 +398,6 @@ extern struct file_system_type nfs4_referral_fs_type; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); -void nfs_initialise_sb(struct super_block *); int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, @@ -458,7 +457,6 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -void nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f3822a4a7d5..eceb4eabb064 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2301,7 +2301,7 @@ EXPORT_SYMBOL_GPL(nfs_remount); /* * Initialise the common bits of the superblock */ -inline void nfs_initialise_sb(struct super_block *sb) +static void nfs_initialise_sb(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -2348,7 +2348,8 @@ EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +static void nfs_clone_super(struct super_block *sb, + struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); -- cgit v1.2.3 From 239e250e4acbc0104d514307029c0839e834a51a Mon Sep 17 00:00:00 2001 From: Richard Narron Date: Sun, 4 Jun 2017 16:23:18 -0700 Subject: fs/ufs: Set UFS default maximum bytes per file This fixes a problem with reading files larger than 2GB from a UFS-2 file system: https://bugzilla.kernel.org/show_bug.cgi?id=195721 The incorrect UFS s_maxsize limit became a problem as of commit c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") which started using s_maxbytes to avoid a page index overflow in do_generic_file_read(). That caused files to be truncated on UFS-2 file systems because the default maximum file size is 2GB (MAX_NON_LFS) and UFS didn't update it. Here I simply increase the default to a common value used by other file systems. Signed-off-by: Richard Narron Cc: Al Viro Cc: Will B Cc: Theodore Ts'o Cc: # v4.9 and backports of c2a9737f45e2 Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 131b2b77c818..29ecaf739449 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -812,9 +812,8 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; - /* Keep 2Gig file limit. 
Some UFS variants need to override - this but as I don't know which I'll let those in the know loosen - the rules */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); -- cgit v1.2.3 From 286b92f43c0d0ae2c29a61769b66219fe5ae6701 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Wed, 24 May 2017 09:31:32 +0800 Subject: btrfs: tree-log.c: Wrong printk information about namelen In verify_dir_item, it wants to printk name_len of dir_item but printk data_len acutally. Fix it by calling btrfs_dir_name_len instead of btrfs_dir_data_len. Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/dir-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 60a750678a82..c24d615e3d7f 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_fs_info *fs_info, if (btrfs_dir_name_len(leaf, dir_item) > namelen) { btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); + (unsigned)btrfs_dir_name_len(leaf, dir_item)); return 1; } -- cgit v1.2.3 From 452e62b71fbbefe2646fad3a968371a026936c6d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 26 May 2017 17:44:23 -0600 Subject: Btrfs: clear EXTENT_DEFRAG bits in finish_ordered_io Before this, we use 'filled' mode here, ie. if all range has been filled with EXTENT_DEFRAG bits, get to clear it, but if the defrag range joins the adjacent delalloc range, then we'll have EXTENT_DEFRAG bits in extent_state until releasing this inode's pages, and that prevents extent_data from being freed. This clears the bit if any was found within the ordered extent. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3fc3e10594a4..ef3c98c527c1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2952,7 +2952,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 1, cached_state); + EXTENT_DEFRAG, 0, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); if (0 && last_snapshot >= BTRFS_I(inode)->generation) -- cgit v1.2.3 From 70e7af244f24c94604ef6eca32ad297632018583 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 2 Jun 2017 01:20:01 -0700 Subject: Btrfs: fix delalloc accounting leak caused by u32 overflow btrfs_calc_trans_metadata_size() does an unsigned 32-bit multiplication, which can overflow if num_items >= 4 GB / (nodesize * BTRFS_MAX_LEVEL * 2). For a nodesize of 16kB, this overflow happens at 16k items. Usually, num_items is a small constant passed to btrfs_start_transaction(), but we also use btrfs_calc_trans_metadata_size() for metadata reservations for extent items in btrfs_delalloc_{reserve,release}_metadata(). In drop_outstanding_extents(), num_items is calculated as inode->reserved_extents - inode->outstanding_extents. 
The difference between these two counters is usually small, but if many delalloc extents are reserved and then the outstanding extents are merged in btrfs_merge_extent_hook(), the difference can become large enough to overflow in btrfs_calc_trans_metadata_size(). The overflow manifests itself as a leak of a multiple of 4 GB in delalloc_block_rsv and the metadata bytes_may_use counter. This in turn can cause early ENOSPC errors. Additionally, these WARN_ONs in extent-tree.c will be hit when unmounting: WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0); Fix it by casting nodesize to a u64 so that btrfs_calc_trans_metadata_size() does a full 64-bit multiplication. While we're here, do the same in btrfs_calc_trunc_metadata_size(); this can't overflow with any existing uses, but it's better to be safe here than have another hard-to-debug problem later on. Cc: stable@vger.kernel.org Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e82516fe2d8..43a8c42cdf07 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2564,7 +2564,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* @@ -2574,7 +2574,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 8785d84d002c2ce0f68fbcd6c2c86be859802c7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 02:42:03 -0400 Subject: ufs: restore proper tail allocation Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7e41aee7b69a..feb5f280db09 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -284,7 +284,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; -- cgit v1.2.3 From 414cf7186dbec29bd946c138d6b5c09da5955a08 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 18:15:18 -0400 Subject: fix ufs_isblockset() Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/util.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.h b/fs/ufs/util.h index b7fbf53dbc81..398019fb1448 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -473,15 +473,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - 
return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } -- cgit v1.2.3 From eb315d2ae614493fd1ebb026c75a80573d84f7ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 21:15:03 -0400 Subject: ufs: restore maintaining ->i_blocks Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/stat.c | 1 + fs/ufs/balloc.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/stat.c b/fs/stat.c index f494b182c7c7..c35610845ab1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -672,6 +672,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a0376a2c1c29..d642cc0a8271 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -82,7 +82,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -184,6 +185,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -494,6 +496,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +546,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +666,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +677,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +738,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if 
(!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) -- cgit v1.2.3 From 6b0d144fa758869bdd652c50aa41aaf601232550 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 21:15:45 -0400 Subject: ufs: set correct ->s_maxsize Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 131b2b77c818..d9aa2627c9df 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -746,6 +746,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -1212,6 +1229,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); -- cgit v1.2.3 From 940ef1a0ed939c2ca029fca715e25e7778ce1e34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 23:27:12 -0400 Subject: ufs_extend_tail(): fix the braino in calling conventions of ufs_new_fragments() ... and it really needs splitting into "new" and "extend" cases, but that's for later Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index feb5f280db09..966cced0f88e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } -- cgit v1.2.3 From 006351ac8ead0d4a67dd3845e3ceffe650a23212 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Jun 2017 23:28:53 -0400 Subject: ufs_getfrag_block(): we only grab ->truncate_mutex on block creation path Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 966cced0f88e..9bf10285c628 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -403,7 +403,9 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); - goto out; + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } /* This code entered only while writing ....? */ -- cgit v1.2.3 From babef37dccbaa49249a22bae9150686815d7be71 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 9 Jun 2017 16:20:34 -0400 Subject: excessive checks in ufs_write_failed() and ufs_evict_inode() As it is, short copy in write() to append-only file will fail to truncate the excessive allocated blocks. As the matter of fact, all checks in ufs_truncate_blocks() are either redundant or wrong for that caller. 
As for the only other caller (ufs_evict_inode()), we only need the file type checks there. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ufs/inode.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9bf10285c628..34f11cf0900a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -844,7 +844,9 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { inode->i_size = 0; - if (inode->i_blocks) + if (inode->i_blocks && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) ufs_truncate_blocks(inode); } @@ -1103,7 +1105,7 @@ out: return err; } -static void __ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -1186,7 +1188,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); - __ufs_truncate_blocks(inode); + ufs_truncate_blocks(inode); inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: @@ -1194,16 +1196,6 @@ out: return err; } -static void ufs_truncate_blocks(struct inode *inode) -{ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - __ufs_truncate_blocks(inode); -} - int ufs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); -- cgit v1.2.3 From 67a70017fa0a152657bc7e337e69bb9c9f5549bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 Jun 2017 12:01:50 -0400 Subject: ufs: we need to sync inode before freeing it Signed-off-by: Al Viro --- fs/ufs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 34f11cf0900a..da553ffec85b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -848,6 +848,7 @@ void ufs_evict_inode(struct inode * inode) (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) ufs_truncate_blocks(inode); + ufs_update_inode(inode, inode_needs_sync(inode)); } invalidate_inode_buffers(inode); -- cgit v1.2.3
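For reference, the mode test kept in ufs_evict_inode() above is the standard S_IS*() idiom, restricting block truncation to the inode types that can actually own data blocks; device nodes, FIFOs and sockets allocate none. A small standalone sketch using the userspace <sys/stat.h> macros (an illustration, not the kernel code):

    /* Userspace illustration of the S_IS*() mode test; the kernel applies
     * the same macros to inode->i_mode. */
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/stat.h>

    static int may_truncate_blocks(mode_t mode)
    {
        return S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode);
    }

    int main(void)
    {
        printf("regular file : %d\n", may_truncate_blocks(S_IFREG));
        printf("directory    : %d\n", may_truncate_blocks(S_IFDIR));
        printf("char device  : %d\n", may_truncate_blocks(S_IFCHR));
        printf("fifo         : %d\n", may_truncate_blocks(S_IFIFO));
        return 0;
    }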