From d98456fcafa6f3fd1985f9b7429aaa3531c6bfa0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Jan 2012 20:27:41 -0500 Subject: Btrfs: don't reserve data with extents locked in btrfs_fallocate btrfs_fallocate tries to allocate space only if ranges in the file don't already exist. But the enospc checks it does are not allowed with extents locked. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f61e11a2998..0621a3a7d5d1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,6 +1603,14 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, len); + if (ret) + return ret; + /* * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. @@ -1666,27 +1674,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - - /* - * Make sure we have enough space before we do the - * allocation. - */ - ret = btrfs_check_data_free_space(inode, last_byte - - cur_offset); - if (ret) { - free_extent_map(em); - break; - } - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); - /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, last_byte - - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1714,6 +1707,8 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, len); return ret; } -- cgit v1.2.3 From 331818f1c468a24e581aedcbe52af799366a9dfe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Feb 2012 18:30:53 -0500 Subject: NFSv4: Fix an Oops in the NFSv4 getacl code Commit bf118a342f10dafe44b14451a1392c3254629a1f (NFSv4: include bitmap in nfsv4 get acl data) introduces the 'acl_scratch' page for the case where we may need to decode multi-page data. However it fails to take into account the fact that the variable may be NULL (for the case where we're not doing multi-page decode), and it also attaches it to the encoding xdr_stream rather than the decoding one. The immediate result is an Oops in nfs4_xdr_enc_getacl due to the call to page_address() with a NULL page pointer. Signed-off-by: Trond Myklebust Cc: Andy Adamson Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 8 ++++---- fs/nfs/nfs4xdr.c | 5 ++++- include/linux/nfs_xdr.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f0c849c98fe4..d202e04aca94 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3575,8 +3575,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu } if (npages > 1) { /* for decoding across pages */ - args.acl_scratch = alloc_page(GFP_KERNEL); - if (!args.acl_scratch) + res.acl_scratch = alloc_page(GFP_KERNEL); + if (!res.acl_scratch) goto out_free; } args.acl_len = npages * PAGE_SIZE; @@ -3612,8 +3612,8 @@ out_free: for (i = 0; i < npages; i++) if (pages[i]) __free_page(pages[i]); - if (args.acl_scratch) - __free_page(args.acl_scratch); + if (res.acl_scratch) + __free_page(res.acl_scratch); return ret; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 95e92e438407..33bd8d0f745d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2522,7 +2522,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); - xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE); encode_nops(&hdr); } @@ -6032,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; + if (res->acl_scratch != NULL) { + void *p = page_address(res->acl_scratch); + xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); + } status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a764cef06b73..d6ba9a12591e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -614,7 +614,6 @@ struct nfs_getaclargs { size_t acl_len; unsigned int acl_pgbase; struct page ** acl_pages; - struct page * acl_scratch; struct nfs4_sequence_args seq_args; }; @@ -624,6 +623,7 @@ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; int acl_flags; + struct page * acl_scratch; struct nfs4_sequence_res seq_res; }; -- cgit v1.2.3 From b9f9a03150969e4bd9967c20bce67c4de769058f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 9 Feb 2012 15:31:36 -0500 Subject: NFSv4: Ensure we throw out bad delegation stateids on NFS4ERR_BAD_STATEID To ensure that we don't just reuse the bad delegation when we attempt to recover the nfs4_state that received the bad stateid error. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a53f33b4ac3a..45392032e7bd 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1132,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 { struct nfs_client *clp = server->nfs_client; + if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags)) + nfs_async_inode_return_delegation(state->inode, &state->stateid); nfs4_state_mark_reclaim_nograce(clp, state); nfs4_schedule_state_manager(clp); } -- cgit v1.2.3 From 05293485a0b6b1f803e8a3c0ff188c38f6969985 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 13 Feb 2012 20:51:05 +0000 Subject: XFS: xfs_trans_add_item() - don't assign in ASSERT() when compare is intended It looks to me like the two ASSERT()s in xfs_trans_add_item() really want to do a compare (==) rather than assignment (=). This patch changes it from the latter to the former. Signed-off-by: Jesper Juhl Signed-off-by: Ben Myers --- fs/xfs/xfs_trans.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 329b06aba1c2..7adcdf15ae0c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1151,8 +1151,8 @@ xfs_trans_add_item( { struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp = tp->t_mountp); - ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); -- cgit v1.2.3 From e188dc02d3a9c911be56eca5aa114fe7e9822d53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 3 Feb 2012 14:25:18 +0100 Subject: vfs: fix d_inode_lookup() dentry ref leak d_inode_lookup() leaks a dentry reference on IS_DEADDIR(). Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 208c6aa4a989..a780ea515c47 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1095,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) + if (unlikely(IS_DEADDIR(inode))) { + dput(dentry); return ERR_PTR(-ENOENT); + } old = inode->i_op->lookup(inode, dentry, nd); if (unlikely(old)) { -- cgit v1.2.3 From 1d6f2097865e64963e90cce04980dce2f9fc023f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 22 Aug 2011 11:52:28 +0800 Subject: autofs4 - fix lockdep splat in autofs When recursing down the locks when traversing a tree/list in get_next_positive_dentry() or get_next_positive_subdir() a lock can change from being nested to being a parent which breaks lockdep. This patch tells lockdep about what we did. Signed-off-by: Steven Rostedt Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/autofs4/expire.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 450f529a4eae..1feb68ecef95 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -124,6 +124,7 @@ start: /* Negative dentry - try next */ if (!simple_positive(q)) { spin_unlock(&p->d_lock); + lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); p = q; goto again; } @@ -186,6 +187,7 @@ again: /* Negative dentry - try next */ if (!simple_positive(ret)) { spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); p = ret; goto again; } -- cgit v1.2.3 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc02..fe19ac13f75f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad7..d3ebdbe723d0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deaccb..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e96..22ef5f9fd2ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3 From 6b6dc836a195e077e76977b6c020a73de411b46d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Feb 2012 11:03:00 +0100 Subject: vfs: Provide function to get superblock and wait for it to thaw In quota code we need to find a superblock corresponding to a device and wait for superblock to be unfrozen. However this waiting has to happen without s_umount semaphore because that is required for superblock to thaw. So provide a function in VFS for this to keep dances with s_umount where they belong. [AV: implementation switched to saner variant] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 22 ++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 6015c02296b7..6277ec6cb60a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -633,6 +633,28 @@ rescan: EXPORT_SYMBOL(get_super); +/** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + while (1) { + struct super_block *s = get_super(bdev); + if (!s || s->s_frozen == SB_UNFROZEN) + return s; + up_read(&s->s_umount); + vfs_check_frozen(s, SB_FREEZE_WRITE); + put_super(s); + } +} +EXPORT_SYMBOL(get_super_thawed); + /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for diff --git a/include/linux/fs.h b/include/linux/fs.h index 386da09f229d..69cd5bb640f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2496,6 +2496,7 @@ extern void get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); +extern struct super_block *get_super_thawed(struct block_device *); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From dcdbed853d9fbb0547b781ba676049b87f54129a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Feb 2012 11:03:01 +0100 Subject: quota: Fix deadlock with suspend and quotas This script causes a kernel deadlock: set -e DEVICE=/dev/vg1/linear lvchange -ay $DEVICE mkfs.ext3 $DEVICE mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test quotacheck -gu /mnt/test umount /mnt/test mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test quotaon /mnt/test dmsetup suspend $DEVICE setquota -u root 1 2 3 4 /mnt/test & sleep 1 dmsetup resume $DEVICE setquota acquired semaphore s_umount for read and then tried to perform a transaction (and waits because the device is suspended). dmsetup resume tries to acquire s_umount for write before resuming the device (and waits for setquota). Fix the deadlock by grabbing a thawed superblock for quota commands which need it. Reported-by: Mikulas Patocka Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/quota/quota.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 7898cd688a00..fc2c4388d126 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -292,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, } } +/* Return 1 if 'cmd' will block on frozen filesystem */ +static int quotactl_cmd_write(int cmd) +{ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_SYNC: + case Q_XGETQSTAT: + case Q_XGETQUOTA: + case Q_XQUOTASYNC: + return 0; + } + return 1; +} + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon */ -static struct super_block *quotactl_block(const char __user *special) +static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK struct block_device *bdev; @@ -309,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - sb = get_super(bdev); + if (quotactl_cmd_write(cmd)) + sb = get_super_thawed(bdev); + else + sb = get_super(bdev); bdput(bdev); if (!sb) return ERR_PTR(-ENODEV); @@ -361,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, pathp = &path; } - sb = quotactl_block(special); + sb = quotactl_block(special, cmds); if (IS_ERR(sb)) { ret = PTR_ERR(sb); goto out; -- cgit v1.2.3 From fcf83067bf6eb101a35620d752bd559d473cfbaa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Feb 2012 20:56:29 -0500 Subject: vfs: fix compat_sys_stat() handling of overflows in st_nlink Massaged cp_compat_stat() into form closer to cp_new_stat(); the only real issue had been in handling of st_nlink overflows - native 32bit stat(2) returns -EOVERFLOW in such situations, compat one silently loses upper bits. Signed-off-by: Al Viro --- fs/compat.c | 56 +++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index fa9d721ecfee..07880bae28a9 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { - compat_ino_t ino = stat->ino; - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - int err; + struct compat_stat tmp; - SET_UID(uid, stat->uid); - SET_GID(gid, stat->gid); + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; - if ((u64) stat->size > MAX_NON_LFS || - !old_valid_dev(stat->dev) || - !old_valid_dev(stat->rdev)) + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - - if (clear_user(ubuf, sizeof(*ubuf))) - return -EFAULT; - - err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); - err |= __put_user(ino, &ubuf->st_ino); - err |= __put_user(stat->mode, &ubuf->st_mode); - err |= __put_user(stat->nlink, &ubuf->st_nlink); - err |= __put_user(uid, &ubuf->st_uid); - err |= __put_user(gid, &ubuf->st_gid); - err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); - err |= __put_user(stat->size, &ubuf->st_size); - err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &ubuf->st_blksize); - err |= __put_user(stat->blocks, &ubuf->st_blocks); - return err; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } asmlinkage long compat_sys_newstat(const char __user * filename, -- cgit v1.2.3 From 847c9db5cb50841589b8ebd3da0769b1b02fb3b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Feb 2012 21:00:05 -0500 Subject: ocfs2: deal with wraparounds of i_nlink in ocfs2_rename() unfortunately, nlink_t may be smaller than 32 bits and ->i_nlink on ocfs2 can grow up to 0xffffffff; storing it in nlink_t variable will lose upper bits on such architectures. Needs to be made u32, until we get kernel-side nlink_t uniformly 32bit... Signed-off-by: Al Viro --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be244692550d..a9856e3eaaf0 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir, handle_t *handle = NULL; struct buffer_head *old_dir_bh = NULL; struct buffer_head *new_dir_bh = NULL; - nlink_t old_dir_nlink = old_dir->i_nlink; + u32 old_dir_nlink = old_dir->i_nlink; struct ocfs2_dinode *old_di; struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; -- cgit v1.2.3 From 941b2ddf71987ef369389517a7e215dd505fe01e Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 29 Nov 2011 17:44:12 -0800 Subject: btrfs: Sector Size check during Mount Gracefully fail when trying to mount a BTRFS file system that has a sectorsize smaller than PAGE_SIZE. On PPC it is possible to build a FS while using a 4k PAGE_SIZE kernel then boot into a 64K PAGE_SIZE kernel. Presently open_ctree fails in an endless loop and hangs the machine in this situation. My debugging has show this Sector size < Page size to be a non trivial situation and a graceful exit from the situation would be nice for the time being. Signed-off-by: Keith Mannthey --- fs/btrfs/disk-io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c867112b4c8..58d0678dfcba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2258,6 +2258,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_sb_buffer; } + if (sectorsize < PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size " + "found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); -- cgit v1.2.3 From 8f24b49688281a77e8331894ed407f0cfe732303 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 8 Feb 2012 16:01:01 +0100 Subject: Btrfs: avoid positive number with ERR_PTR inode_ref_info() returns 1 when the element wasn't found and < 0 on error, just like btrfs_search_slot(). In iref_to_path() it's an error when the inode ref can't be found, thus we return ERR_PTR(ret) in that case. In order to avoid ERR_PTR(1), we now set ret to -ENOENT in that case. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 633c701a287d..98f6bf10bbd4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -892,6 +892,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, if (eb != eb_in) free_extent_buffer(eb); ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; if (ret) break; next_inum = found_key.offset; -- cgit v1.2.3 From 6af021d8fc3bcce790e7fbb391e39c5920fa3f71 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 9 Feb 2012 14:25:50 +0800 Subject: Btrfs: return the internal error unchanged if btrfs_get_extent_fiemap() call failed for SEEK_DATA/SEEK_HOLE inquiry Given that ENXIO only means "offset beyond EOF" for either SEEK_DATA or SEEK_HOLE inquiry in a desired file range, so we should return the internal error unchanged if btrfs_get_extent_fiemap() call failed, rather than ENXIO. Cc: Dave Chinner Signed-off-by: Jie Liu --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0621a3a7d5d1..3a520899b024 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1755,7 +1755,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) start - root->sectorsize, root->sectorsize, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); goto out; } last_end = em->start + em->len; @@ -1767,7 +1767,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) while (1) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); break; } -- cgit v1.2.3 From 2cac13e41bf5b99ffc426bd28dfd2248df1dfa67 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 9 Feb 2012 18:17:41 +0800 Subject: Btrfs: fix trim 0 bytes after a device delete A user reported a bug of btrfs's trim, that is we will trim 0 bytes after a device delete. The reproducer: $ mkfs.btrfs disk1 $ mkfs.btrfs disk2 $ mount disk1 /mnt $ fstrim -v /mnt $ btrfs device add disk2 /mnt $ btrfs device del disk1 /mnt $ fstrim -v /mnt This is because after we delete the device, the block group may start from a non-zero place, which will confuse trim to discard nothing. Reported-by: Lutz Euler Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 283af7a676a3..60bfe2d68547 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7886,9 +7886,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - cache = btrfs_lookup_block_group(fs_info, range->start); + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { -- cgit v1.2.3 From 859acaf1a29bbacf6256f1159210c8d6df992b33 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 9 Feb 2012 15:09:02 +0100 Subject: btrfs: don't check DUP chunks twice Because scrub enumerates the dev extent tree to find the chunks to scrub, it currently finds each DUP chunk twice and also scrubs it twice. This patch makes sure that scrub_chunk only checks that part of the chunk the dev extent has been found for. This only changes the behaviour for DUP chunks. Reported-and-tested-by: Stefan Behrens Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9770cc5bfb76..abc0fbffa510 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1367,7 +1367,8 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = &sdev->dev->dev_root->fs_info->mapping_tree; @@ -1391,7 +1392,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev) { + if (map->stripes[i].dev == sdev->dev && + map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sdev, map, i, chunk_offset, length); if (ret) goto out; @@ -1487,7 +1489,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, - chunk_offset, length); + chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) break; -- cgit v1.2.3 From a7e221e9002306a753d6f78b4060edabce402033 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 14 Feb 2012 17:12:23 +0900 Subject: Btrfs: fix memory leak in load_free_space_cache() load_free_space_cache() has forgotten to free path. Signed-off-by: Tsutomu Itoh --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5802b1473c3d..b30242f4435e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -777,6 +777,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); + btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); -- cgit v1.2.3 From 87826df0ec36fc28884b4ddbb3f3af41c4c2008f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 15 Feb 2012 16:23:57 +0100 Subject: btrfs: delalloc for page dirtied out-of-band in fixup worker We encountered an issue that was easily observable on s/390 systems but could really happen anywhere. The timing just seemed to hit reliably on s/390 with limited memory. The gist is that when an unexpected set_page_dirty() happened, we'd run into the BUG() in btrfs_writepage_fixup_worker since it wasn't properly set up for delalloc. This patch does the following: - Performs the missing delalloc in the fixup worker - Allow the start hook to return -EBUSY which informs __extent_writepage that it should mark the page skipped and not to redirty it. This is required since the fixup worker can fail with -ENOSPC and the page will have already been redirtied. That causes an Oops in drop_outstanding_extents later. Retrying the fixup worker could lead to an infinite loop. Deferring the page redirty also saves us some cycles since the page would be stuck in a resubmit-redirty loop until the fixup worker completes. It's not harmful, just wasteful. - If the fixup worker fails, we mark the page and mapping as errored, and end the writeback, similar to what we would do had the page actually been submitted to writeback. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 65 ++++++++++++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 14 +++++++++-- 3 files changed, 53 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fcf77e1ded40..89ba79fb945c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2161,6 +2161,38 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(NULL, page, + start, end, NULL); + /* Writeback already completed */ + if (ret == 0) + return 1; + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2172,13 +2204,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; - int ret; do { struct page *page = bvec->bv_page; @@ -2195,28 +2225,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; if (whole_page) end_page_writeback(page); @@ -2818,8 +2829,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc6a042cb6fc..cecc3518c121 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -319,4 +319,5 @@ struct btrfs_mapping_tree; int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7405753ec5d7..bf392e532617 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1555,6 +1555,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; + int ret; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; @@ -1582,12 +1583,21 @@ again: page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); goto again; } - BUG(); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); + set_page_dirty(page); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -1630,7 +1640,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) fixup->work.func = btrfs_writepage_fixup_worker; fixup->page = page; btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EAGAIN; + return -EBUSY; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From c08782dacd7a098f2b8bca7f4a57a5b402e9e1e5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: btrfs: fix structs where bitfields and spinlock/atomic share 8B word On ia64, powerpc64 and sparc64 the bitfield is modified through a RMW cycle and current gcc rewrites the adjacent 4B word, which in case of a spinlock or atomic has disaterous effect. https://lkml.org/lkml/2012/2/1/220 Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_map.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3c2cbf7b6663..8e4457e0478e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -886,7 +886,7 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full:1; + unsigned int full; }; /* diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 33a7890b1f40..1195f09761fe 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,8 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree:1; - unsigned int compress_type:4; + unsigned int in_tree; + unsigned int compress_type; }; struct extent_map_tree { -- cgit v1.2.3 From 8a3344269465b26761b74493cfbeeaa75d821614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Oct 2011 18:06:13 +0200 Subject: btrfs: silence warning in raid array setup Raid array setup code creates an extent buffer in an usual way. When the PAGE_CACHE_SIZE is > super block size, the extent pages are not marked up-to-date, which triggers a WARN_ON in the following write_extent_buffer call. Add an explicit up-to-date call to silence the warning. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ffdb154daec..d8f282b5baa9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4357,6 +4357,20 @@ int btrfs_read_sys_array(struct btrfs_root *root) return -ENOMEM; btrfs_set_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3 From 12fc9d0923ca70ae8960bccebac09d5c12f8c4d4 Mon Sep 17 00:00:00 2001 From: Florian Albrechtskirchinger Date: Fri, 10 Feb 2012 22:15:54 +0100 Subject: btrfs: honor umask when creating subvol root Set the subvol root inode permissions based on the current umask. --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf392e532617..6e0ee9b0d742 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6706,8 +6706,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int err; u64 index = 0; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, S_IFDIR | 0700, &index); + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; -- cgit v1.2.3 From 013bd4c336ad0d30e9e41f9cff0dbc1858934e75 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 10:11:40 +0900 Subject: Btrfs: fix return value check of extent_io_ops This patch adds the check on the return value of extent_io_ops. Signed-off-by: Tsutomu Itoh --- fs/btrfs/extent_io.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89ba79fb945c..b05d35a7c0f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2154,9 +2154,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2790,9 +2791,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + &nr_written); + BUG_ON(ret); /* * delalloc_end is already one less than the total * length, so we don't subtract one from -- cgit v1.2.3 From 600a45e1d5e376f679ff9ecc4ce9452710a6d27c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 16 Feb 2012 15:01:24 +0800 Subject: Btrfs: fix deadlock on page lock when doing auto-defragment When I ran xfstests circularly on a auto-defragment btrfs, the deadlock happened. Steps to reproduce: [tty0] # export MOUNT_OPTIONS="-o autodefrag" # export TEST_DEV= # export TEST_DIR= # export SCRATCH_DEV= # export SCRATCH_MNT= # while [ 1 ] > do > ./check 091 127 263 > sleep 1 > done [tty1] # while [ 1 ] > do > echo 3 > /proc/sys/vm/drop_caches > done Several hours later, the test processes will hang on, and the deadlock will happen on page lock. The reason is that: Auto defrag task Flush thread Test task btrfs_writepages() add ordered extent (including page 1, 2) set page 1 writeback set page 2 writeback endio_fn() end page 2 writeback release page 2 lock page 1 alloc and lock page 2 page 2 is not uptodate btrfs_readpage() start ordered extent() btrfs_writepages() try to lock page 1 so deadlock happens. Fix this bug by unlocking the page which is in writeback, and re-locking it after the writeback end. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b06a5ca8afc..e9bdb8b783e5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -862,6 +862,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) @@ -872,18 +873,34 @@ static int cluster_pages_for_defrag(struct inode *inode, num_pages << PAGE_CACHE_SHIFT); if (ret) return ret; -again: - ret = 0; i_done = 0; + tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; +again: page = find_or_create_page(inode->i_mapping, - start_index + i, mask); + start_index + i, mask); if (!page) break; + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent(tree, page_start, page_end, GFP_NOFS); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -894,15 +911,22 @@ again: break; } } + isize = i_size_read(inode); file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end || - page->mapping != inode->i_mapping) { + if (!isize || page->index > file_end) { /* whoops, we blew past eof, skip this page */ unlock_page(page); page_cache_release(page); break; } + + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + pages[i] = page; i_done++; } @@ -925,25 +949,6 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); - if (ordered && - ordered->file_offset + ordered->len > page_start && - ordered->file_offset < page_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, - &cached_state, GFP_NOFS); - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_wait_ordered_range(inode, page_start, - page_end - page_start); - goto again; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, -- cgit v1.2.3 From 285190d99fef696ec8b0041787387f013cb71d67 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 16:23:58 +0900 Subject: Btrfs: check return value of lookup_extent_mapping() correctly This patch corrects error checking of lookup_extent_mapping(). Signed-off-by: Tsutomu Itoh --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d2..d02c27cd14c7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b05d35a7c0f1..8d6f55fbd28e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3308,7 +3308,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8f282b5baa9..cd040bf3fd87 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1954,7 +1954,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; -- cgit v1.2.3 From 0449314a9cc5a7fb0bd42e2175a3c46f68f3a2b0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:37 +0800 Subject: Btrfs: skip states when they does not contain bits to clear Clearing a range's bits is different with setting them, since we don't need to touch them when states do not contain bits we want. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8d6f55fbd28e..fe14285b53f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -513,6 +513,15 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + if (state->end < end && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) + goto next; + /* * | ---- desired range ---- | * | state | or @@ -565,12 +574,8 @@ hit_next: goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - set |= clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; -- cgit v1.2.3 From 9d47c7671dc555e198c7347a173ed37316e0c4c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:38 +0800 Subject: Btrfs: kick out redundant stuff in convert_extent_bit clear_state_bit will do merge_state for us, so kick out the redundant one. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe14285b53f1..37259ff5cd71 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -966,8 +966,6 @@ hit_next: set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -1012,7 +1010,6 @@ hit_next: if (state->end <= end) { set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1073,8 +1070,6 @@ hit_next: set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } -- cgit v1.2.3 From d9b0218f6cb682aa6a4ada2bfc5a25fdf3018563 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:39 +0800 Subject: Btrfs: fix a bug on overcommit stuff When overcommitting, we should check the sum of pinned space and bytes for delayed item. Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60bfe2d68547..dc083f55bcfd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3611,12 +3611,15 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; + spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size < bytes) { + if (space_info->bytes_pinned + delayed_rsv->size < bytes) { spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); -- cgit v1.2.3 From f86f36a6ae625eda87a13e1ea102a908e08f491b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2012 20:33:19 -0500 Subject: NFSv4.1: Fix a NFSv4.1 session initialisation regression Commit aacd553 (NFSv4.1: cleanup init and reset of session slot tables) introduces a regression in the session initialisation code. New tables now find their sequence ids initialised to 0, rather than the mandated value of 1 (see RFC5661). Fix the problem by merging nfs4_reset_slot_table() and nfs4_init_slot_table(). Since the tbl->max_slots is initialised to 0, the test in nfs4_reset_slot_table for max_reqs != tbl->max_slots will automatically pass for an empty table. Reported-by: Vitaliy Gusev Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 107 +++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d202e04aca94..b4d67feab90b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5008,37 +5008,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } +static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +{ + return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); +} + +static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, + struct nfs4_slot *new, + u32 max_slots, + u32 ivalue) +{ + struct nfs4_slot *old = NULL; + u32 i; + + spin_lock(&tbl->slot_tbl_lock); + if (new) { + old = tbl->slots; + tbl->slots = new; + tbl->max_slots = max_slots; + } + tbl->highest_used_slotid = -1; /* no slot is currently used */ + for (i = 0; i < tbl->max_slots; i++) + tbl->slots[i].seq_nr = ivalue; + spin_unlock(&tbl->slot_tbl_lock); + kfree(old); +} + /* - * Reset a slot table + * (re)Initialise a slot table */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - int ivalue) +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + u32 ivalue) { struct nfs4_slot *new = NULL; - int i; - int ret = 0; + int ret = -ENOMEM; dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, max_reqs, tbl->max_slots); /* Does the newly negotiated max_reqs match the existing slot table? */ if (max_reqs != tbl->max_slots) { - ret = -ENOMEM; - new = kmalloc(max_reqs * sizeof(struct nfs4_slot), - GFP_NOFS); + new = nfs4_alloc_slots(max_reqs, GFP_NOFS); if (!new) goto out; - ret = 0; - kfree(tbl->slots); - } - spin_lock(&tbl->slot_tbl_lock); - if (new) { - tbl->slots = new; - tbl->max_slots = max_reqs; } - for (i = 0; i < tbl->max_slots; ++i) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); + ret = 0; + + nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); out: @@ -5060,36 +5076,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) return; } -/* - * Initialize slot table - */ -static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, - int max_slots, int ivalue) -{ - struct nfs4_slot *slot; - int ret = -ENOMEM; - - BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); - - dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - - slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); - if (!slot) - goto out; - ret = 0; - - spin_lock(&tbl->slot_tbl_lock); - tbl->max_slots = max_slots; - tbl->slots = slot; - tbl->highest_used_slotid = -1; /* no slot is currently used */ - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - /* * Initialize or reset the forechannel and backchannel tables */ @@ -5101,25 +5087,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) dprintk("--> %s\n", __func__); /* Fore channel */ tbl = &ses->fc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ - return status; - } else { - status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) - return status; - } + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; /* Back channel */ tbl = &ses->bc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0); - if (status) - /* Fore and back channel share a connection so get - * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); - } else - status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); return status; } -- cgit v1.2.3 From abe9a6d57b4544ac208401f9c0a4262814db2be4 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 16 Feb 2012 11:17:05 -0500 Subject: NFSv4: fix server_scope memory leak server_scope would never be freed if nfs4_check_cl_exchange_flags() returned non-zero Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b4d67feab90b..ec9f6ef6c5dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4883,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->cl_rpcclient->cl_auth->au_flavor); res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); - if (unlikely(!res.server_scope)) - return -ENOMEM; + if (unlikely(!res.server_scope)) { + status = -ENOMEM; + goto out; + } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) @@ -4901,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->server_scope = NULL; } - if (!clp->server_scope) + if (!clp->server_scope) { clp->server_scope = res.server_scope; - else - kfree(res.server_scope); + goto out; + } } - + kfree(res.server_scope); +out: dprintk("<-- %s status= %d\n", __func__, status); return status; } -- cgit v1.2.3 From 692e5759a43b916f0b66bcb39b2957499992381e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:36 +0800 Subject: Btrfs: be less strict on finding next node in clear_extent_bit In clear_extent_bit, it is enough that next node is adjacent in tree level. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37259ff5cd71..45ca8f9b0d61 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -582,8 +582,7 @@ next: if (start <= end && next_node) { state = rb_entry(next_node, struct extent_state, rb_node); - if (state->start == start) - goto hit_next; + goto hit_next; } goto search_again; -- cgit v1.2.3 From 20f12d8ac01917d96860f352f67eddd912df0afb Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Mon, 6 Feb 2012 12:50:07 +0000 Subject: xfs: change available ranges of softlimit and hardlimit in quota check In general, quota allows us to use disk blocks and inodes up to each limit, that is, they are available if they don't exceed their limitations. Current xfs sets their available ranges to lower than them except disk inode quota check. So, this patch changes the ranges to not beyond them. Signed-off-by: Mitsuo Hayasaka Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 24 ++++++++++++------------ fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_qm_syscalls.c | 4 ++-- fs/xfs/xfs_trans_dquot.c | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index cbcb7bea38e2..53db20ee3e77 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -139,10 +139,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_btimer) { if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_softlimit))) || (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_btimelimit); @@ -151,10 +151,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_softlimit))) && (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = 0; } @@ -162,10 +162,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_itimer) { if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_softlimit))) || (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_itimelimit); @@ -174,10 +174,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_softlimit))) && (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = 0; } @@ -185,10 +185,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_rtbtimer) { if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_softlimit))) || (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_rtbtimelimit); @@ -197,10 +197,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_softlimit))) && (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 15ff5392fb65..0ed9ee77937c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1981,7 +1981,7 @@ xfs_qm_dqcheck( if (!errs && ddq->d_id) { if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit)) { if (!ddq->d_btimer) { if (flags & XFS_QMOPT_DOWARN) @@ -1992,7 +1992,7 @@ xfs_qm_dqcheck( } } if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit)) { if (!ddq->d_itimer) { if (flags & XFS_QMOPT_DOWARN) @@ -2003,7 +2003,7 @@ xfs_qm_dqcheck( } } if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit)) { if (!ddq->d_rtbtimer) { if (flags & XFS_QMOPT_DOWARN) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index eafbcff81f3a..711a86e39ff0 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -813,11 +813,11 @@ xfs_qm_export_dquot( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee67792d..85255536b4b6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -649,12 +649,12 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - hardlimit <= nblks + *resbcountp) { + hardlimit < nblks + *resbcountp) { xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } if (softlimit > 0ULL && - softlimit <= nblks + *resbcountp) { + softlimit < nblks + *resbcountp) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, -- cgit v1.2.3 From c922bbc819324558e61402a7a76c10c550ca61bc Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Mon, 6 Feb 2012 12:50:30 +0000 Subject: xfs: make inode quota check more general The xfs checks quota when reserving disk blocks and inodes. In the block reservation, it checks if the total number of blocks including current usage and new reservation exceed quota. In the inode reservation, it checks using the total number of inodes including only current usage without new reservation. However, this inode quota check works well since the caller of xfs_trans_dquot() always sets the argument of the number of new inode reservation to 1 or 0 and inode is reserved one by one in current xfs. To make it more general, this patch changes it to the same way as the block quota check. Signed-off-by: Mitsuo Hayasaka Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_dquot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 85255536b4b6..c4ba366d24e6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -677,11 +677,13 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) { + if (hardlimit > 0ULL && + hardlimit < ninos + count) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; } - if (softlimit > 0ULL && count >= softlimit) { + if (softlimit > 0ULL && + softlimit < ninos + count) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, -- cgit v1.2.3 From faf309009e2e18d30c032b7d9479f29b91677c37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 17:24:20 -0800 Subject: sys_poll: fix incorrect type for 'timeout' parameter The 'poll()' system call timeout parameter is supposed to be 'int', not 'long'. Now, the reason this matters is that right now 32-bit compat mode is broken on at least x86-64, because the 32-bit code just calls 'sys_poll()' directly on x86-64, and the 32-bit argument will have been zero-extended, turning a signed 'int' into a large unsigned 'long' value. We could just introduce a 'compat_sys_poll()' function for this, and that may eventually be what we have to do, but since the actual standard poll() semantics is *supposed* to be 'int', and since at least on x86-64 glibc sign-extends the argument before invocing the system call (so nobody can actually use a 64-bit timeout value in user space _anyway_, even in 64-bit binaries), the simpler solution would seem to be to just fix the definition of the system call to match what it should have been from the very start. If it turns out that somebody somehow circumvents the user-level libc 64-bit sign extension and actually uses a large unsigned 64-bit timeout despite that not being how poll() is supposed to work, we will need to do the compat_sys_poll() approach. Reported-by: Thomas Meyer Acked-by: Eric Dumazet Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_wrapper.S | 2 +- fs/select.c | 2 +- include/linux/syscalls.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 18c51df9fe06..ff605a39cf43 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -662,7 +662,7 @@ ENTRY(sys32_getresuid16_wrapper) ENTRY(sys32_poll_wrapper) llgtr %r2,%r2 # struct pollfd * llgfr %r3,%r3 # unsigned int - lgfr %r4,%r4 # long + lgfr %r4,%r4 # int jg sys_poll # branch to system call ENTRY(sys32_setresgid16_wrapper) diff --git a/fs/select.c b/fs/select.c index d33418fdc858..e782258d0de3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block) } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, - long, timeout_msecs) + int, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 515669fa3c1d..8ec1153ff57b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,7 +624,7 @@ asmlinkage long sys_socketpair(int, int, int, int __user *); asmlinkage long sys_socketcall(int call, unsigned long __user *args); asmlinkage long sys_listen(int, int); asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, - long timeout); + int timeout); asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp); asmlinkage long sys_old_select(struct sel_arg_struct __user *arg); -- cgit v1.2.3 From fe66a05a06795bd3b788404d69ea7709f46a1609 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 08:40:56 -0500 Subject: Btrfs: improve error handling for btrfs_insert_dir_item callers This allows us to gracefully continue if we aren't able to insert directory items, both for normal files/dirs and snapshots. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 20 +++++++++++++++++++- fs/btrfs/transaction.c | 13 +++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6e0ee9b0d742..cbeb2e36cebc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4585,7 +4585,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - BUG_ON(ret); + if (ret) + goto fail_dir_item; btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); @@ -4593,6 +4594,23 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, parent_inode); } return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); + } + return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 287a6728b1ad..016977beee5c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -915,7 +915,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - BUG_ON(ret); + if (ret) { + pending->error = -EEXIST; + dput(parent); + goto fail; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); @@ -993,12 +997,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - int ret; - list_for_each_entry(pending, head, list) { - ret = create_pending_snapshot(trans, fs_info, pending); - BUG_ON(ret); - } + list_for_each_entry(pending, head, list) + create_pending_snapshot(trans, fs_info, pending); return 0; } -- cgit v1.2.3 From a6b0d5c8dbfd428717fc4db4c36757783f391c7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 20:53:43 -0500 Subject: Btrfs: make sure we update latest_bdev When we are setting up the mount, we close all the devices that were not actually part of the metadata we found. But, we don't make sure that one of those devices wasn't fs_devices->latest_bdev, which means we can do a use after free on the one we closed. This updates latest_bdev as it goes. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/volumes.c | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 58d0678dfcba..b801d29f3f10 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2305,6 +2305,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices); + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd040bf3fd87..7eefdef8a14c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -459,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!latest_transid || + device->generation > latest_transid) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } if (device->bdev) { blkdev_put(device->bdev, device->mode); @@ -487,6 +498,10 @@ again: goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); return 0; } -- cgit v1.2.3 From 16780cabb877dbd0c8c5e9ff9bdebd6c5bdd1a7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 22:14:55 -0500 Subject: Btrfs: add extra sanity checks on the path names in btrfs_mksubvol Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e9bdb8b783e5..05446f77f99b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1333,6 +1333,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, goto out; } + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out; + } + if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly); -- cgit v1.2.3 From 506531905296d6aee84480c879b25ea98c3f9db6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 22 Feb 2012 12:36:24 -0500 Subject: Btrfs: clear the extent uptodate bits during parent transid failures If btrfs reads a block and finds a parent transid mismatch, it clears the uptodate flags on the extent buffer, and the pages inside it. But we only clear the uptodate bits in the state tree if the block straddles more than one page. This is from an old optimization from to reduce contention on the extent state tree. But it is buggy because the code that retries a read from a different copy of the block is going to find the uptodate state bits set and skip the IO. The end result of the bug is that we'll never actually read the good copy (if there is one). The fix here is to always clear the uptodate state bits, which is safe because this code is only called when the parent transid fails. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 45ca8f9b0d61..a55fbe6252de 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3871,10 +3871,9 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) -- cgit v1.2.3 From 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Feb 2012 10:49:04 -0500 Subject: Btrfs: increase the global block reserve estimates When doing IO with large amounts of data fragmentation, the global block reserve calulations are too low. This increases them to avoid ENOSPC crashes. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc083f55bcfd..079e5a1c343c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4108,7 +4108,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div64_u64(meta_used, 3) * 2; return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } -- cgit v1.2.3 From 37fbf4bfb826372c3ca6c09d8a015d1fe9f5e186 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 23 Feb 2012 23:40:05 +0000 Subject: Restore direct_io / truncate locking API With kernel 3.1, Christoph removed i_alloc_sem and replaced it with calls (namely inode_dio_wait() and inode_dio_done()) which are EXPORT_SYMBOL_GPL() thus they cannot be used by non-GPL file systems and further inode_dio_wait() was pushed from notify_change() into the file system ->setattr() method but no non-GPL file system can make this call. That means non-GPL file systems cannot exist any more unless they do not use any VFS functionality related to reading/writing as far as I can tell or at least as long as they want to implement direct i/o. Both Linus and Al (and others) have said on LKML that this breakage of the VFS API should not have happened and that the change was simply missed as it was not documented in the change logs of the patches that did those changes. This patch changes the two function exports in question to be EXPORT_SYMBOL() thus restoring the VFS API as it used to be - accessible for all modules. Christoph, who introduced the two functions and exported them GPL-only is CC-ed on this patch to give him the opportunity to object to the symbols being changed in this manner if he did indeed intend them to be GPL-only and does not want them to become available to all modules. Signed-off-by: Anton Altaparmakov CC: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 4a588dbd11bf..f4aadd15b613 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -173,7 +173,7 @@ void inode_dio_wait(struct inode *inode) if (atomic_read(&inode->i_dio_count)) __inode_dio_wait(inode); } -EXPORT_SYMBOL_GPL(inode_dio_wait); +EXPORT_SYMBOL(inode_dio_wait); /* * inode_dio_done - signal finish of a direct I/O requests @@ -187,7 +187,7 @@ void inode_dio_done(struct inode *inode) if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -EXPORT_SYMBOL_GPL(inode_dio_done); +EXPORT_SYMBOL(inode_dio_done); /* * How many pages are in the queue? -- cgit v1.2.3 From e77266e4c4be6f9dc91bf688bce015a8babd5fe0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Feb 2012 10:39:05 -0500 Subject: Btrfs: fix compiler warnings on 32 bit systems The enospc tracing code added some interesting uses of u64 pointer casts. Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++---------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/transaction.c | 3 ++- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b669a7d8e499..d986824bb2b4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -644,7 +644,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - int ret; + int ret = 0; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 079e5a1c343c..37e0a800d34e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3312,7 +3312,8 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); + (u64)(unsigned long)data_sinfo, + bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3333,7 +3334,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); + (u64)(unsigned long)data_sinfo, + bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3698,9 +3700,9 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3769,9 +3771,9 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3916,8 +3918,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4135,14 +4137,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); + (u64)(unsigned long)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); + (u64)(unsigned long)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4195,7 +4197,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; @@ -4713,9 +4716,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + "space_info", + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 213ffa86ce1b..ee15d88b33d2 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,7 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); @@ -500,7 +501,8 @@ again: out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 016977beee5c..04b77e3ceb7a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -327,7 +327,8 @@ again: if (num_bytes) { trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)h, num_bytes, 1); + (u64)(unsigned long)h, + num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } -- cgit v1.2.3 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ include/asm-generic/poll.h | 2 ++ include/linux/signalfd.h | 5 ++++- kernel/fork.c | 5 ++++- 5 files changed, 25 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf24..34bbfc6dd8dc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451dd..79c1eea98a3a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h index 44bce836d350..9ce7f44aebd2 100644 --- a/include/asm-generic/poll.h +++ b/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 3ff4961da9b5..247399b2979a 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig) wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78e..e2cd3e2a5ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.2.3 From 971316f0503a5c50633d07b83b6db2f15a3a5b00 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:29 +0100 Subject: epoll: ep_unregister_pollwait() can use the freed pwq->whead signalfd_cleanup() ensures that ->signalfd_wqh is not used, but this is not enough. eppoll_entry->whead still points to the memory we are going to free, ep_unregister_pollwait()->remove_wait_queue() is obviously unsafe. Change ep_poll_callback(POLLFREE) to set eppoll_entry->whead = NULL, change ep_unregister_pollwait() to check pwq->whead != NULL under rcu_read_lock() before remove_wait_queue(). We add the new helper, ep_remove_wait_queue(), for this. This works because sighand_cachep is SLAB_DESTROY_BY_RCU and because ->signalfd_wqh is initialized in sighand_ctor(), not in copy_sighand. ep_unregister_pollwait()->remove_wait_queue() can play with already freed and potentially reused ->sighand, but this is fine. This memory must have the valid ->signalfd_wqh until rcu_read_unlock(). Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 30 +++++++++++++++++++++++++++--- fs/signalfd.c | 6 +++++- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 34bbfc6dd8dc..ea54cdef04dd 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -320,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { @@ -467,6 +472,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from @@ -481,7 +498,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) pwq = list_first_entry(lsthead, struct eppoll_entry, llink); list_del(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); + ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } @@ -842,9 +859,16 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - /* the caller holds eppoll_entry->whead->lock */ - if ((unsigned long)key & POLLFREE) + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ list_del_init(&wait->task_list); + } spin_lock_irqsave(&ep->lock, flags); diff --git a/fs/signalfd.c b/fs/signalfd.c index 79c1eea98a3a..7ae2a574cb25 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -33,7 +33,11 @@ void signalfd_cleanup(struct sighand_struct *sighand) { wait_queue_head_t *wqh = &sighand->signalfd_wqh; - + /* + * The lockless check can race with remove_wait_queue() in progress, + * but in this case its caller should run under rcu_read_lock() and + * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + */ if (likely(!waitqueue_active(wqh))) return; -- cgit v1.2.3 From a32744d4abae24572eff7269bc17895c41bd0085 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 22 Feb 2012 20:45:44 +0800 Subject: autofs: work around unhappy compat problem on x86-64 When the autofs protocol version 5 packet type was added in commit 5c0a32fc2cd0 ("autofs4: add new packet type for v5 communications"), it obvously tried quite hard to be word-size agnostic, and uses explicitly sized fields that are all correctly aligned. However, with the final "char name[NAME_MAX+1]" array at the end, the actual size of the structure ends up being not very well defined: because the struct isn't marked 'packed', doing a "sizeof()" on it will align the size of the struct up to the biggest alignment of the members it has. And despite all the members being the same, the alignment of them is different: a "__u64" has 4-byte alignment on x86-32, but native 8-byte alignment on x86-64. And while 'NAME_MAX+1' ends up being a nice round number (256), the name[] array starts out a 4-byte aligned. End result: the "packed" size of the structure is 300 bytes: 4-byte, but not 8-byte aligned. As a result, despite all the fields being in the same place on all architectures, sizeof() will round up that size to 304 bytes on architectures that have 8-byte alignment for u64. Note that this is *not* a problem for 32-bit compat mode on POWER, since there __u64 is 8-byte aligned even in 32-bit mode. But on x86, 32-bit and 64-bit alignment is different for 64-bit entities, and as a result the structure that has exactly the same layout has different sizes. So on x86-64, but no other architecture, we will just subtract 4 from the size of the structure when running in a compat task. That way we will write the properly sized packet that user mode expects. Not pretty. Sadly, this very subtle, and unnecessary, size difference has been encoded in user space that wants to read packets of *exactly* the right size, and will refuse to touch anything else. Reported-and-tested-by: Thomas Meyer Signed-off-by: Ian Kent Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/dev-ioctl.c | 1 + fs/autofs4/inode.c | 2 ++ fs/autofs4/waitq.c | 22 +++++++++++++++++++--- 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d8d8e7ba6a1e..eb1cc92cd67d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -110,6 +110,7 @@ struct autofs_sb_info { int sub_version; int min_proto; int max_proto; + int compat_daemon; unsigned long exp_timeout; unsigned int type; int reghost_enabled; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 76741d8d7786..85f1fcdb30e7 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; + sbi->compat_daemon = is_compat_task(); } out: mutex_unlock(&sbi->wq_mutex); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index e16980b00b8d..06858d955120 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "autofs_i.h" #include @@ -224,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; + sbi->compat_daemon = is_compat_task(); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d38a7b..9c098db43344 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -91,7 +91,24 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - + +/* + * The autofs_v5 packet was misdesigned. + * + * The packets are identical on x86-32 and x86-64, but have different + * alignment. Which means that 'sizeof()' will give different results. + * Fix it up for the case of running 32-bit user mode on a 64-bit kernel. + */ +static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi) +{ + size_t pktsz = sizeof(struct autofs_v5_packet); +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (sbi->compat_daemon > 0) + pktsz -= 4; +#endif + return pktsz; +} + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -155,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - pktsz = sizeof(*packet); - + pktsz = autofs_v5_packet_size(sbi); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); -- cgit v1.2.3