From d4f4de5e5ef8efde85febb6876cd3c8ab1631999 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Sep 2019 12:12:39 -0400 Subject: Fix the locking in dcache_readdir() and friends There are two problems in dcache_readdir() - one is that lockless traversal of the list needs non-trivial cooperation of d_alloc() (at least a switch to list_add_rcu(), and probably more than just that) and another is that it assumes that no removal will happen without the directory locked exclusive. Said assumption had always been there, never had been stated explicitly and is violated by several places in the kernel (devpts and selinuxfs). * replacement of next_positive() with different calling conventions: it returns struct list_head * instead of struct dentry *; the latter is passed in and out by reference, grabbing the result and dropping the original value. * scan is under ->d_lock. If we run out of timeslice, cursor is moved after the last position we'd reached and we reschedule; then the scan continues from that place. To avoid livelocks between multiple lseek() (with cursors getting moved past each other, never reaching the real entries) we always skip the cursors, need_resched() or not. * returned list_head * is either ->d_child of dentry we'd found or ->d_subdirs of parent (if we got to the end of the list). * dcache_readdir() and dcache_dir_lseek() switched to new helper. dcache_readdir() always holds a reference to dentry passed to dir_emit() now. Cursor is moved to just before the entry where dir_emit() has failed or into the very end of the list, if we'd run out. * move_cursor() eliminated - it had sucky calling conventions and after fixing that it became simply list_move() (in lseek and scan_positives) or list_move_tail() (in readdir). All operations with the list are under ->d_lock now, and we do not depend upon having all file removals done with parent locked exclusive anymore. Cc: stable@vger.kernel.org Reported-by: "zhengbin (A)" Signed-off-by: Al Viro --- fs/libfs.c | 134 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 69 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index c9b2850c0f7c..8e023b08a240 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,58 +89,47 @@ int dcache_dir_close(struct inode *inode, struct file *file) EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ -static struct dentry *next_positive(struct dentry *parent, - struct list_head *from, - int count) +/* + * Returns an element of siblings' list. + * We are looking for th positive after

; if + * found, dentry is grabbed and passed to caller via *. + * If no such element exists, the anchor of list is returned + * and * is set to NULL. + */ +static struct list_head *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry **res) { - unsigned *seq = &parent->d_inode->i_dir_seq, n; - struct dentry *res; - struct list_head *p; - bool skipped; - int i; + struct dentry *dentry = cursor->d_parent, *found = NULL; -retry: - i = count; - skipped = false; - n = smp_load_acquire(seq) & ~1; - res = NULL; - rcu_read_lock(); - for (p = from->next; p != &parent->d_subdirs; p = p->next) { + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); - if (!simple_positive(d)) { - skipped = true; - } else if (!--i) { - res = d; - break; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); } } - rcu_read_unlock(); - if (skipped) { - smp_rmb(); - if (unlikely(*seq != n)) - goto retry; - } - return res; -} - -static void move_cursor(struct dentry *cursor, struct list_head *after) -{ - struct dentry *parent = cursor->d_parent; - unsigned n, *seq = &parent->d_inode->i_dir_seq; - spin_lock(&parent->d_lock); - for (;;) { - n = *seq; - if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) - break; - cpu_relax(); - } - __list_del(cursor->d_child.prev, cursor->d_child.next); - if (after) - list_add(&cursor->d_child, after); - else - list_add_tail(&cursor->d_child, &parent->d_subdirs); - smp_store_release(seq, n + 2); - spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + dput(*res); + *res = found; + return p; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -158,17 +147,28 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) return -EINVAL; } if (offset != file->f_pos) { + struct dentry *cursor = file->private_data; + struct dentry *to = NULL; + struct list_head *p; + file->f_pos = offset; - if (file->f_pos >= 2) { - struct dentry *cursor = file->private_data; - struct dentry *to; - loff_t n = file->f_pos - 2; - - inode_lock_shared(dentry->d_inode); - to = next_positive(dentry, &dentry->d_subdirs, n); - move_cursor(cursor, to ? &to->d_child : NULL); - inode_unlock_shared(dentry->d_inode); + inode_lock_shared(dentry->d_inode); + + if (file->f_pos > 2) { + p = scan_positives(cursor, &dentry->d_subdirs, + file->f_pos - 2, &to); + spin_lock(&dentry->d_lock); + list_move(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + } else { + spin_lock(&dentry->d_lock); + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); } + + dput(to); + + inode_unlock_shared(dentry->d_inode); } return offset; } @@ -190,25 +190,29 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p = &cursor->d_child; - struct dentry *next; - bool moved = false; + struct list_head *anchor = &dentry->d_subdirs; + struct dentry *next = NULL; + struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = &dentry->d_subdirs; - while ((next = next_positive(dentry, p, 1)) != NULL) { + p = anchor; + else + p = &cursor->d_child; + + while ((p = scan_positives(cursor, p, 1, &next)) != anchor) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; - moved = true; - p = &next->d_child; ctx->pos++; } - if (moved) - move_cursor(cursor, p); + spin_lock(&dentry->d_lock); + list_move_tail(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + dput(next); + return 0; } EXPORT_SYMBOL(dcache_readdir); -- cgit v1.2.3 From bdf200731145f07a6127cb16753e2e8fdc159cf4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 1 Oct 2019 09:53:29 -0600 Subject: io_uring: use __kernel_timespec in timeout ABI All system calls use struct __kernel_timespec instead of the old struct timespec, but this one was just added with the old-style ABI. Change it now to enforce the use of __kernel_timespec, avoiding ABI confusion and the need for compat handlers on 32-bit architectures. Any user space caller will have to use __kernel_timespec now, but this is unambiguous and works for any C library regardless of the time_t definition. A nicer way to specify the timeout would have been a less ambiguous 64-bit nanosecond value, but I suppose it's too late now to change that as this would impact both 32-bit and 64-bit users. Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index dd094b387cab..0bc167aca46d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1892,15 +1892,15 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned count, req_dist, tail_index; struct io_ring_ctx *ctx = req->ctx; struct list_head *entry; - struct timespec ts; + struct timespec64 ts; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || sqe->len != 1) return -EINVAL; - if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, - sizeof(ts))) + + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; /* @@ -1934,7 +1934,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), HRTIMER_MODE_REL); return 0; } -- cgit v1.2.3 From c67d970f0ea8dcc423e112137d34334fa0abb8ec Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 30 Sep 2019 10:20:25 +0100 Subject: Btrfs: fix memory leak due to concurrent append writes with fiemap When we have a buffered write that starts at an offset greater than or equals to the file's size happening concurrently with a full ranged fiemap, we can end up leaking an extent state structure. Suppose we have a file with a size of 1Mb, and before the buffered write and fiemap are performed, it has a single extent state in its io tree representing the range from 0 to 1Mb, with the EXTENT_DELALLOC bit set. The following sequence diagram shows how the memory leak happens if a fiemap a buffered write, starting at offset 1Mb and with a length of 4Kb, are performed concurrently. CPU 1 CPU 2 extent_fiemap() --> it's a full ranged fiemap range from 0 to LLONG_MAX - 1 (9223372036854775807) --> locks range in the inode's io tree --> after this we have 2 extent states in the io tree: --> 1 for range [0, 1Mb[ with the bits EXTENT_LOCKED and EXTENT_DELALLOC_BITS set --> 1 for the range [1Mb, LLONG_MAX[ with the EXTENT_LOCKED bit set --> start buffered write at offset 1Mb with a length of 4Kb btrfs_file_write_iter() btrfs_buffered_write() --> cached_state is NULL lock_and_cleanup_extent_if_need() --> returns 0 and does not lock range because it starts at current i_size / eof --> cached_state remains NULL btrfs_dirty_pages() btrfs_set_extent_delalloc() (...) __set_extent_bit() --> splits extent state for range [1Mb, LLONG_MAX[ and now we have 2 extent states: --> one for the range [1Mb, 1Mb + 4Kb[ with EXTENT_LOCKED set --> another one for the range [1Mb + 4Kb, LLONG_MAX[ with EXTENT_LOCKED set as well --> sets EXTENT_DELALLOC on the extent state for the range [1Mb, 1Mb + 4Kb[ --> caches extent state [1Mb, 1Mb + 4Kb[ into @cached_state because it has the bit EXTENT_LOCKED set --> btrfs_buffered_write() ends up with a non-NULL cached_state and never calls anything to release its reference on it, resulting in a memory leak Fix this by calling free_extent_state() on cached_state if the range was not locked by lock_and_cleanup_extent_if_need(). The same issue can happen if anything else other than fiemap locks a range that covers eof and beyond. This could be triggered, sporadically, by test case generic/561 from the fstests suite, which makes duperemove run concurrently with fsstress, and duperemove does plenty of calls to fiemap. When CONFIG_BTRFS_DEBUG is set the leak is reported in dmesg/syslog when removing the btrfs module with a message like the following: [77100.039461] BTRFS: state leak: start 6574080 end 6582271 state 16402 in tree 0 refs 1 Otherwise (CONFIG_BTRFS_DEBUG not set) detectable with kmemleak. CC: stable@vger.kernel.org # 4.16+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8fe4eb7e5045..27e5b269e729 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; @@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), @@ -1758,9 +1758,20 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, &cached_state); + + /* + * If we have not locked the extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. + */ if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); if (ret) { -- cgit v1.2.3 From 4203e968947071586a98b5314fd7ffdea3b4f971 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2019 16:27:25 -0400 Subject: btrfs: fix incorrect updating of log root tree We've historically had reports of being unable to mount file systems because the tree log root couldn't be read. Usually this is the "parent transid failure", but could be any of the related errors, including "fsid mismatch" or "bad tree block", depending on which block got allocated. The modification of the individual log root items are serialized on the per-log root root_mutex. This means that any modification to the per-subvol log root_item is completely protected. However we update the root item in the log root tree outside of the log root tree log_mutex. We do this in order to allow multiple subvolumes to be updated in each log transaction. This is problematic however because when we are writing the log root tree out we update the super block with the _current_ log root node information. Since these two operations happen independently of each other, you can end up updating the log root tree in between writing out the dirty blocks and setting the super block to point at the current root. This means we'll point at the new root node that hasn't been written out, instead of the one we should be pointing at. Thus whatever garbage or old block we end up pointing at complains when we mount the file system later and try to replay the log. Fix this by copying the log's root item into a local root item copy. Then once we're safely under the log_root_tree->log_mutex we update the root item in the log_root_tree. This way we do not modify the log_root_tree while we're committing it, fixing the problem. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Chris Mason Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 77b6797fcac3..2488eb4b70fc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2932,7 +2932,8 @@ out: * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) + struct btrfs_root *log, + struct btrfs_root_item *root_item) { struct btrfs_fs_info *fs_info = log->fs_info; int ret; @@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans, if (log->log_transid == 1) { /* insert root item on the first sync */ ret = btrfs_insert_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } else { ret = btrfs_update_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } return ret; } @@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct btrfs_root_item new_root_item; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -3104,17 +3106,25 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - /* - * Update or create log root item under the root's log_mutex to prevent - * races with concurrent log syncs that can lead to failure to update - * log root item because it was not created yet. - */ - ret = update_log_root(trans, log); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex); + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ cond_wake_up_nomb(&log_root_tree->log_writer_wait); -- cgit v1.2.3 From 7a54789074a54f64addf5b49bf1994f478337a83 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Thu, 12 Sep 2019 19:55:01 -0400 Subject: btrfs: fix balance convert to single on 32-bit host CPUs Currently, the command: btrfs balance start -dconvert=single,soft . on a Raspberry Pi produces the following kernel message: BTRFS error (device mmcblk0p2): balance: invalid convert data profile single This fails because we use is_power_of_2(unsigned long) to validate the new data profile, the constant for 'single' profile uses bit 48, and there are only 32 bits in a long on ARM. Fix by open-coding the check using u64 variables. Tested by completing the original balance command on several Raspberry Pis. Fixes: 818255feece6 ("btrfs: use common helper instead of open coding a bit test") CC: stable@vger.kernel.org # 4.20+ Signed-off-by: Zygo Blaxell Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cdd7af424033..bdfe4493e43a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return is_power_of_2(flags); + /* + * Don't use is_power_of_2(unsigned long) because it won't work + * for the single profile (1ULL << 48) on 32-bit CPUs. + */ + return flags != 0 && (flags & (flags - 1)) == 0; } static inline int balance_need_close(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 11a19a90870ea5496a8ded69b86f5b476b6d3355 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 9 Sep 2019 10:12:04 -0400 Subject: btrfs: allocate new inode in NOFS context A user reported a lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.2.11-gentoo #2 Not tainted ------------------------------------------------------ kswapd0/711 is trying to acquire lock: 000000007777a663 (sb_internal){.+.+}, at: start_transaction+0x3a8/0x500 but task is already holding lock: 000000000ba86300 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x0/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}: kmem_cache_alloc+0x1f/0x1c0 btrfs_alloc_inode+0x1f/0x260 alloc_inode+0x16/0xa0 new_inode+0xe/0xb0 btrfs_new_inode+0x70/0x610 btrfs_symlink+0xd0/0x420 vfs_symlink+0x9c/0x100 do_symlinkat+0x66/0xe0 do_syscall_64+0x55/0x1c0 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (sb_internal){.+.+}: __sb_start_write+0xf6/0x150 start_transaction+0x3a8/0x500 btrfs_commit_inode_delayed_inode+0x59/0x110 btrfs_evict_inode+0x19e/0x4c0 evict+0xbc/0x1f0 inode_lru_isolate+0x113/0x190 __list_lru_walk_one.isra.4+0x5c/0x100 list_lru_walk_one+0x32/0x50 prune_icache_sb+0x36/0x80 super_cache_scan+0x14a/0x1d0 do_shrink_slab+0x131/0x320 shrink_node+0xf7/0x380 balance_pgdat+0x2d5/0x640 kswapd+0x2ba/0x5e0 kthread+0x147/0x160 ret_from_fork+0x24/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sb_internal); lock(fs_reclaim); lock(sb_internal); *** DEADLOCK *** 3 locks held by kswapd0/711: #0: 000000000ba86300 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x0/0x30 #1: 000000004a5100f8 (shrinker_rwsem){++++}, at: shrink_node+0x9a/0x380 #2: 00000000f956fa46 (&type->s_umount_key#30){++++}, at: super_cache_scan+0x35/0x1d0 stack backtrace: CPU: 7 PID: 711 Comm: kswapd0 Not tainted 5.2.11-gentoo #2 Hardware name: Dell Inc. Precision Tower 3620/0MWYPT, BIOS 2.4.2 09/29/2017 Call Trace: dump_stack+0x85/0xc7 print_circular_bug.cold.40+0x1d9/0x235 __lock_acquire+0x18b1/0x1f00 lock_acquire+0xa6/0x170 ? start_transaction+0x3a8/0x500 __sb_start_write+0xf6/0x150 ? start_transaction+0x3a8/0x500 start_transaction+0x3a8/0x500 btrfs_commit_inode_delayed_inode+0x59/0x110 btrfs_evict_inode+0x19e/0x4c0 ? var_wake_function+0x20/0x20 evict+0xbc/0x1f0 inode_lru_isolate+0x113/0x190 ? discard_new_inode+0xc0/0xc0 __list_lru_walk_one.isra.4+0x5c/0x100 ? discard_new_inode+0xc0/0xc0 list_lru_walk_one+0x32/0x50 prune_icache_sb+0x36/0x80 super_cache_scan+0x14a/0x1d0 do_shrink_slab+0x131/0x320 shrink_node+0xf7/0x380 balance_pgdat+0x2d5/0x640 kswapd+0x2ba/0x5e0 ? __wake_up_common_lock+0x90/0x90 kthread+0x147/0x160 ? balance_pgdat+0x640/0x640 ? __kthread_create_on_node+0x160/0x160 ret_from_fork+0x24/0x30 This is because btrfs_new_inode() calls new_inode() under the transaction. We could probably move the new_inode() outside of this but for now just wrap it in memalloc_nofs_save(). Reported-by: Zdenek Sojka Fixes: 712e36c5f2a7 ("btrfs: use GFP_KERNEL in btrfs_alloc_inode") CC: stable@vger.kernel.org # 4.16+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0546401bc0a..0f2754eaa05b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6305,13 +6305,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 33ea5aaa87cdae0f9af4d6b7ee4f650a1a36fd1d Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 26 Sep 2019 14:29:38 +0800 Subject: nfs: Fix nfsi->nrequests count error on nfs_inode_remove_request When xfstests testing, there are some WARNING as below: WARNING: CPU: 0 PID: 6235 at fs/nfs/inode.c:122 nfs_clear_inode+0x9c/0xd8 Modules linked in: CPU: 0 PID: 6235 Comm: umount.nfs Hardware name: linux,dummy-virt (DT) pstate: 60000005 (nZCv daif -PAN -UAO) pc : nfs_clear_inode+0x9c/0xd8 lr : nfs_evict_inode+0x60/0x78 sp : fffffc000f68fc00 x29: fffffc000f68fc00 x28: fffffe00c53155c0 x27: fffffe00c5315000 x26: fffffc0009a63748 x25: fffffc000f68fd18 x24: fffffc000bfaaf40 x23: fffffc000936d3c0 x22: fffffe00c4ff5e20 x21: fffffc000bfaaf40 x20: fffffe00c4ff5d10 x19: fffffc000c056000 x18: 000000000000003c x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000040 x14: 0000000000000228 x13: fffffc000c3a2000 x12: 0000000000000045 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000000 x6 : fffffc00084b027c x5 : fffffc0009a64000 x4 : fffffe00c0e77400 x3 : fffffc000c0563a8 x2 : fffffffffffffffb x1 : 000000000000764e x0 : 0000000000000001 Call trace: nfs_clear_inode+0x9c/0xd8 nfs_evict_inode+0x60/0x78 evict+0x108/0x380 dispose_list+0x70/0xa0 evict_inodes+0x194/0x210 generic_shutdown_super+0xb0/0x220 nfs_kill_super+0x40/0x88 deactivate_locked_super+0xb4/0x120 deactivate_super+0x144/0x160 cleanup_mnt+0x98/0x148 __cleanup_mnt+0x38/0x50 task_work_run+0x114/0x160 do_notify_resume+0x2f8/0x308 work_pending+0x8/0x14 The nrequest should be increased/decreased only if PG_INODE_REF flag was setted. But in the nfs_inode_remove_request function, it maybe decrease when no PG_INODE_REF flag, this maybe lead nrequests count error. Reported-by: Hulk Robot Signed-off-by: ZhangXiaoxu Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 85ca49549b39..52cab65f91cf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; - atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; @@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_unlock(&mapping->private_lock); } - if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { nfs_release_request(req); + atomic_long_dec(&nfsi->nrequests); + } } static void -- cgit v1.2.3 From c5f4987e86f6692fdb12533ea1fc7a7bb98e555a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Oct 2019 10:03:36 -0400 Subject: btrfs: fix uninitialized ret in ref-verify Coverity caught a case where we could return with a uninitialized value in ret in process_leaf. This is actually pretty likely because we could very easily run into a block group item key and have a garbage value in ret and think there was an errror. Fix this by initializing ret to 0. Reported-by: Colin Ian King Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e87cbdad02a3..b57f3618e58e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret; + int i = 0, tree_block_level = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); -- cgit v1.2.3 From bf7ec93c644cb0064ba7d2fc40d4841c5ba382ab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Oct 2019 17:01:08 +0300 Subject: io_uring: fix reversed nonblock flag for link submission io_queue_link_head() accepts @force_nonblock flag, but io_ring_submit() passes something opposite. Fixes: c576666863b78 ("io_uring: optimize submit_and_wait API") Reported-by: kbuild test robot Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0bc167aca46d..ab8c4e5e442c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2761,7 +2761,7 @@ out: if (link) io_queue_link_head(ctx, link, &link->submit, shadow_req, - block_for_last); + !block_for_last); if (statep) io_submit_state_end(statep); -- cgit v1.2.3 From 9f79b78ef74436c7507bac6bfb7b8b989263bccb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 May 2016 21:59:07 -0700 Subject: Convert filldir[64]() from __put_user() to unsafe_put_user() We really should avoid the "__{get,put}_user()" functions entirely, because they can easily be mis-used and the original intent of being used for simple direct user accesses no longer holds in a post-SMAP/PAN world. Manually optimizing away the user access range check makes no sense any more, when the range check is generally much cheaper than the "enable user accesses" code that the __{get,put}_user() functions still need. So instead of __put_user(), use the unsafe_put_user() interface with user_access_{begin,end}() that really does generate better code these days, and which is generally a nicer interface. Under some loads, the multiple user writes that filldir() does are actually quite noticeable. This also makes the dirent name copy use unsafe_put_user() with a couple of macros. We do not want to make function calls with SMAP/PAN disabled, and the code this generates is quite good when the architecture uses "asm goto" for unsafe_put_user() like x86 does. Note that this doesn't bother with the legacy cases. Nobody should use them anyway, so performance doesn't really matter there. Signed-off-by: Linus Torvalds --- fs/readdir.c | 128 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/readdir.c b/fs/readdir.c index 2f6a4534e0df..579c8ea894ae 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,9 +20,63 @@ #include #include #include - #include +#include + +/* + * Note the "unsafe_put_user() semantics: we goto a + * label for errors. + * + * Also note how we use a "while()" loop here, even though + * only the biggest size needs to loop. The compiler (well, + * at least gcc) is smart enough to turn the smaller sizes + * into just if-statements, and this way we don't need to + * care whether 'u64' or 'u32' is the biggest size. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(get_unaligned((type *)src), \ + (type __user *)dst, label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +/* + * We avoid doing 64-bit copies on 32-bit architectures. They + * might be better, but the component names are mostly small, + * and the 64-bit cases can end up being much more complex and + * put much more register pressure on the code, so it's likely + * not worth the pain of unaligned accesses etc. + * + * So limit the copies to "unsigned long" size. I did verify + * that at least the x86-32 case is ok without this limiting, + * but I worry about random other legacy 32-bit cases that + * might not do as well. + */ +#define unsafe_copy_type(dst, src, len, type, label) do { \ + if (sizeof(type) <= sizeof(unsigned long)) \ + unsafe_copy_loop(dst, src, len, type, label); \ +} while (0) + +/* + * Copy the dirent name to user space, and NUL-terminate + * it. This should not be a function call, since we're doing + * the copy inside a "user_access_begin/end()" section. + */ +#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ + char __user *dst = (_dst); \ + const char *src = (_src); \ + size_t len = (_len); \ + unsafe_copy_type(dst, src, len, u64, label); \ + unsafe_copy_type(dst, src, len, u32, label); \ + unsafe_copy_type(dst, src, len, u16, label); \ + unsafe_copy_type(dst, src, len, u8, label); \ + unsafe_put_user(0, dst, label); \ +} while (0) + + int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); @@ -182,28 +236,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EOVERFLOW; } dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -263,30 +320,31 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) - goto efault; - if (__put_user(0, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, &dirent->d_type, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; -- cgit v1.2.3 From 8a23eb804ca4f2be909e372cf5a9e7b30ae476cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Oct 2019 11:32:52 -0700 Subject: Make filldir[64]() verify the directory entry filename is valid This has been discussed several times, and now filesystem people are talking about doing it individually at the filesystem layer, so head that off at the pass and just do it in getdents{64}(). This is partially based on a patch by Jann Horn, but checks for NUL bytes as well, and somewhat simplified. There's also commentary about how it might be better if invalid names due to filesystem corruption don't cause an immediate failure, but only an error at the end of the readdir(), so that people can still see the filenames that are ok. There's also been discussion about just how much POSIX strictly speaking requires this since it's about filesystem corruption. It's really more "protect user space from bad behavior" as pointed out by Jann. But since Eric Biederman looked up the POSIX wording, here it is for context: "From readdir: The readdir() function shall return a pointer to a structure representing the directory entry at the current position in the directory stream specified by the argument dirp, and position the directory stream at the next entry. It shall return a null pointer upon reaching the end of the directory stream. The structure dirent defined in the header describes a directory entry. From definitions: 3.129 Directory Entry (or Link) An object that associates a filename with a file. Several directory entries can associate names with the same file. ... 3.169 Filename A name consisting of 1 to {NAME_MAX} bytes used to name a file. The characters composing the name may be selected from the set of all character values excluding the slash character and the null byte. The filenames dot and dot-dot have special meaning. A filename is sometimes referred to as a 'pathname component'." Note that I didn't bother adding the checks to any legacy interfaces that nobody uses. Also note that if this ends up being noticeable as a performance regression, we can fix that to do a much more optimized model that checks for both NUL and '/' at the same time one word at a time. We haven't really tended to optimize 'memchr()', and it only checks for one pattern at a time anyway, and we really _should_ check for NUL too (but see the comment about "soft errors" in the code about why it currently only checks for '/') See the CONFIG_DCACHE_WORD_ACCESS case of hash_name() for how the name lookup code looks for pathname terminating characters in parallel. Link: https://lore.kernel.org/lkml/20190118161440.220134-2-jannh@google.com/ Cc: Alexander Viro Cc: Jann Horn Cc: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/readdir.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'fs') diff --git a/fs/readdir.c b/fs/readdir.c index 579c8ea894ae..19bea591c3f1 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -118,6 +118,40 @@ out: } EXPORT_SYMBOL(iterate_dir); +/* + * POSIX says that a dirent name cannot contain NULL or a '/'. + * + * It's not 100% clear what we should really do in this case. + * The filesystem is clearly corrupted, but returning a hard + * error means that you now don't see any of the other names + * either, so that isn't a perfect alternative. + * + * And if you return an error, what error do you use? Several + * filesystems seem to have decided on EUCLEAN being the error + * code for EFSCORRUPTED, and that may be the error to use. Or + * just EIO, which is perhaps more obvious to users. + * + * In order to see the other file names in the directory, the + * caller might want to make this a "soft" error: skip the + * entry, and return the error at the end instead. + * + * Note that this should likely do a "memchr(name, 0, len)" + * check too, since that would be filesystem corruption as + * well. However, that case can't actually confuse user space, + * which has to do a strlen() on the name anyway to find the + * filename length, and the above "soft error" worry means + * that it's probably better left alone until we have that + * issue clarified. + */ +static int verify_dirent_name(const char *name, int len) +{ + if (WARN_ON_ONCE(!len)) + return -EIO; + if (WARN_ON_ONCE(memchr(name, '/', len))) + return -EIO; + return 0; +} + /* * Traditional linux readdir() handling.. * @@ -227,6 +261,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -316,6 +353,9 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; -- cgit v1.2.3 From b212921b13bda088a004328457c5c21458262fe2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Oct 2019 13:53:27 -0700 Subject: elf: don't use MAP_FIXED_NOREPLACE for elf executable mappings In commit 4ed28639519c ("fs, elf: drop MAP_FIXED usage from elf_map") we changed elf to use MAP_FIXED_NOREPLACE instead of MAP_FIXED for the executable mappings. Then, people reported that it broke some binaries that had overlapping segments from the same file, and commit ad55eac74f20 ("elf: enforce MAP_FIXED on overlaying elf segments") re-instated MAP_FIXED for some overlaying elf segment cases. But only some - despite the summary line of that commit, it only did it when it also does a temporary brk vma for one obvious overlapping case. Now Russell King reports another overlapping case with old 32-bit x86 binaries, which doesn't trigger that limited case. End result: we had better just drop MAP_FIXED_NOREPLACE entirely, and go back to MAP_FIXED. Yes, it's a sign of old binaries generated with old tool-chains, but we do pride ourselves on not breaking existing setups. This still leaves MAP_FIXED_NOREPLACE in place for the load_elf_interp() and the old load_elf_library() use-cases, because nobody has reported breakage for those. Yet. Note that in all the cases seen so far, the overlapping elf sections seem to be just re-mapping of the same executable with different section attributes. We could possibly introduce a new MAP_FIXED_NOFILECHANGE flag or similar, which acts like NOREPLACE, but allows just remapping the same executable file using different protection flags. It's not clear that would make a huge difference to anything, but if people really hate that "elf remaps over previous maps" behavior, maybe at least a more limited form of remapping would alleviate some concerns. Alternatively, we should take a look at our elf_map() logic to see if we end up not mapping things properly the first time. In the meantime, this is the minimal "don't do that then" patch while people hopefully think about it more. Reported-by: Russell King Fixes: 4ed28639519c ("fs, elf: drop MAP_FIXED usage from elf_map") Fixes: ad55eac74f20 ("elf: enforce MAP_FIXED on overlaying elf segments") Cc: Michal Hocko Cc: Kees Cook Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ad4c6b1d5074..c5642bcb6b46 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -879,7 +879,7 @@ out_free_interp: the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -911,13 +911,6 @@ out_free_interp: */ } } - - /* - * Some binaries have overlapping elf segments and then - * we have to forcefully map over an existing mapping - * e.g. over this newly established brk mapping. - */ - elf_fixed = MAP_FIXED; } elf_prot = make_prot(elf_ppnt->p_flags); @@ -930,7 +923,7 @@ out_free_interp: * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -966,7 +959,7 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else load_bias = 0; -- cgit v1.2.3 From e093c4be760ebf46c131ae0dd6138865a22f46fa Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 30 Sep 2019 11:29:44 -0700 Subject: xfs: Fix tail rounding in xfs_alloc_file_space() To ensure that all blocks touched by the range [offset, offset + count) are allocated, we need to calculate the block count from the difference of the range end (rounded up) and the range start (rounded down). Before this patch, we just round up the byte count, which may lead to unaligned ranges not being fully allocated: $ touch test_file $ block_size=$(stat -fc '%S' test_file) $ fallocate -o $((block_size / 2)) -l $block_size test_file $ xfs_bmap test_file test_file: 0: [0..7]: 1396264..1396271 1: [8..15]: hole There should not be a hole there. Instead, the first two blocks should be fully allocated. With this patch applied, the result is something like this: $ touch test_file $ block_size=$(stat -fc '%S' test_file) $ fallocate -o $((block_size / 2)) -l $block_size test_file $ xfs_bmap test_file test_file: 0: [0..15]: 11024..11039 Signed-off-by: Max Reitz Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0910cb75b65d..4f443703065e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -864,6 +864,7 @@ xfs_alloc_file_space( xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; + xfs_fileoff_t endoffset_fsb; int nimaps; int quota_flag; int rt; @@ -891,7 +892,8 @@ xfs_alloc_file_space( imapp = &imaps[0]; nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); + endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); + allocatesize_fsb = endoffset_fsb - startoffset_fsb; /* * Allocate file space until done or until there is an error -- cgit v1.2.3 From 6374ca03975ab0a2b1a5ced222e0ef2ea6e22f9e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 30 Sep 2019 11:31:12 -0700 Subject: xfs: remove unused flags arg from xfs_get_aghdr_buf() The flags arg is always passed as zero, so remove it. (xfs_buf_get_uncached takes flags to support XBF_NO_IOACCT for the sb, but that should never be relevant for xfs_get_aghdr_buf) Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5de296b34ab1..14fbdf22b7e7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -28,12 +28,11 @@ xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0); if (!bp) return NULL; @@ -345,7 +344,7 @@ xfs_ag_init_hdr( { struct xfs_buf *bp; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops); if (!bp) return -ENOMEM; -- cgit v1.2.3 From d5cc14d9f92833bd71219bf7fff180f097c3816d Mon Sep 17 00:00:00 2001 From: Aliasgar Surti Date: Mon, 30 Sep 2019 11:31:48 -0700 Subject: xfs: removed unused error variable from xchk_refcountbt_rec Removed unused error variable. Instead of using error variable, returned the value directly as it wasn't updated. Signed-off-by: Aliasgar Surti Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/refcount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 93b3793bc5b3..0cab11a5d390 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -341,7 +341,6 @@ xchk_refcountbt_rec( xfs_extlen_t len; xfs_nlink_t refcount; bool has_cowflag; - int error = 0; bno = be32_to_cpu(rec->refc.rc_startblock); len = be32_to_cpu(rec->refc.rc_blockcount); @@ -366,7 +365,7 @@ xchk_refcountbt_rec( xchk_refcountbt_xref(bs->sc, bno, len, refcount); - return error; + return 0; } /* Make sure we have as many refc blocks as the rmap says. */ -- cgit v1.2.3 From 3219e8cf0dade9884d3c6cb432d433b4ca56875d Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Fri, 4 Oct 2019 16:38:44 -0700 Subject: xfs: assure zeroed memory buffers for certain kmem allocations Guarantee zeroed memory buffers for cases where potential memory leak to disk can occur. In these cases, kmem_alloc is used and doesn't zero the buffer, opening the possibility of information leakage to disk. Use existing infrastucture (xfs_buf_allocate_memory) to obtain the already zeroed buffer from kernel memory. This solution avoids the performance issue that would occur if a wholesale change to replace kmem_alloc with kmem_zalloc was done. Signed-off-by: Bill O'Donnell [darrick: fix bitwise complaint about kmflag_mask] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 12 +++++++++++- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 21c243622a79..0abba171aa89 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -345,6 +345,15 @@ xfs_buf_allocate_memory( unsigned short page_count, i; xfs_off_t start, end; int error; + xfs_km_flags_t kmflag_mask = 0; + + /* + * assure zeroed buffer for non-read cases. + */ + if (!(flags & XBF_READ)) { + kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; + } /* * for buffers that are contained within a single page, just allocate @@ -354,7 +363,8 @@ xfs_buf_allocate_memory( size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); + bp->b_addr = kmem_alloc_io(size, align_mask, + KM_NOFS | kmflag_mask); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a2beee9f74da..641d07f30a27 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1443,7 +1443,7 @@ xlog_alloc_log( prev_iclog = iclog; iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL); + KM_MAYFAIL | KM_ZERO); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 508319039dce..c1a514ffff55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -127,7 +127,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); } /* -- cgit v1.2.3 From 52870d5048919004b6629d1a2f2b0a99c23b9960 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 1 Oct 2019 21:25:46 -0500 Subject: smb3: cleanup some recent endian errors spotted by updated sparse Now that sparse has been fixed, it spotted a couple recent minor endian errors (and removed one additional sparse warning). Thanks to Luc Van Oostenryck for his help fixing sparse. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/connect.c | 2 +- fs/cifs/smb2pdu.c | 3 ++- fs/cifs/smb2proto.h | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2850c3ce4391..b724227b853c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4264,7 +4264,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, server->ops->qfs_tcon(*xid, tcon); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { if (tcon->fsDevInfo.DeviceCharacteristics & - FILE_READ_ONLY_DEVICE) + cpu_to_le32(FILE_READ_ONLY_DEVICE)) cifs_dbg(VFS, "mounted to read only share\n"); else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) == 0) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 85f9d614d968..fd5f2b4b7582 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3217,7 +3217,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; - req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE; + req->OutputBufferLength = + cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); req->CompletionFilter = cpu_to_le32(completion_filter); if (watch_tree) req->Flags = cpu_to_le16(SMB2_WATCH_TREE); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index da3a6d580808..71b2930b8e0b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -150,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, bool is_fsctl, char *in_data, u32 indatalen, __u32 max_response_size); extern void SMB2_ioctl_free(struct smb_rqst *rqst); +extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, bool watch_tree, + u32 completion_filter); + extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, -- cgit v1.2.3 From dd19c106a36690b47bb1acc68372f2b472b495b8 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 1 Oct 2019 16:34:13 +0900 Subject: fs: cifs: mute -Wunused-const-variable message After 'Initial git repository build' commit, 'mapping_table_ERRHRD' variable has not been used. So 'mapping_table_ERRHRD' const variable could be removed to mute below warning message: fs/cifs/netmisc.c:120:40: warning: unused variable 'mapping_table_ERRHRD' [-Wunused-const-variable] static const struct smb_to_posix_error mapping_table_ERRHRD[] = { ^ Signed-off-by: Austin Kim Signed-off-by: Steve French --- fs/cifs/netmisc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 49c17ee18254..9b41436fb8db 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {0, 0} }; -static const struct smb_to_posix_error mapping_table_ERRHRD[] = { - {0, 0} -}; - /* * Convert a string containing text IPv4 or IPv6 address to binary form. * -- cgit v1.2.3 From cb248819d209d113e45fed459773991518e8e80b Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 3 Oct 2019 15:16:27 +1000 Subject: cifs: use cifsInodeInfo->open_file_lock while iterating to avoid a panic Commit 487317c99477 ("cifs: add spinlock for the openFileList to cifsInodeInfo") added cifsInodeInfo->open_file_lock spin_lock to protect the openFileList, but missed a few places where cifs_inode->openFileList was enumerated. Change these remaining tcon->open_file_lock to cifsInodeInfo->open_file_lock to avoid panic in is_size_safe_to_change. [17313.245641] RIP: 0010:is_size_safe_to_change+0x57/0xb0 [cifs] [17313.245645] Code: 68 40 48 89 ef e8 19 67 b7 f1 48 8b 43 40 48 8d 4b 40 48 8d 50 f0 48 39 c1 75 0f eb 47 48 8b 42 10 48 8d 50 f0 48 39 c1 74 3a <8b> 80 88 00 00 00 83 c0 01 a8 02 74 e6 48 89 ef c6 07 00 0f 1f 40 [17313.245649] RSP: 0018:ffff94ae1baefa30 EFLAGS: 00010202 [17313.245654] RAX: dead000000000100 RBX: ffff88dc72243300 RCX: ffff88dc72243340 [17313.245657] RDX: dead0000000000f0 RSI: 00000000098f7940 RDI: ffff88dd3102f040 [17313.245659] RBP: ffff88dd3102f040 R08: 0000000000000000 R09: ffff94ae1baefc40 [17313.245661] R10: ffffcdc8bb1c4e80 R11: ffffcdc8b50adb08 R12: 00000000098f7940 [17313.245663] R13: ffff88dc72243300 R14: ffff88dbc8f19600 R15: ffff88dc72243428 [17313.245667] FS: 00007fb145485700(0000) GS:ffff88dd3e000000(0000) knlGS:0000000000000000 [17313.245670] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [17313.245672] CR2: 0000026bb46c6000 CR3: 0000004edb110003 CR4: 00000000007606e0 [17313.245753] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [17313.245756] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [17313.245759] PKRU: 55555554 [17313.245761] Call Trace: [17313.245803] cifs_fattr_to_inode+0x16b/0x580 [cifs] [17313.245838] cifs_get_inode_info+0x35c/0xa60 [cifs] [17313.245852] ? kmem_cache_alloc_trace+0x151/0x1d0 [17313.245885] cifs_open+0x38f/0x990 [cifs] [17313.245921] ? cifs_revalidate_dentry_attr+0x3e/0x350 [cifs] [17313.245953] ? cifsFileInfo_get+0x30/0x30 [cifs] [17313.245960] ? do_dentry_open+0x132/0x330 [17313.245963] do_dentry_open+0x132/0x330 [17313.245969] path_openat+0x573/0x14d0 [17313.245974] do_filp_open+0x93/0x100 [17313.245979] ? __check_object_size+0xa3/0x181 [17313.245986] ? audit_alloc_name+0x7e/0xd0 [17313.245992] do_sys_open+0x184/0x220 [17313.245999] do_syscall_64+0x5b/0x1b0 Fixes: 487317c99477 ("cifs: add spinlock for the openFileList to cifsInodeInfo") CC: Stable Signed-off-by: Dave Wysochanski Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/file.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4b95700c507c..3758237bf951 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1840,13 +1840,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1858,7 +1857,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1866,7 +1865,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return NULL; } @@ -1877,7 +1876,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; bool any_available = false; int rc = -EBADF; unsigned int refind = 0; @@ -1897,16 +1895,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1918,7 +1915,7 @@ refind_writable: if (!open_file->invalidHandle) { /* found a good writable file */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); *ret_file = open_file; return 0; } else { @@ -1938,7 +1935,7 @@ refind_writable: cifsFileInfo_get(inv_file); } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); @@ -1953,7 +1950,7 @@ refind_writable: cifsFileInfo_put(inv_file); ++refind; inv_file = NULL; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); goto refind_writable; } @@ -4461,17 +4458,15 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; - struct cifs_tcon *tcon = - cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 1; } } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 0; } -- cgit v1.2.3 From 30573a82fb179420b8aac30a3a3595aa96a93156 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 30 Sep 2019 10:06:18 -0700 Subject: CIFS: Gracefully handle QueryInfo errors during open Currently if the client identifies problems when processing metadata returned in CREATE response, the open handle is being leaked. This causes multiple problems like a file missing a lease break by that client which causes high latencies to other clients accessing the file. Another side-effect of this is that the file can't be deleted. Fix this by closing the file after the client hits an error after the file was opened and the open descriptor wasn't returned to the user space. Also convert -ESTALE to -EOPENSTALE to allow the VFS to revalidate a dentry and retry the open. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3758237bf951..5ad15de2bb4f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, fid); + if (rc) { + server->ops->close(xid, tcon, fid); + if (rc == -ESTALE) + rc = -EOPENSTALE; + } + out: kfree(buf); return rc; -- cgit v1.2.3 From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Oct 2019 12:56:48 -0700 Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") I made filldir() use unsafe_put_user(), which improves code generation on x86 enormously. But because we didn't have a "unsafe_copy_to_user()", the dirent name copy was also done by hand with unsafe_put_user() in a loop, and it turns out that a lot of other architectures didn't like that, because unlike x86, they have various alignment issues. Most non-x86 architectures trap and fix it up, and some (like xtensa) will just fail unaligned put_user() accesses unconditionally. Which makes that "copy using put_user() in a loop" not work for them at all. I could make that code do explicit alignment etc, but the architectures that don't like unaligned accesses also don't really use the fancy "user_access_begin/end()" model, so they might just use the regular old __copy_to_user() interface. So this commit takes that looping implementation, turns it into the x86 version of "unsafe_copy_to_user()", and makes other architectures implement the unsafe copy version as __copy_to_user() (the same way they do for the other unsafe_xyz() accessor functions). Note that it only does this for the copying _to_ user space, and we still don't have a unsafe version of copy_from_user(). That's partly because we have no current users of it, but also partly because the copy_from_user() case is slightly different and cannot efficiently be implemented in terms of a unsafe_get_user() loop (because gcc can't do asm goto with outputs). It would be trivial to do this using "rep movsb", which would work really nicely on newer x86 cores, but really badly on some older ones. Al Viro is looking at cleaning up all our user copy routines to make this all a non-issue, but for now we have this simple-but-stupid version for x86 that works fine for the dirent name copy case because those names are short strings and we simply don't need anything fancier. Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") Reported-by: Guenter Roeck Reported-and-tested-by: Tony Luck Cc: Al Viro Cc: Max Filippov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 23 ++++++++++++++++++++++ fs/readdir.c | 44 ++---------------------------------------- include/linux/uaccess.h | 6 ++++-- 3 files changed, 29 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 19bea591c3f1..6e2623e57b2e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -27,53 +27,13 @@ /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. - * - * Also note how we use a "while()" loop here, even though - * only the biggest size needs to loop. The compiler (well, - * at least gcc) is smart enough to turn the smaller sizes - * into just if-statements, and this way we don't need to - * care whether 'u64' or 'u32' is the biggest size. - */ -#define unsafe_copy_loop(dst, src, len, type, label) \ - while (len >= sizeof(type)) { \ - unsafe_put_user(get_unaligned((type *)src), \ - (type __user *)dst, label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ - } - -/* - * We avoid doing 64-bit copies on 32-bit architectures. They - * might be better, but the component names are mostly small, - * and the 64-bit cases can end up being much more complex and - * put much more register pressure on the code, so it's likely - * not worth the pain of unaligned accesses etc. - * - * So limit the copies to "unsigned long" size. I did verify - * that at least the x86-32 case is ok without this limiting, - * but I worry about random other legacy 32-bit cases that - * might not do as well. - */ -#define unsafe_copy_type(dst, src, len, type, label) do { \ - if (sizeof(type) <= sizeof(unsigned long)) \ - unsafe_copy_loop(dst, src, len, type, label); \ -} while (0) - -/* - * Copy the dirent name to user space, and NUL-terminate - * it. This should not be a function call, since we're doing - * the copy inside a "user_access_begin/end()" section. */ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ - unsafe_copy_type(dst, src, len, u64, label); \ - unsafe_copy_type(dst, src, len, u32, label); \ - unsafe_copy_type(dst, src, len, u16, label); \ - unsafe_copy_type(dst, src, len, u8, label); \ - unsafe_put_user(0, dst, label); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ } while (0) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e47d0522a1f4..d4ee6e942562 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) -#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) +#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) +#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif -- cgit v1.2.3 From 7a243c82ea527cd1da47381ad9cd646844f3b693 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Sun, 6 Oct 2019 17:57:47 -0700 Subject: ocfs2: clear zero in unaligned direct IO Unused portion of a part-written fs-block-sized block is not set to zero in unaligned append direct write.This can lead to serious data inconsistencies. Ocfs2 manage disk with cluster size(for example, 1M), part-written in one cluster will change the cluster state from UN-WRITTEN to WRITTEN, VFS(function dio_zero_block) doesn't do the cleaning because bh's state is not set to NEW in function ocfs2_dio_wr_get_block when we write a WRITTEN cluster. For example, the cluster size is 1M, file size is 8k and we direct write from 14k to 15k, then 12k~14k and 15k~16k will contain dirty data. We have to deal with two cases: 1.The starting position of direct write is outside the file. 2.The starting position of direct write is located in the file. We need set bh's state to NEW in the first case. In the second case, we need mapped twice because bh's state of area out file should be set to NEW while area in file not. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/5292e287-8f1a-fd4a-1a14-661e555e0bed@huawei.com Signed-off-by: Jia Guo Reviewed-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8de1c9d644f6..50d56d9a0475 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2146,13 +2146,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct ocfs2_dio_write_ctxt *dwc = NULL; struct buffer_head *di_bh = NULL; u64 p_blkno; - loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned int i_blkbits = inode->i_sb->s_blocksize_bits; + loff_t pos = iblock << i_blkbits; + sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits; unsigned len, total_len = bh_result->b_size; int ret = 0, first_get_block = 0; len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); len = min(total_len, len); + /* + * bh_result->b_size is count in get_more_blocks according to write + * "pos" and "end", we need map twice to return different buffer state: + * 1. area in file size, not set NEW; + * 2. area out file size, set NEW. + * + * iblock endblk + * |--------|---------|---------|--------- + * |<-------area in file------->| + */ + + if ((iblock <= endblk) && + ((iblock + ((len - 1) >> i_blkbits)) > endblk)) + len = (endblk - iblock + 1) << i_blkbits; + mlog(0, "get block of %lu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); @@ -2236,6 +2253,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, if (desc->c_needs_zero) set_buffer_new(bh_result); + if (iblock > endblk) + set_buffer_new(bh_result); + /* May sleep in end_io. It should not happen in a irq context. So defer * it to dio work queue. */ set_buffer_defer_completion(bh_result); -- cgit v1.2.3 From 56e94ea132bb5c2c1d0b60a6aeb34dcb7d71a53d Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 6 Oct 2019 17:57:50 -0700 Subject: fs: ocfs2: fix possible null-pointer dereferences in ocfs2_xa_prepare_entry() In ocfs2_xa_prepare_entry(), there is an if statement on line 2136 to check whether loc->xl_entry is NULL: if (loc->xl_entry) When loc->xl_entry is NULL, it is used on line 2158: ocfs2_xa_add_entry(loc, name_hash); loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); and line 2164: ocfs2_xa_add_namevalue(loc, xi); loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); loc->xl_entry->xe_name_len = xi->xi_name_len; Thus, possible null-pointer dereferences may occur. To fix these bugs, if loc-xl_entry is NULL, ocfs2_xa_prepare_entry() abnormally returns with -EINVAL. These bugs are found by a static analysis tool STCheck written by us. [akpm@linux-foundation.org: remove now-unused ocfs2_xa_add_entry()] Link: http://lkml.kernel.org/r/20190726101447.9153-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 56 +++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 90c830e3758e..d8507972ee13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, return loc->xl_ops->xlo_check_space(loc, xi); } -static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) -{ - loc->xl_ops->xlo_add_entry(loc, name_hash); - loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); - /* - * We can't leave the new entry's xe_name_offset at zero or - * add_namevalue() will go nuts. We set it to the size of our - * storage so that it can never be less than any other entry. - */ - loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); -} - static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_info *xi) { @@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (rc) goto out; - if (loc->xl_entry) { - if (ocfs2_xa_can_reuse_entry(loc, xi)) { - orig_value_size = loc->xl_entry->xe_value_size; - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); - if (rc) - goto out; - goto alloc_value; - } + if (!loc->xl_entry) { + rc = -EINVAL; + goto out; + } - if (!ocfs2_xattr_is_local(loc->xl_entry)) { - orig_clusters = ocfs2_xa_value_clusters(loc); - rc = ocfs2_xa_value_truncate(loc, 0, ctxt); - if (rc) { - mlog_errno(rc); - ocfs2_xa_cleanup_value_truncate(loc, - "overwriting", - orig_clusters); - goto out; - } + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; } - ocfs2_xa_wipe_namevalue(loc); - } else - ocfs2_xa_add_entry(loc, name_hash); + } + ocfs2_xa_wipe_namevalue(loc); /* * If we get here, we have a blank entry. Fill it. We grow our -- cgit v1.2.3 From 583fee3e12df0e6f1f66f063b989d8e7fed0e65a Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 6 Oct 2019 17:57:54 -0700 Subject: fs: ocfs2: fix a possible null-pointer dereference in ocfs2_write_end_nolock() In ocfs2_write_end_nolock(), there are an if statement on lines 1976, 2047 and 2058, to check whether handle is NULL: if (handle) When handle is NULL, it is used on line 2045: ocfs2_update_inode_fsync_trans(handle, inode, 1); oi->i_sync_tid = handle->h_transaction->t_tid; Thus, a possible null-pointer dereference may occur. To fix this bug, handle is checked before calling ocfs2_update_inode_fsync_trans(). This bug is found by a static analysis tool STCheck written by us. Link: http://lkml.kernel.org/r/20190726033705.32307-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 50d56d9a0475..9cd0a6815933 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2049,7 +2049,8 @@ out_write_size: inode->i_mtime = inode->i_ctime = current_time(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_update_inode_fsync_trans(handle, inode, 1); + if (handle) + ocfs2_update_inode_fsync_trans(handle, inode, 1); } if (handle) ocfs2_journal_dirty(handle, wc->w_di_bh); -- cgit v1.2.3 From 2abb7d3b12d007c30193f48bebed781009bebdd2 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 6 Oct 2019 17:57:57 -0700 Subject: fs: ocfs2: fix a possible null-pointer dereference in ocfs2_info_scan_inode_alloc() In ocfs2_info_scan_inode_alloc(), there is an if statement on line 283 to check whether inode_alloc is NULL: if (inode_alloc) When inode_alloc is NULL, it is used on line 287: ocfs2_inode_lock(inode_alloc, &bh, 0); ocfs2_inode_lock_full_nested(inode, ...) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); Thus, a possible null-pointer dereference may occur. To fix this bug, inode_alloc is checked on line 286. This bug is found by a static analysis tool STCheck written by us. Link: http://lkml.kernel.org/r/20190726033717.32359-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d6f7b299eb23..efeea208fdeb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, if (inode_alloc) inode_lock(inode_alloc); - if (o2info_coherent(&fi->ifi_req)) { + if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); -- cgit v1.2.3 From 8e00c4e9dd852f7a9bf12234fad65a2f2f93788f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 6 Oct 2019 17:58:09 -0700 Subject: writeback: fix use-after-free in finish_writeback_work() finish_writeback_work() reads @done->waitq after decrementing @done->cnt. However, once @done->cnt reaches zero, @done may be freed (from stack) at any moment and @done->waitq can contain something unrelated by the time finish_writeback_work() tries to read it. This led to the following crash. "BUG: kernel NULL pointer dereference, address: 0000000000000002" #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC CPU: 40 PID: 555153 Comm: kworker/u98:50 Kdump: loaded Not tainted ... Workqueue: writeback wb_workfn (flush-btrfs-1) RIP: 0010:_raw_spin_lock_irqsave+0x10/0x30 Code: 48 89 d8 5b c3 e8 50 db 6b ff eb f4 0f 1f 40 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 53 9c 5b fa 31 c0 ba 01 00 00 00 0f b1 17 75 05 48 89 d8 5b c3 89 c6 e8 fe ca 6b ff eb f2 66 90 RSP: 0018:ffffc90049b27d98 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000246 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000003 RDI: 0000000000000002 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 R10: ffff889fff407600 R11: ffff88ba9395d740 R12: 000000000000e300 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88bfdfa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000002 CR3: 0000000002409005 CR4: 00000000001606e0 Call Trace: __wake_up_common_lock+0x63/0xc0 wb_workfn+0xd2/0x3e0 process_one_work+0x1f5/0x3f0 worker_thread+0x2d/0x3d0 kthread+0x111/0x130 ret_from_fork+0x1f/0x30 Fix it by reading and caching @done->waitq before decrementing @done->cnt. Link: http://lkml.kernel.org/r/20190924010631.GH2233839@devbig004.ftw2.facebook.com Fixes: 5b9cce4c7eb069 ("writeback: Generalize and expose wb_completion") Signed-off-by: Tejun Heo Debugged-by: Chris Mason Reviewed-by: Jens Axboe Cc: Jan Kara Cc: [5.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8aaa7eec7b74..e88421d9a48d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb, if (work->auto_free) kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(done->waitq); + if (done) { + wait_queue_head_t *waitq = done->waitq; + + /* @done can't be accessed after the following dec */ + if (atomic_dec_and_test(&done->cnt)) + wake_up_all(waitq); + } } static void wb_queue_work(struct bdi_writeback *wb, -- cgit v1.2.3 From 6805b32ec2b0897eb180295385efe306e5ac3b3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 8 Oct 2019 02:18:42 +0300 Subject: io_uring: remove wait loop spurious wakeups Any changes interesting to tasks waiting in io_cqring_wait() are commited with io_cqring_ev_posted(). However, io_ring_drop_ctx_refs() also tries to do that but with no reason, that means spurious wakeups every io_free_req() and io_uring_enter(). Just use percpu_ref_put() instead. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ab8c4e5e442c..ceb3497bdd2a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -591,14 +591,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, io_cqring_ev_posted(ctx); } -static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) -{ - percpu_ref_put_many(&ctx->refs, refs); - - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); -} - static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -646,7 +638,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; return req; out: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); return NULL; } @@ -654,7 +646,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) { if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); - io_ring_drop_ctx_refs(ctx, *nr); + percpu_ref_put_many(&ctx->refs, *nr); *nr = 0; } } @@ -663,7 +655,7 @@ static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - io_ring_drop_ctx_refs(req->ctx, 1); + percpu_ref_put(&req->ctx->refs); kmem_cache_free(req_cachep, req); } @@ -3584,7 +3576,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); out_fput: fdput(f); return submitted ? submitted : ret; -- cgit v1.2.3 From 431d39887d6273d6d84edf3c2eab09f4200e788a Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 3 Sep 2019 12:30:19 +0900 Subject: btrfs: silence maybe-uninitialized warning in clone_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC throws warning message as below: ‘clone_src_i_size’ may be used uninitialized in this function [-Wmaybe-uninitialized] #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) ^ fs/btrfs/send.c:5088:6: note: ‘clone_src_i_size’ was declared here u64 clone_src_i_size; ^ The clone_src_i_size is only used as call-by-reference in a call to get_inode_info(). Silence the warning by initializing clone_src_i_size to 0. Note that the warning is a false positive and reported by older versions of GCC (eg. 7.x) but not eg 9.x. As there have been numerous people, the patch is applied. Setting clone_src_i_size to 0 does not otherwise make sense and would not do any action in case the code changes in the future. Signed-off-by: Austin Kim Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f3215028235c..123ac54af071 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; - u64 clone_src_i_size; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector -- cgit v1.2.3 From d0959b080b1faa9741857ef607c90531b66e2fb8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 5 Oct 2019 10:53:58 -0500 Subject: smb3: remove noisy debug message and minor cleanup Message was intended only for developer temporary build In addition cleanup two minor warnings noticed by Coverity and a trivial change to workaround a sparse warning Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsglob.h | 2 +- fs/cifs/connect.c | 2 +- fs/cifs/smb2pdu.c | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2e960e1049db..50dfd9049370 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1210,7 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; -#define ACL_NO_MODE -1 +#define ACL_NO_MODE ((umode_t)(-1)) struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b724227b853c..a64dfa95a925 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4445,7 +4445,7 @@ static int setup_dfs_tgt_conn(const char *path, int rc; struct dfs_info3_param ref = {0}; char *mdata = NULL, *fake_devname = NULL; - struct smb_vol fake_vol = {0}; + struct smb_vol fake_vol = {NULL}; cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fd5f2b4b7582..05149862aea4 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,8 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); - if (mode == -1) - cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */ + if (mode == ACL_NO_MODE) + cifs_dbg(FYI, "illegal mode\n"); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2521,11 +2521,8 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } - /* TODO: add handling for the mode on create */ - if (oparms->disposition == FILE_CREATE) - cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */ - - if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) { + if ((oparms->disposition == FILE_CREATE) && + (oparms->mode != ACL_NO_MODE)) { if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; -- cgit v1.2.3 From d4cfbf04b2aafcb657eb8ea0835e5cf51d371448 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 8 Oct 2019 00:27:14 -0500 Subject: smb3: Fix regression in time handling Fixes: cb7a69e60590 ("cifs: Initialize filesystem timestamp ranges") Only very old servers (e.g. OS/2 and DOS) did not support DCE TIME (100 nanosecond granularity). Fix the checks used to set minimum and maximum times. Fixes xfstest generic/258 (on 5.4-rc1 and later) CC: Deepa Dinamani Acked-by: Jeff Layton Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsfs.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2e9c7f493f99..c049c7b3aa87 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -169,18 +169,26 @@ cifs_read_super(struct super_block *sb) else sb->s_maxbytes = MAX_NON_LFS; - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - - if (tcon->unix_ext) { - ts = cifs_NTtimeToUnix(0); + /* Some very old servers like DOS and OS/2 used 2 second granularity */ + if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) && + ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) && + !tcon->unix_ext) { + sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */ + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); sb->s_time_min = ts.tv_sec; - ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), + cpu_to_le16(SMB_TIME_MAX), 0); sb->s_time_max = ts.tv_sec; } else { - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); + /* + * Almost every server, including all SMB2+, uses DCE TIME + * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC + */ + sb->s_time_gran = 100; + ts = cifs_NTtimeToUnix(0); sb->s_time_min = ts.tv_sec; - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0); + ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); sb->s_time_max = ts.tv_sec; } -- cgit v1.2.3 From c82e5ac7fe3570a269c0929bf7899f62048e7dbc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 30 Sep 2019 10:06:19 -0700 Subject: CIFS: Force revalidate inode when dentry is stale Currently the client indicates that a dentry is stale when inode numbers or type types between a local inode and a remote file don't match. If this is the case attributes is not being copied from remote to local, so, it is already known that the local copy has stale metadata. That's why the inode needs to be marked for revalidation in order to tell the VFS to lookup the dentry again before openning a file. This prevents unexpected stale errors to be returned to the user space when openning a file. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3bae2e53f0b8..5dcc95b38310 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if filetype is different, return error */ if (unlikely(((*pinode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } -- cgit v1.2.3 From 0b3d0ef9840f7be202393ca9116b857f6f793715 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 30 Sep 2019 10:06:20 -0700 Subject: CIFS: Force reval dentry if LOOKUP_REVAL flag is set Mark inode for force revalidation if LOOKUP_REVAL flag is set. This tells the client to actually send a QueryInfo request to the server to obtain the latest metadata in case a directory or a file were changed remotely. Only do that if the client doesn't have a lease for the file to avoid unneeded round trips to the server. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index dd5ac841aefa..7ce689d31aa2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, static int cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { + struct inode *inode; + if (flags & LOOKUP_RCU) return -ECHILD; if (d_really_is_positive(direntry)) { + inode = d_inode(direntry); + if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) + CIFS_I(inode)->time = 0; /* force reval */ + if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(d_inode(direntry)) && + if (IS_AUTOMOUNT(inode) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; -- cgit v1.2.3 From 031d73ed768a40684f3ca21992265ffdb6a270bf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Sep 2019 14:02:56 -0400 Subject: NFS: Fix O_DIRECT accounting of number of bytes read/written When a series of O_DIRECT reads or writes are truncated, either due to eof or due to an error, then we should return the number of contiguous bytes that were received/sent starting at the offset specified by the application. Currently, we are failing to correctly check contiguity, and so we're failing the generic/465 in xfstests when the race between the read and write RPCs causes the file to get extended while the 2 reads are outstanding. If the first read RPC call wins the race and returns with eof set, we should treat the second read RPC as being truncated. Reported-by: Su Yanjun Fixes: 1ccbad9f9f9bd ("nfs: fix DIO good bytes calculation") Cc: stable@vger.kernel.org # 4.1+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d7115db71..98a9a0bcdf38 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -123,32 +123,49 @@ static inline int put_dreq(struct nfs_direct_req *dreq) } static void -nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +nfs_direct_handle_truncated(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr, + ssize_t dreq_len) { - int i; - ssize_t count; + struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; + + if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || + test_bit(NFS_IOHDR_EOF, &hdr->flags))) + return; + if (dreq->max_count >= dreq_len) { + dreq->max_count = dreq_len; + if (dreq->count > dreq_len) + dreq->count = dreq_len; + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) + dreq->error = hdr->error; + else /* Clear outstanding error if this is EOF */ + dreq->error = 0; + } + if (mirror->count > dreq_len) + mirror->count = dreq_len; +} - WARN_ON_ONCE(dreq->count >= dreq->max_count); +static void +nfs_direct_count_bytes(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr) +{ + struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; + loff_t hdr_end = hdr->io_start + hdr->good_bytes; + ssize_t dreq_len = 0; - if (dreq->mirror_count == 1) { - dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; - dreq->count += hdr->good_bytes; - } else { - /* mirrored writes */ - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (hdr_end > dreq->io_start) + dreq_len = hdr_end - dreq->io_start; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + nfs_direct_handle_truncated(dreq, hdr, dreq_len); - dreq->count = count; - } + if (dreq_len > dreq->max_count) + dreq_len = dreq->max_count; + + if (mirror->count < dreq_len) + mirror->count = dreq_len; + if (dreq->count < dreq_len) + dreq->count = dreq_len; } /* @@ -402,20 +419,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_direct_req *dreq = hdr->dreq; spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } - if (hdr->good_bytes != 0) - nfs_direct_good_bytes(dreq, hdr); - - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) - dreq->error = 0; - + nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -652,6 +661,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->max_count = 0; + list_for_each_entry(req, &reqs, wb_list) + dreq->max_count += req->wb_bytes; dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) @@ -791,17 +803,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } + nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0) { - nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) request_commit = true; -- cgit v1.2.3 From 0b5748477924d2fb37f6b77d1a7eef600a96d722 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Sep 2019 14:02:57 -0400 Subject: NFS: Remove redundant mirror tracking in O_DIRECT We no longer need the extra mirror length tracking in the O_DIRECT code, as we are able to track the maximum contiguous length in dreq->max_count. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 98a9a0bcdf38..040a50fd9bf3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -64,13 +64,6 @@ static struct kmem_cache *nfs_direct_cachep; -/* - * This represents a set of asynchronous requests that we're waiting on - */ -struct nfs_direct_mirror { - ssize_t count; -}; - struct nfs_direct_req { struct kref kref; /* release manager */ @@ -84,9 +77,6 @@ struct nfs_direct_req { atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ - struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; - int mirror_count; - loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ @@ -127,8 +117,6 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, const struct nfs_pgio_header *hdr, ssize_t dreq_len) { - struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; - if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || test_bit(NFS_IOHDR_EOF, &hdr->flags))) return; @@ -142,15 +130,12 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, else /* Clear outstanding error if this is EOF */ dreq->error = 0; } - if (mirror->count > dreq_len) - mirror->count = dreq_len; } static void nfs_direct_count_bytes(struct nfs_direct_req *dreq, const struct nfs_pgio_header *hdr) { - struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; loff_t hdr_end = hdr->io_start + hdr->good_bytes; ssize_t dreq_len = 0; @@ -162,8 +147,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, if (dreq_len > dreq->max_count) dreq_len = dreq->max_count; - if (mirror->count < dreq_len) - mirror->count = dreq_len; if (dreq->count < dreq_len) dreq->count = dreq_len; } @@ -310,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, cinfo->completion_ops = &nfs_direct_commit_completion_ops; } -static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, - struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) -{ - int mirror_count = 1; - - if (pgio->pg_ops->pg_get_mirror_count) - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - dreq->mirror_count = mirror_count; -} - static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; @@ -336,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->mds_cinfo.list); dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); - dreq->mirror_count = 1; spin_lock_init(&dreq->lock); return dreq; @@ -655,7 +625,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) LIST_HEAD(reqs); struct nfs_commit_info cinfo; LIST_HEAD(failed); - int i; nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); @@ -666,21 +635,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->max_count += req->wb_bytes; dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); - for (i = 0; i < dreq->mirror_count; i++) - dreq->mirrors[i].count = 0; get_dreq(dreq); nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - req = nfs_list_entry(reqs.next); - nfs_direct_setup_mirroring(dreq, &desc, req); - if (desc.pg_error < 0) { - list_splice_init(&reqs, &failed); - goto out_failed; - } - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { /* Bump the transmission count */ req->wb_nio++; @@ -698,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) } nfs_pageio_complete(&desc); -out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -931,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_direct_setup_mirroring(dreq, &desc, req); if (desc.pg_error < 0) { nfs_free_request(req); result = desc.pg_error; -- cgit v1.2.3 From 0b10d8a89f55c416f6a1f6a616669543fa8bdb69 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Oct 2019 12:54:15 -0700 Subject: xfs: log the inode on directory sf to block format change When a directory changes from shortform (sf) to block format, the sf format is copied to a temporary buffer, the inode format is modified and the updated format filled with the dentries from the temporary buffer. If the inode format is modified and attempt to grow the inode fails (due to I/O error, for example), it is possible to return an error while leaving the directory in an inconsistent state and with an otherwise clean transaction. This results in corruption of the associated directory and leads to xfs_dabuf_map() errors as subsequent lookups cannot accurately determine the format of the directory. This problem is reproduced occasionally by generic/475. The fundamental problem is that xfs_dir2_sf_to_block() changes the on-disk inode format without logging the inode. The inode is eventually logged by the bmapi layer in the common case, but error checking introduces the possibility of failing the high level request before this happens. Update both of the dir2 and attr callers of xfs_bmap_local_to_extents_empty() to log the inode core as consistent with the bmap local to extent format change codepath. This ensures that any subsequent errors after the format has changed cause the transaction to abort. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr_leaf.c | 1 + fs/xfs/libxfs/xfs_dir2_block.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b9f019603d0b..36c0a32cefcf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -827,6 +827,7 @@ xfs_attr_shortform_to_leaf( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); bp = NULL; error = xfs_da_grow_inode(args, &blkno); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9595ced393dc..3d1e5f6d64fd 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1098,6 +1098,7 @@ xfs_dir2_sf_to_block( xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); dp->i_d.di_size = 0; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* * Add block 0 to the inode. -- cgit v1.2.3 From 603efebd6768356fb7cea02e4a822587c2dc5d7c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Oct 2019 12:54:15 -0700 Subject: xfs: remove broken error handling on failed attr sf to leaf change xfs_attr_shortform_to_leaf() attempts to put the shortform fork back together after a failed attempt to convert from shortform to leaf format. While this code reallocates and copies back the shortform attr fork data, it never resets the inode format field back to local format. Further, now that the inode is properly logged after the initial switch from local format, any error that triggers the recovery code will eventually abort the transaction and shutdown the fs. Therefore, remove the broken and unnecessary error handling code. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr_leaf.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 36c0a32cefcf..1b956c313b6b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -831,28 +831,13 @@ xfs_attr_shortform_to_leaf( bp = NULL; error = xfs_da_grow_inode(args, &blkno); - if (error) { - /* - * If we hit an IO error middle of the transaction inside - * grow_inode(), we may have inconsistent data. Bail out. - */ - if (error == -EIO) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); - if (error) { - /* xfs_attr3_leaf_create may not have instantiated a block */ - if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0)) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; -- cgit v1.2.3 From aeea4b75f045294e1c026acc380466daa43afc65 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Oct 2019 12:54:16 -0700 Subject: xfs: move local to extent inode logging into bmap helper The callers of xfs_bmap_local_to_extents_empty() log the inode external to the function, yet this function is where the on-disk format value is updated. Push the inode logging down into the function itself to help prevent future mistakes. Note that internal bmap callers track the inode logging flags independently and thus may log the inode core twice due to this change. This is harmless, so leave this code around for consistency with the other attr fork conversion functions. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr_leaf.c | 3 +-- fs/xfs/libxfs/xfs_bmap.c | 6 ++++-- fs/xfs/libxfs/xfs_bmap.h | 3 ++- fs/xfs/libxfs/xfs_dir2_block.c | 3 +-- 4 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 1b956c313b6b..f0089e862216 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -826,8 +826,7 @@ xfs_attr_shortform_to_leaf( sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4edc25a2ba80..02469d59c787 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -792,6 +792,7 @@ out_root_realloc: */ void xfs_bmap_local_to_extents_empty( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { @@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty( ifp->if_u1.if_root = NULL; ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +842,7 @@ xfs_bmap_local_to_extents( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags = XFS_ILOG_CORE; goto done; } @@ -887,7 +889,7 @@ xfs_bmap_local_to_extents( /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; ifp->if_u1.if_root = NULL; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 5bb446d80542..e2798c6f3a5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); -void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 3d1e5f6d64fd..49e4bc39e7bb 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1096,9 +1096,8 @@ xfs_dir2_sf_to_block( memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); dp->i_d.di_size = 0; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* * Add block 0 to the inode. -- cgit v1.2.3 From 8a99734081775c012a4a6c442fdef0379fe52bdf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Oct 2019 14:40:13 -0600 Subject: io_uring: only flush workqueues on fileset removal We should not remove the workqueue, we just need to ensure that the workqueues are synced. The workqueues are torn down on ctx removal. Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ceb3497bdd2a..2c44648217bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2866,8 +2866,12 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) + if (ctx->sqo_wq[i]) + flush_workqueue(ctx->sqo_wq[i]); - io_finish_async(ctx); unix_destruct_scm(skb); } -- cgit v1.2.3 From 6fcf0c72e4b9360768cf5ef543c4f14c34800ee8 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 2 Oct 2019 17:56:33 +0800 Subject: vfs: add missing blkdev_put() in get_tree_bdev() Is there are a couple of missing blkdev_put() in get_tree_bdev()? Signed-off-by: Al Viro --- fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index beaf076d9733..530dd13fa98b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1296,6 +1296,7 @@ int get_tree_bdev(struct fs_context *fc, mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); + blkdev_put(bdev, mode); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); return -EBUSY; } @@ -1304,8 +1305,10 @@ int get_tree_bdev(struct fs_context *fc, fc->sget_key = bdev; s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) + if (IS_ERR(s)) { + blkdev_put(bdev, mode); return PTR_ERR(s); + } if (s->s_root) { /* Don't summarily change the RO/RW state. */ -- cgit v1.2.3 From 26b6c984338474b7032a3f1ee28e9d7590c225db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Sep 2019 16:32:42 -0400 Subject: libfs: take cursors out of list when moving past the end of directory that eliminates the last place where we accessed the tail of ->d_subdirs Signed-off-by: Al Viro --- fs/libfs.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 8e023b08a240..540611b99b9a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -92,14 +92,13 @@ EXPORT_SYMBOL(dcache_dir_close); /* * Returns an element of siblings' list. * We are looking for th positive after

; if - * found, dentry is grabbed and passed to caller via *. - * If no such element exists, the anchor of list is returned - * and * is set to NULL. + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. */ -static struct list_head *scan_positives(struct dentry *cursor, +static struct dentry *scan_positives(struct dentry *cursor, struct list_head *p, loff_t count, - struct dentry **res) + struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; @@ -127,9 +126,8 @@ static struct list_head *scan_positives(struct dentry *cursor, } } spin_unlock(&dentry->d_lock); - dput(*res); - *res = found; - return p; + dput(last); + return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -149,25 +147,22 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; - struct list_head *p; - file->f_pos = offset; inode_lock_shared(dentry->d_inode); - if (file->f_pos > 2) { - p = scan_positives(cursor, &dentry->d_subdirs, - file->f_pos - 2, &to); - spin_lock(&dentry->d_lock); - list_move(&cursor->d_child, p); - spin_unlock(&dentry->d_lock); - } else { - spin_lock(&dentry->d_lock); + if (offset > 2) + to = scan_positives(cursor, &dentry->d_subdirs, + offset - 2, NULL); + spin_lock(&dentry->d_lock); + if (to) + list_move(&cursor->d_child, &to->d_child); + else list_del_init(&cursor->d_child); - spin_unlock(&dentry->d_lock); - } - + spin_unlock(&dentry->d_lock); dput(to); + file->f_pos = offset; + inode_unlock_shared(dentry->d_inode); } return offset; @@ -199,17 +194,23 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) if (ctx->pos == 2) p = anchor; - else + else if (!list_empty(&cursor->d_child)) p = &cursor->d_child; + else + return 0; - while ((p = scan_positives(cursor, p, 1, &next)) != anchor) { + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; ctx->pos++; + p = &next->d_child; } spin_lock(&dentry->d_lock); - list_move_tail(&cursor->d_child, p); + if (next) + list_move_tail(&cursor->d_child, &next->d_child); + else + list_del_init(&cursor->d_child); spin_unlock(&dentry->d_lock); dput(next); -- cgit v1.2.3 From 1047ec868332034d1fbcb2fae19fe6d4cb869ff2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 4 Oct 2019 09:58:54 -0400 Subject: NFSv4: Fix leak of clp->cl_acceptor string Our client can issue multiple SETCLIENTID operations to the same server in some circumstances. Ensure that calls to nfs4_proc_setclientid() after the first one do not overwrite the previously allocated cl_acceptor string. unreferenced object 0xffff888461031800 (size 32): comm "mount.nfs", pid 2227, jiffies 4294822467 (age 1407.749s) hex dump (first 32 bytes): 6e 66 73 40 6b 6c 69 6d 74 2e 69 62 2e 31 30 31 nfs@klimt.ib.101 35 67 72 61 6e 67 65 72 2e 6e 65 74 00 00 00 00 5granger.net.... backtrace: [<00000000ab820188>] __kmalloc+0x128/0x176 [<00000000eeaf4ec8>] gss_stringify_acceptor+0xbd/0x1a7 [auth_rpcgss] [<00000000e85e3382>] nfs4_proc_setclientid+0x34e/0x46c [nfsv4] [<000000003d9cf1fa>] nfs40_discover_server_trunking+0x7a/0xed [nfsv4] [<00000000b81c3787>] nfs4_discover_server_trunking+0x81/0x244 [nfsv4] [<000000000801b55f>] nfs4_init_client+0x1b0/0x238 [nfsv4] [<00000000977daf7f>] nfs4_set_client+0xfe/0x14d [nfsv4] [<0000000053a68a2a>] nfs4_create_server+0x107/0x1db [nfsv4] [<0000000088262019>] nfs4_remote_mount+0x2c/0x59 [nfsv4] [<00000000e84a2fd0>] legacy_get_tree+0x2d/0x4c [<00000000797e947c>] vfs_get_tree+0x20/0xc7 [<00000000ecabaaa8>] fc_mount+0xe/0x36 [<00000000f15fafc2>] vfs_kern_mount+0x74/0x8d [<00000000a3ff4e26>] nfs_do_root_mount+0x8a/0xa3 [nfsv4] [<00000000d1c2b337>] nfs4_try_mount+0x58/0xad [nfsv4] [<000000004c9bddee>] nfs_fs_mount+0x820/0x869 [nfs] Fixes: f11b2a1cfbf5 ("nfs4: copy acceptor name from context ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 11eafcfc490b..ab8ca20fd579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6106,6 +6106,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { + kfree(clp->cl_acceptor); clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } -- cgit v1.2.3 From 7adf4eaf60f3d8c3584bed51fe7066d4dfc2cbe1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Oct 2019 21:42:58 -0600 Subject: io_uring: fix sequence logic for timeout requests We have two ways a request can be deferred: 1) It's a regular request that depends on another one 2) It's a timeout that tracks completions We have a shared helper to determine whether to defer, and that attempts to make the right decision based on the request. But we only have some of this information in the caller. Un-share the two timeout/defer helpers so the caller can use the right one. Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support") Reported-by: yangerkun Reviewed-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c44648217bd..38d274fc0f25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -415,27 +415,27 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; } +static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; +} + static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { - /* timeout requests always honor sequence */ - if (!(req->flags & REQ_F_TIMEOUT) && - (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; + return __io_sequence_defer(ctx, req); } -static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, - struct list_head *list) +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; - if (list_empty(list)) - return NULL; - - req = list_first_entry(list, struct io_kiocb, list); - if (!io_sequence_defer(ctx, req)) { + req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); + if (req && !io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; } @@ -443,14 +443,17 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, return NULL; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) -{ - return __io_get_deferred_req(ctx, &ctx->defer_list); -} - static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) { - return __io_get_deferred_req(ctx, &ctx->timeout_list); + struct io_kiocb *req; + + req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); + if (req && !__io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; } static void __io_commit_cqring(struct io_ring_ctx *ctx) -- cgit v1.2.3 From 3ed270b129a45c7c502c819a4286b464a52dac61 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 13:54:58 -0400 Subject: tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down") Running the latest kernel through my "make instances" stress tests, I triggered the following bug (with KASAN and kmemleak enabled): mkdir invoked oom-killer: gfp_mask=0x40cd0(GFP_KERNEL|__GFP_COMP|__GFP_RECLAIMABLE), order=0, oom_score_adj=0 CPU: 1 PID: 2229 Comm: mkdir Not tainted 5.4.0-rc2-test #325 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 Call Trace: dump_stack+0x64/0x8c dump_header+0x43/0x3b7 ? trace_hardirqs_on+0x48/0x4a oom_kill_process+0x68/0x2d5 out_of_memory+0x2aa/0x2d0 __alloc_pages_nodemask+0x96d/0xb67 __alloc_pages_node+0x19/0x1e alloc_slab_page+0x17/0x45 new_slab+0xd0/0x234 ___slab_alloc.constprop.86+0x18f/0x336 ? alloc_inode+0x2c/0x74 ? irq_trace+0x12/0x1e ? tracer_hardirqs_off+0x1d/0xd7 ? __slab_alloc.constprop.85+0x21/0x53 __slab_alloc.constprop.85+0x31/0x53 ? __slab_alloc.constprop.85+0x31/0x53 ? alloc_inode+0x2c/0x74 kmem_cache_alloc+0x50/0x179 ? alloc_inode+0x2c/0x74 alloc_inode+0x2c/0x74 new_inode_pseudo+0xf/0x48 new_inode+0x15/0x25 tracefs_get_inode+0x23/0x7c ? lookup_one_len+0x54/0x6c tracefs_create_file+0x53/0x11d trace_create_file+0x15/0x33 event_create_dir+0x2a3/0x34b __trace_add_new_event+0x1c/0x26 event_trace_add_tracer+0x56/0x86 trace_array_create+0x13e/0x1e1 instance_mkdir+0x8/0x17 tracefs_syscall_mkdir+0x39/0x50 ? get_dname+0x31/0x31 vfs_mkdir+0x78/0xa3 do_mkdirat+0x71/0xb0 sys_mkdir+0x19/0x1b do_fast_syscall_32+0xb0/0xed I bisected this down to the addition of the proxy_ops into tracefs for lockdown. It appears that the allocation of the proxy_ops and then freeing it in the destroy_inode callback, is causing havoc with the memory system. Reading the documentation about destroy_inode and talking with Linus about this, this is buggy and wrong. When defining the destroy_inode() method, it is expected that the destroy_inode() will also free the inode, and not just the extra allocations done in the creation of the inode. The faulty commit causes a memory leak of the inode data structure when they are deleted. Instead of allocating the proxy_ops (and then having to free it) the checks should be done by the open functions themselves, and not hack into the tracefs directory. First revert the tracefs updates for locked_down and then later we can add the locked_down checks in the kernel/trace files. Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home Fixes: ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down") Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- fs/tracefs/inode.c | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 9fc14e38927f..eeeae0475da9 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -20,7 +20,6 @@ #include #include #include -#include #define TRACEFS_DEFAULT_MODE 0700 @@ -28,25 +27,6 @@ static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; -static int default_open_file(struct inode *inode, struct file *filp) -{ - struct dentry *dentry = filp->f_path.dentry; - struct file_operations *real_fops; - int ret; - - if (!dentry) - return -EINVAL; - - ret = security_locked_down(LOCKDOWN_TRACEFS); - if (ret) - return ret; - - real_fops = dentry->d_fsdata; - if (!real_fops->open) - return 0; - return real_fops->open(inode, filp); -} - static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -241,12 +221,6 @@ static int tracefs_apply_options(struct super_block *sb) return 0; } -static void tracefs_destroy_inode(struct inode *inode) -{ - if (S_ISREG(inode->i_mode)) - kfree(inode->i_fop); -} - static int tracefs_remount(struct super_block *sb, int *flags, char *data) { int err; @@ -283,7 +257,6 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations tracefs_super_operations = { .statfs = simple_statfs, .remount_fs = tracefs_remount, - .destroy_inode = tracefs_destroy_inode, .show_options = tracefs_show_options, }; @@ -414,7 +387,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - struct file_operations *proxy_fops; struct dentry *dentry; struct inode *inode; @@ -430,20 +402,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return failed_creating(dentry); - proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL); - if (unlikely(!proxy_fops)) { - iput(inode); - return failed_creating(dentry); - } - - if (!fops) - fops = &tracefs_file_operations; - - dentry->d_fsdata = (void *)fops; - memcpy(proxy_fops, fops, sizeof(*proxy_fops)); - proxy_fops->open = default_open_file; inode->i_mode = mode; - inode->i_fop = proxy_fops; + inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); -- cgit v1.2.3 From bf8e602186ec402ed937b2cbd6c39a34c0029757 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 20:41:41 -0400 Subject: tracing: Do not create tracefs files if tracefs lockdown is in effect If on boot up, lockdown is activated for tracefs, don't even bother creating the files. This can also prevent instances from being created if lockdown is in effect. Link: http://lkml.kernel.org/r/CAHk-=whC6Ji=fWnjh2+eS4b15TnbsS4VPVtvBOwCy1jjEG_JHQ@mail.gmail.com Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- fs/tracefs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index eeeae0475da9..0caa151cae4e 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,9 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *dentry; struct inode *inode; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); -- cgit v1.2.3 From c70d868f272befca09081190ae477c51fcbee5dd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 14 Oct 2019 14:12:11 -0700 Subject: fs/direct-io.c: fix kernel-doc warning Fix kernel-doc warning in fs/direct-io.c: fs/direct-io.c:258: warning: Excess function parameter 'offset' description in 'dio_complete' Also, don't mark this function as having kernel-doc notation since it is not exported. Link: http://lkml.kernel.org/r/97908511-4328-4a56-17fe-f43a1d7aa470@infradead.org Fixes: 6d544bb4d901 ("dio: centralize completion in dio_complete()") Signed-off-by: Randy Dunlap Cc: Zach Brown Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index ae196784f487..9329ced91f1d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -241,9 +241,8 @@ void dio_warn_stale_pagecache(struct file *filp) } } -/** +/* * dio_complete() - called when all DIO BIO I/O has been completed - * @offset: the byte offset in the file of the completed operation * * This drops i_dio_count, lets interested parties know that a DIO operation * has completed, and calculates the resulting return code for the operation. -- cgit v1.2.3 From 8e88bfba77eec6231c6a72076c28bbb7634a0e8c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 14 Oct 2019 14:12:14 -0700 Subject: fs/libfs.c: fix kernel-doc warning Fix kernel-doc warning in fs/libfs.c: fs/libfs.c:496: warning: Excess function parameter 'available' description in 'simple_write_end' Link: http://lkml.kernel.org/r/5fc9d70b-e377-0ec9-066a-970d49579041@infradead.org Fixes: ad2a722f196d ("libfs: Open code simple_commit_write into only user") Signed-off-by: Randy Dunlap Cc: Boaz Harrosh Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 540611b99b9a..1463b038ffc4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -473,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @available: See .write_end of address_space_operations - * @file: " + * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " -- cgit v1.2.3 From b46ec1da5eb7d728938c8d115c4c291c7c71a98d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 14 Oct 2019 14:12:17 -0700 Subject: fs/fs-writeback.c: fix kernel-doc warning Fix kernel-doc warning in fs/fs-writeback.c: fs/fs-writeback.c:913: warning: Excess function parameter 'nr_pages' description in 'cgroup_writeback_by_id' Link: http://lkml.kernel.org/r/756645ac-0ce8-d47e-d30a-04d9e4923a4f@infradead.org Fixes: d62241c7a406 ("writeback, memcg: Implement cgroup_writeback_by_id()") Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e88421d9a48d..8461a6322039 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -905,7 +905,7 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr_pages: number of pages to write, 0 for best-effort dirty flushing + * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * -- cgit v1.2.3 From c7d3d28360fdb3ed3a5aa0bab19315e0fdc994a1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Nov 2019 17:45:31 +0100 Subject: quota: Factor out setup of quota inode Factor out setting up of quota inode and eventual error cleanup from vfs_load_quota_inode(). This will simplify situation for filesystems that don't have any quota inodes. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 108 +++++++++++++++++++++++++++++------------------ include/linux/quotaops.h | 2 + 2 files changed, 69 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6e826b454082..9e8eb6e71675 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2299,28 +2299,60 @@ EXPORT_SYMBOL(dquot_quota_off); * Turn quotas on on a device */ -/* - * Helper function to turn quotas on when we already have the inode of - * quota file and no quota information is loaded. - */ -static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, +static int vfs_setup_quota_inode(struct inode *inode, int type) +{ + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!S_ISREG(inode->i_mode)) + return -EACCES; + if (IS_RDONLY(inode)) + return -EROFS; + if (sb_has_quota_loaded(sb, type)) + return -EBUSY; + + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + return -EIO; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + return 0; +} + +static void vfs_cleanup_quota_inode(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode = dqopt->files[type]; + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); + } + dqopt->files[type] = NULL; + iput(inode); +} + +int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt = find_quota_format(format_id); - struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; if (!fmt) return -ESRCH; - if (!S_ISREG(inode->i_mode)) { - error = -EACCES; - goto out_fmt; - } - if (IS_RDONLY(inode)) { - error = -EROFS; - goto out_fmt; - } if (!sb->s_op->quota_write || !sb->s_op->quota_read || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; @@ -2352,27 +2384,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, invalidate_bdev(sb->s_bdev); } - if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* We don't want quota and atime on quota files (deadlocks - * possible) Also nobody should write to the file - we use - * special IO operations which ignore the immutable bit. */ - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - /* - * When S_NOQUOTA is set, remove dquot references as no more - * references can be added - */ - __dquot_drop(inode); - } - - error = -EIO; - dqopt->files[type] = igrab(inode); - if (!dqopt->files[type]) - goto out_file_flags; error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) - goto out_file_init; + goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; @@ -2380,7 +2394,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) - goto out_file_init; + goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; @@ -2395,18 +2409,30 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dquot_disable(sb, type, flags); return error; -out_file_init: - dqopt->files[type] = NULL; - iput(inode); -out_file_flags: - inode_lock(inode); - inode->i_flags &= ~S_NOQUOTA; - inode_unlock(inode); out_fmt: put_quota_format(fmt); return error; } +EXPORT_SYMBOL(dquot_load_quota_sb); + +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int err; + + err = vfs_setup_quota_inode(inode, type); + if (err < 0) + return err; + err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); + if (err < 0) + vfs_cleanup_quota_inode(inode->i_sb, type); + return err; +} /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 185d94829701..2625766bcfe7 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -89,6 +89,8 @@ int dquot_file_open(struct inode *inode, struct file *file); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags); +int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, + unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, -- cgit v1.2.3 From ae45f07d47cc30e9170488a4e5fe91ba4fe5ed4e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Nov 2019 17:51:05 +0100 Subject: quota: Simplify dquot_resume() We already have quota inode loaded when resuming quotas. Use vfs_load_quota() to avoid some pointless churn with the quota inode. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9e8eb6e71675..ecdae91029ed 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2438,7 +2438,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); - struct inode *inode; int ret = 0, cnt; unsigned int flags; @@ -2452,8 +2451,6 @@ int dquot_resume(struct super_block *sb, int type) if (!sb_has_quota_suspended(sb, cnt)) continue; - inode = dqopt->files[cnt]; - dqopt->files[cnt] = NULL; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, @@ -2462,9 +2459,10 @@ int dquot_resume(struct super_block *sb, int type) spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); - ret = vfs_load_quota_inode(inode, cnt, - dqopt->info[cnt].dqi_fmt_id, flags); - iput(inode); + ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, + flags); + if (ret < 0) + vfs_cleanup_quota_inode(sb, type); } return ret; -- cgit v1.2.3 From dc19432ae1c22d696f91edea11ae06c348b4e88a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Nov 2019 18:37:44 +0100 Subject: quota: Rename vfs_load_quota_inode() to dquot_load_quota_inode() Rename vfs_load_quota_inode() to dquot_load_quota_inode() to be consistent with naming of other functions used for enabling quota accounting from filesystems. Also export the function and add some sanity checks to assure filesystems are calling the function properly. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 19 +++++++++++++------ include/linux/quotaops.h | 2 ++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ecdae91029ed..0ddcbce596f8 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2351,6 +2351,12 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, struct quota_info *dqopt = sb_dqopt(sb); int error; + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + if (!fmt) return -ESRCH; if (!sb->s_op->quota_write || !sb->s_op->quota_read || @@ -2417,10 +2423,10 @@ out_fmt: EXPORT_SYMBOL(dquot_load_quota_sb); /* - * Helper function to turn quotas on when we already have the inode of - * quota file and no quota information is loaded. + * More powerful function for turning on quotas on given quota inode allowing + * setting of individual quota flags */ -static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, +int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags) { int err; @@ -2433,6 +2439,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, vfs_cleanup_quota_inode(inode->i_sb, type); return err; } +EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) @@ -2479,7 +2486,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(d_inode(path->dentry), type, + error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2517,7 +2524,7 @@ int dquot_enable(struct inode *inode, int type, int format_id, return 0; } - return vfs_load_quota_inode(inode, type, format_id, flags); + return dquot_load_quota_inode(inode, type, format_id, flags); } EXPORT_SYMBOL(dquot_enable); @@ -2542,7 +2549,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(d_inode(dentry), type, format_id, + error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 2625766bcfe7..0ce9da5a1a93 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -91,6 +91,8 @@ int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); +int dquot_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, -- cgit v1.2.3 From 7212b95e61516671672380c6c9d6d80f4f306198 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Nov 2019 18:55:38 +0100 Subject: fs: Use dquot_load_quota_inode() from filesystems Use dquot_load_quota_inode from filesystems instead of dquot_enable(). In all three cases we want to load quota inode and never use the function to update quota flags. Signed-off-by: Jan Kara --- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 2 +- fs/ocfs2/super.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd654e53ba3d..1b947c95eff2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5835,7 +5835,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1443cee15863..91745d5b718d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1932,7 +1932,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c81e86c62380..05dd68ade293 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -926,8 +926,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = dquot_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } -- cgit v1.2.3 From 069a9166369773627e51c5249cd7f9169aecd7fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Nov 2019 18:57:56 +0100 Subject: quota: Drop dquot_enable() Now dquot_enable() has only two internal callers and both of them just need to update quota flags and don't need most of checks. Just drop dquot_enable() and fold necessary functionality into the two calling places. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 61 +++++++++++++----------------------------------- include/linux/quotaops.h | 2 -- 2 files changed, 16 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0ddcbce596f8..3e4cf0d10955 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2493,41 +2493,6 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, } EXPORT_SYMBOL(dquot_quota_on); -/* - * More powerful function for turning on quotas allowing setting - * of individual quota flags - */ -int dquot_enable(struct inode *inode, int type, int format_id, - unsigned int flags) -{ - struct super_block *sb = inode->i_sb; - - /* Just unsuspend quotas? */ - BUG_ON(flags & DQUOT_SUSPENDED); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); - - if (!flags) - return 0; - /* Just updating flags needed? */ - if (sb_has_quota_loaded(sb, type)) { - if (flags & DQUOT_USAGE_ENABLED && - sb_has_quota_usage_enabled(sb, type)) - return -EBUSY; - if (flags & DQUOT_LIMITS_ENABLED && - sb_has_quota_limits_enabled(sb, type)) - return -EBUSY; - spin_lock(&dq_state_lock); - sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); - spin_unlock(&dq_state_lock); - return 0; - } - - return dquot_load_quota_inode(inode, type, format_id, flags); -} -EXPORT_SYMBOL(dquot_enable); - /* * This function is used when filesystem needs to initialize quotas * during mount time. @@ -2574,13 +2539,17 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags) if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ - if (!sb_has_quota_usage_enabled(sb, type)) - return -EINVAL; - ret = dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); - if (ret < 0) + if (!sb_has_quota_usage_enabled(sb, type)) { + ret = -EINVAL; + goto out_err; + } + if (sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; goto out_err; + } + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); } return 0; out_err: @@ -2630,10 +2599,12 @@ static int dquot_quota_disable(struct super_block *sb, unsigned int flags) out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { - if (flags & qtype_enforce_flag(type)) - dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); + if (flags & qtype_enforce_flag(type)) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); + } } return ret; } diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0ce9da5a1a93..6b8ebc8d715e 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -87,8 +87,6 @@ int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); -int dquot_enable(struct inode *inode, int type, int format_id, - unsigned int flags); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, -- cgit v1.2.3 From 2ec1f3011f3fdcbafcaad23aafa5acb6861a2646 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Nov 2019 11:12:44 +0100 Subject: quota: Make dquot_disable() work without quota inodes Quota on and quota off are protected by s_umount semaphore held in exclusive mode since commit 7d6cd73d33b6 "quota: Hold s_umount in exclusive mode when enabling / disabling quotas". This makes it impossible for dquot_disable() to race with other enabling or disabling of quotas. Simplify the cleanup done by dquot_disable() based on this fact and also remove some stale comments. As a bonus this cleanup makes dquot_disable() properly handle a case when there are no quota inodes. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 73 ++++++++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3e4cf0d10955..4c3da4ea31bc 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2162,14 +2162,29 @@ int dquot_file_open(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dquot_file_open); +static void vfs_cleanup_quota_inode(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode = dqopt->files[type]; + + if (!inode) + return; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); + } + dqopt->files[type] = NULL; + iput(inode); +} + /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { - int cnt, ret = 0; + int cnt; struct quota_info *dqopt = sb_dqopt(sb); - struct inode *toputinode[MAXQUOTAS]; /* s_umount should be held in exclusive mode */ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) @@ -2191,7 +2206,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) @@ -2211,8 +2225,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); - iput(dqopt->files[cnt]); - dqopt->files[cnt] = NULL; + vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); @@ -2234,10 +2247,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); - - toputinode[cnt] = dqopt->files[cnt]; - if (!sb_has_quota_loaded(sb, cnt)) - dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; @@ -2259,32 +2268,22 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - /* This can happen when suspending quotas on remount-ro... */ - if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { - inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~S_NOQUOTA; - truncate_inode_pages(&toputinode[cnt]->i_data, 0); - inode_unlock(toputinode[cnt]); - mark_inode_dirty_sync(toputinode[cnt]); + if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: + /* We are done when suspending quotas */ + if (flags & DQUOT_SUSPENDED) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (toputinode[cnt]) { - /* On remount RO, we keep the inode pointer so that we - * can reenable quota on the subsequent remount RW. We - * have to check 'flags' variable and not use sb_has_ - * function because another quotaon / quotaoff could - * change global state before we got here. We refuse - * to suspend quotas when there is pending delete on - * the quota file... */ - if (!(flags & DQUOT_SUSPENDED)) - iput(toputinode[cnt]); - else if (!toputinode[cnt]->i_nlink) - ret = -EBUSY; - } - return ret; + if (!sb_has_quota_loaded(sb, cnt)) + vfs_cleanup_quota_inode(sb, cnt); + return 0; } EXPORT_SYMBOL(dquot_disable); @@ -2330,20 +2329,6 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) return 0; } -static void vfs_cleanup_quota_inode(struct super_block *sb, int type) -{ - struct quota_info *dqopt = sb_dqopt(sb); - struct inode *inode = dqopt->files[type]; - - if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - inode_lock(inode); - inode->i_flags &= ~S_NOQUOTA; - inode_unlock(inode); - } - dqopt->files[type] = NULL; - iput(inode); -} - int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { -- cgit v1.2.3 From a0828b6ccbdfd46afbbaa9f28df359081c29109b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Nov 2019 11:18:17 +0100 Subject: quota: Handle quotas without quota inodes in dquot_get_state() Make dquot_get_state() gracefully handle a situation when there are no quota files present even though quotas are enabled. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4c3da4ea31bc..a69a657209a6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2787,8 +2787,10 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; - tstate->ino = dqopt->files[type]->i_ino; - tstate->blocks = dqopt->files[type]->i_blocks; + if (dqopt->files[type]) { + tstate->ino = dqopt->files[type]->i_ino; + tstate->blocks = dqopt->files[type]->i_blocks; + } tstate->nextents = 1; /* We don't know... */ spin_unlock(&dq_data_lock); } -- cgit v1.2.3