Age | Commit message (Collapse) | Author | Files | Lines |
|
Fix afs_apply_status() to mask off the irrelevant bits from status->mode
when OR'ing them into i_mode. This can happen when a 3rd party chmod
occurs.
Also fix afs_inode_init_from_status() to mask off the mode bits when
initialising i_mode.
Fixes: 260a980317da ("[AFS]: Add "directory write" support.")
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
Al pointed out that a malicious or broken MDS could change the type or
device number of a given inode number. It may also be possible for the
MDS to reuse an old inode number.
Ensure that we never allow fill_inode to change the type part of the
i_mode or the i_rdev unless I_NEW is set. Throw warnings if the MDS ever
changes these on us mid-stream, and return an error.
Don't set i_rdev directly, and rely on init_special_inode to do it.
Also, fix up error handling in the callers of ceph_get_inode.
In handle_cap_grant, check for and warn if the inode type changes, and
only overwrite the mode if it didn't.
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
There are several warts in the snapdir error handling. The -EOPNOTSUPP
return in __snapfh_to_dentry is currently lost, and the call to
ceph_handle_snapdir is not currently checked at all.
Fix all of this up and eliminate a BUG_ON in ceph_get_snapdir. We can
handle that case with a warning and return an error.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
inode_wrong_type(inode, mode) returns true if setting inode->i_mode
to given value would've changed the inode type. We have enough of
those checks open-coded to make a helper worthwhile.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.
Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() by compiling
and running the following program:
/* SPDX-License-Identifier: LGPL-2.1+ */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
/* open_tree() */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif
#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif
#ifndef __NR_open_tree
#if defined __alpha__
#define __NR_open_tree 538
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_open_tree 4428
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_open_tree 6428
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_open_tree 5428
#endif
#elif defined __ia64__
#define __NR_open_tree (428 + 1024)
#else
#define __NR_open_tree 428
#endif
#endif
/* move_mount() */
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif
#ifndef __NR_move_mount
#if defined __alpha__
#define __NR_move_mount 539
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_move_mount 4429
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_move_mount 6429
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
#define __NR_move_mount (428 + 1024)
#else
#define __NR_move_mount 429
#endif
#endif
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
return syscall(__NR_open_tree, dfd, filename, flags);
}
static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
const char *to_pathname, unsigned int flags)
{
return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
}
static bool is_shared_mountpoint(const char *path)
{
bool shared = false;
FILE *f = NULL;
char *line = NULL;
int i;
size_t len = 0;
f = fopen("/proc/self/mountinfo", "re");
if (!f)
return 0;
while (getline(&line, &len, f) > 0) {
char *slider1, *slider2;
for (slider1 = line, i = 0; slider1 && i < 4; i++)
slider1 = strchr(slider1 + 1, ' ');
if (!slider1)
continue;
slider2 = strchr(slider1 + 1, ' ');
if (!slider2)
continue;
*slider2 = '\0';
if (strcmp(slider1 + 1, path) == 0) {
/* This is the path. Is it shared? */
slider1 = strchr(slider2 + 1, ' ');
if (slider1 && strstr(slider1, "shared:")) {
shared = true;
break;
}
}
}
fclose(f);
free(line);
return shared;
}
static void usage(void)
{
const char *text = "mount-new [--recursive] <base-dir>\n";
fprintf(stderr, "%s", text);
_exit(EXIT_SUCCESS);
}
#define exit_usage(format, ...) \
({ \
fprintf(stderr, format "\n", ##__VA_ARGS__); \
usage(); \
})
#define exit_log(format, ...) \
({ \
fprintf(stderr, format "\n", ##__VA_ARGS__); \
exit(EXIT_FAILURE); \
})
static const struct option longopts[] = {
{"help", no_argument, 0, 'a'},
{ NULL, no_argument, 0, 0 },
};
int main(int argc, char *argv[])
{
int exit_code = EXIT_SUCCESS, index = 0;
int dfd, fd_tree, new_argc, ret;
char *base_dir;
char *const *new_argv;
char target[PATH_MAX];
while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
switch (ret) {
case 'a':
/* fallthrough */
default:
usage();
}
}
new_argv = &argv[optind];
new_argc = argc - optind;
if (new_argc < 1)
exit_usage("Missing base directory\n");
base_dir = new_argv[0];
if (*base_dir != '/')
exit_log("Please specify an absolute path");
/* Ensure that target is a shared mountpoint. */
if (!is_shared_mountpoint(base_dir))
exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);
dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
if (dfd < 0)
exit_log("%m - Failed to open base directory \"%s\"", base_dir);
ret = mkdirat(dfd, "detached-move-mount", 0755);
if (ret < 0)
exit_log("%m - Failed to create required temporary directories");
ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
if (ret < 0 || (size_t)ret >= sizeof(target))
exit_log("%m - Failed to assemble target path");
/*
* Having a mount table with 10000 mounts is already quite excessive
* and shoult account even for weird test systems.
*/
for (size_t i = 0; i < 10000; i++) {
fd_tree = sys_open_tree(dfd, "detached-move-mount",
OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC |
AT_EMPTY_PATH);
if (fd_tree < 0) {
fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
exit_code = EXIT_FAILURE;
break;
}
ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount", MOVE_MOUNT_F_EMPTY_PATH);
if (ret < 0) {
if (errno == ENOSPC)
fprintf(stderr, "%m - Buggy mount counting");
else
fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
exit_code = EXIT_FAILURE;
break;
}
close(fd_tree);
ret = umount2(target, MNT_DETACH);
if (ret < 0) {
fprintf(stderr, "%m - Failed to unmount %s", target);
exit_code = EXIT_FAILURE;
break;
}
}
(void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
close(dfd);
exit(exit_code);
}
and wait for the kernel to refuse any new mounts by returning ENOSPC.
How many iterations are needed depends on the number of mounts in your
system. Assuming you have something like 50 mounts on a standard system
it should be almost instantaneous.
The root cause of this is that detached mounts aren't handled correctly
when source and target mount are identical and reside on a shared mount
causing a broken mount tree where the detached source itself is
propagated which propagation prevents for regular bind-mounts and new
mounts. This ultimately leads to a miscalculation of the number of
mounts in the mount namespace.
Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calls into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e. is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
The propagate_one() functions thereby skips both new mounts and bind
mounts to not propagate them "into themselves". Both are identified by
checking whether the mount is already attached to any mount namespace in
mnt->mnt_ns. The is what the IS_MNT_NEW() helper is responsible for.
However, detached mounts have an anonymous mount namespace attached to
them stashed in mnt->mnt_ns which means that IS_MNT_NEW() doesn't
realize they need to be skipped causing the mount to propagate "into
itself" breaking the mount table and causing a disconnect between the
number of mounts recorded as being beneath or reachable from the target
mountpoint and the number of mounts actually recorded/counted in
ns->mounts ultimately causing an overflow which in turn prevents any new
mounts via the ENOSPC issue.
So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days and
then verifying that the fix is correct by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above overnight verifying the
number of mounts counted in ns->mounts. With this fix the counts are
correct and the ENOSPC issue can't be reproduced.
This change will only have an effect on mounts created with the new
mount API since detached mounts cannot be created with the old mount API
so regressions are extremely unlikely.
Link: https://lore.kernel.org/r/20210306101010.243666-1-christian.brauner@ubuntu.com
Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
|
|
Rather than storing the iterator information in the registered
kmsg_dumper structure, create a separate iterator structure. The
kmsg_dump_iter structure can reside on the stack of the caller, thus
allowing lockless use of the kmsg_dump functions.
Update code that accesses the kernel logs using the kmsg_dumper
structure to use the new kmsg_dump_iter structure. For kmsg_dumpers,
this also means adding a call to kmsg_dump_rewind() to initialize
the iterator.
All this is in preparation for removal of @logbuf_lock.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org> # pstore
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-13-john.ogness@linutronix.de
|
|
Both s390 and alpha have been switched to 64-bit ino_t with
commit 96c0a6a72d18 ("s390,alpha: switch to 64-bit ino_t").
Therefore enable TMPFS_INODE64 for both architectures again.
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Link: https://lore.kernel.org/linux-mm/YCV7QiyoweJwvN+m@osiris/
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
|
|
Martin reported an issue that directory read could be hung on the
latest -rc kernel with some certain image. The root cause is that
commit baa2c7c97153 ("block: set .bi_max_vecs as actual allocated
vector number") changes .bi_max_vecs behavior. bio->bi_max_vecs
is set as actual allocated vector number rather than the requested
number now.
Let's avoid using .bi_max_vecs completely instead.
Link: https://lore.kernel.org/r/20210306040438.8084-1-hsiangkao@aol.com
Reported-by: Martin DEVERA <devik@eaxlabs.cz>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
[ Gao Xiang: note that <= 5.11 kernels are not impacted. ]
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
|
|
is gone
This brings the behavior back in line with what 5.11 and earlier did,
and this is no longer needed with the improved handling of creds
not needing to do unshare().
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
With IORING_SETUP_ATTACH_WQ we should let __io_sq_thread() use the
initial creds from each ctx.
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Add a simple warning making sure that nobody tries to create a new
manager while we're under IO_WQ_BIT_EXIT. That can potentially happen
due to racy work submission after final put.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
io_ring_exit_work() have to cancel all requests, including those staying
in io-wq, however it tries only cancellation of current tctx, which is
NULL. If we've got task==NULL, use the ctx-to-tctx map to go over all
tctx/io-wq and try cancellations on them.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
We use system_unbound_wq to run io_ring_exit_work(), so it's hard to
monitor whether removal hang or not. Add WARN_ONCE to catch hangs.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
We don't use task file notes anymore, and no need left in indexing
task->io_uring->xa by file, and replace it with ctx. It's better
design-wise, especially since we keep a dangling file, and so have to
keep an eye on not dereferencing it.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
With ->flush() gone we're now leaving all uring file notes until the
task dies/execs, so the ctx will not be freed until all tasks that have
ever submit a request die. It was nicer with flush but not much, we
could have locked as described ctx in many cases.
Now we guarantee that ctx outlives all tctx in a sense that
io_ring_exit_work() waits for all tctxs to drop their corresponding
enties in ->xa, and ctx won't go away until then. Hence, additional
io_uring file reference (a.k.a. task file notes) are not needed anymore.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Another preparation patch. When full quiesce is done on ctx exit, use
task_work infra to remove corresponding to the ctx io_uring->xa entries.
For that we use the back tctx map. Also use ->in_idle to prevent
removing it while we traversing ->xa on cancellation, just ignore it.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
For each pair tcxt-ctx create an object and chain it into ctx, so we
have a way to traverse all tctx that are using current ctx. Preparation
patch, will be used later.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Rework io_uring_del_task_file(), so it accepts an index to delete, and
it's not necessarily have to be in the ->xa. Infer file from xa_erase()
to maintain a single origin of truth.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
This patch adds code to function trans_drain to remove drained
bd elements from the ail lists, if queued, before freeing the bd.
If we don't remove the bd from the ail, function ail_drain will
try to reference the bd after it has been freed by trans_drain.
Thanks to Andy Price for his analysis of the problem.
Reported-by: Andy Price <anprice@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
|
|
It fixes the following warning detected by coccinelle:
./fs/gfs2/super.c:592:5-10: Unneeded variable: "error". Return "0" on
line 628
Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
|
|
Use the documented kernel-doc format for function Return: descriptions.
Begin constant values in kernel-doc comments with '%'.
Remove kernel-doc "/**" from 2 functions that are not documented with
kernel-doc notation.
Fix typos, punctuation, & grammar.
Also fix a few kernel-doc warnings:
../fs/eventpoll.c:1883: warning: Function parameter or member 'ep' not described in 'ep_loop_check_proc'
../fs/eventpoll.c:1883: warning: Excess function parameter 'priv' description in 'ep_loop_check_proc'
../fs/eventpoll.c:1932: warning: Function parameter or member 'ep' not described in 'ep_loop_check'
../fs/eventpoll.c:1932: warning: Excess function parameter 'from' description in 'ep_loop_check'
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
|
|
The typical result of the backwards comparison here is that the source
server in a server-to-server copy will return BAD_STATEID within a few
seconds of the copy starting, instead of giving the copy a full lease
period, so the copy_file_range() call will end up unnecessarily
returning a short read.
Fixes: 624322f1adc5 "NFSD add COPY_NOTIFY operation"
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
|
When NFSD_V4 is enabled and CRYPTO is disabled,
Kbuild gives the following warning:
WARNING: unmet direct dependencies detected for CRYPTO_SHA256
Depends on [n]: CRYPTO [=n]
Selected by [y]:
- NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y]
WARNING: unmet direct dependencies detected for CRYPTO_MD5
Depends on [n]: CRYPTO [=n]
Selected by [y]:
- NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y]
This is because NFSD_V4 selects CRYPTO_MD5 and CRYPTO_SHA256,
without depending on or selecting CRYPTO, despite those config options
being subordinate to CRYPTO.
Signed-off-by: Julian Braha <julianbraha@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
|
If a file is unhashed, then we're going to reject it anyway and retry,
so make sure we skip it when we're doing the RCU lockless lookup.
This avoids a number of unnecessary nfserr_jukebox returns from
nfsd_file_acquire()
Fixes: 65294c1f2c5e ("nfsd: add a new struct file caching facility to nfsd")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
|
If we go async with a request, grab the creds that the task currently has
assigned and make sure that the async side switches to them. This is
handled in the same way that we do for registered personalities.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Ran into a use-after-free on the main io-wq struct, wq. It has a worker
ref and completion event, but the manager itself isn't holding a
reference. This can lead to a race where the manager thinks there are
no workers and exits, but a worker is being added. That leads to the
following trace:
BUG: KASAN: use-after-free in io_wqe_worker+0x4c0/0x5e0
Read of size 8 at addr ffff888108baa8a0 by task iou-wrk-3080422/3080425
CPU: 5 PID: 3080425 Comm: iou-wrk-3080422 Not tainted 5.12.0-rc1+ #110
Hardware name: Micro-Star International Co., Ltd. MS-7C60/TRX40 PRO 10G (MS-7C60), BIOS 1.60 05/13/2020
Call Trace:
dump_stack+0x90/0xbe
print_address_description.constprop.0+0x67/0x28d
? io_wqe_worker+0x4c0/0x5e0
kasan_report.cold+0x7b/0xd4
? io_wqe_worker+0x4c0/0x5e0
__asan_load8+0x6d/0xa0
io_wqe_worker+0x4c0/0x5e0
? io_worker_handle_work+0xc00/0xc00
? recalc_sigpending+0xe5/0x120
? io_worker_handle_work+0xc00/0xc00
? io_worker_handle_work+0xc00/0xc00
ret_from_fork+0x1f/0x30
Allocated by task 3080422:
kasan_save_stack+0x23/0x60
__kasan_kmalloc+0x80/0xa0
kmem_cache_alloc_node_trace+0xa0/0x480
io_wq_create+0x3b5/0x600
io_uring_alloc_task_context+0x13c/0x380
io_uring_add_task_file+0x109/0x140
__x64_sys_io_uring_enter+0x45f/0x660
do_syscall_64+0x32/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3080422:
kasan_save_stack+0x23/0x60
kasan_set_track+0x20/0x40
kasan_set_free_info+0x24/0x40
__kasan_slab_free+0xe8/0x120
kfree+0xa8/0x400
io_wq_put+0x14a/0x220
io_wq_put_and_exit+0x9a/0xc0
io_uring_clean_tctx+0x101/0x140
__io_uring_files_cancel+0x36e/0x3c0
do_exit+0x169/0x1340
__x64_sys_exit+0x34/0x40
do_syscall_64+0x32/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Have the manager itself hold a reference, and now both drop points drop
and complete if we hit zero, and the manager can unconditionally do a
wait_for_completion() instead of having a race between reading the ref
count and waiting if it was non-zero.
Fixes: fb3a1f6c745c ("io-wq: have manager wait for all workers to exit")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
When doing a large read or write workload we only
very gradually increase the number of credits
which can cause problems with parallelizing large i/o
(I/O ramps up more slowly than it should for large
read/write workloads) especially with multichannel
when the number of credits on the secondary channels
starts out low (e.g. less than about 130) or when
recovering after server throttled back the number
of credit.
Signed-off-by: Aurelien Aptel <aaptel@suse.com>
Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
|
|
With multichannel, operations like the queries
from "ls -lR" can cause all credits to be used and
errors to be returned since max_credits was not
being set correctly on the secondary channels and
thus the client was requesting 0 credits incorrectly
in some cases (which can lead to not having
enough credits to perform any operation on that
channel).
Signed-off-by: Aurelien Aptel <aaptel@suse.com>
CC: <stable@vger.kernel.org> # v5.8+
Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
|
|
__ext4_journalled_writepage should drop bhs' ref count on error paths
Signed-off-by: Zhaolong Zhang <zhangzl2013@126.com>
Link: https://lore.kernel.org/r/1614678151-70481-1-git-send-email-zhangzl2013@126.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
|
|
syzbot found UBSAN: shift-out-of-bounds in ext4_mb_init [1], when
1 << sbi->s_es->s_log_groups_per_flex is bigger than UINT_MAX,
where sbi->s_mb_prefetch is unsigned integer type.
32 is the maximum allowed power of s_log_groups_per_flex. Following if
check will also trigger UBSAN shift-out-of-bound:
if (1 << sbi->s_es->s_log_groups_per_flex >= UINT_MAX) {
So I'm checking it against the raw number, perhaps there is another way
to calculate UINT_MAX max power. Also use min_t as to make sure it's
uint type.
[1] UBSAN: shift-out-of-bounds in fs/ext4/mballoc.c:2713:24
shift exponent 60 is too large for 32-bit type 'int'
Call Trace:
__dump_stack lib/dump_stack.c:79 [inline]
dump_stack+0x137/0x1be lib/dump_stack.c:120
ubsan_epilogue lib/ubsan.c:148 [inline]
__ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395
ext4_mb_init_backend fs/ext4/mballoc.c:2713 [inline]
ext4_mb_init+0x19bc/0x19f0 fs/ext4/mballoc.c:2898
ext4_fill_super+0xc2ec/0xfbe0 fs/ext4/super.c:4983
Reported-by: syzbot+a8b4b0c60155e87e9484@syzkaller.appspotmail.com
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210224095800.3350002-1-snovitoll@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
|
|
Syzbot is reporting that ext4 can enter fs reclaim from kvmalloc() while
the transaction is started like:
fs_reclaim_acquire+0x117/0x150 mm/page_alloc.c:4340
might_alloc include/linux/sched/mm.h:193 [inline]
slab_pre_alloc_hook mm/slab.h:493 [inline]
slab_alloc_node mm/slub.c:2817 [inline]
__kmalloc_node+0x5f/0x430 mm/slub.c:4015
kmalloc_node include/linux/slab.h:575 [inline]
kvmalloc_node+0x61/0xf0 mm/util.c:587
kvmalloc include/linux/mm.h:781 [inline]
ext4_xattr_inode_cache_find fs/ext4/xattr.c:1465 [inline]
ext4_xattr_inode_lookup_create fs/ext4/xattr.c:1508 [inline]
ext4_xattr_set_entry+0x1ce6/0x3780 fs/ext4/xattr.c:1649
ext4_xattr_ibody_set+0x78/0x2b0 fs/ext4/xattr.c:2224
ext4_xattr_set_handle+0x8f4/0x13e0 fs/ext4/xattr.c:2380
ext4_xattr_set+0x13a/0x340 fs/ext4/xattr.c:2493
This should be impossible since transaction start sets PF_MEMALLOC_NOFS.
Add some assertions to the code to catch if something isn't working as
expected early.
Link: https://lore.kernel.org/linux-ext4/000000000000563a0205bafb7970@google.com/
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210222171626.21884-1-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
|
|
When generic/371 is run on kvm-xfstests using 5.10 and 5.11 kernels, it
fails at significant rates on the two test scenarios that disable
delayed allocation (ext3conv and data_journal) and force actual block
allocation for the fallocate and pwrite functions in the test. The
failure rate on 5.10 for both ext3conv and data_journal on one test
system typically runs about 85%. On 5.11, the failure rate on ext3conv
sometimes drops to as low as 1% while the rate on data_journal
increases to nearly 100%.
The observed failures are largely due to ext4_should_retry_alloc()
cutting off block allocation retries when s_mb_free_pending (used to
indicate that a transaction in progress will free blocks) is 0.
However, free space is usually available when this occurs during runs
of generic/371. It appears that a thread attempting to allocate
blocks is just missing transaction commits in other threads that
increase the free cluster count and reset s_mb_free_pending while
the allocating thread isn't running. Explicitly testing for free space
availability avoids this race.
The current code uses a post-increment operator in the conditional
expression that determines whether the retry limit has been exceeded.
This means that the conditional expression uses the value of the
retry counter before it's increased, resulting in an extra retry cycle.
The current code actually retries twice before hitting its retry limit
rather than once.
Increasing the retry limit to 3 from the current actual maximum retry
count of 2 in combination with the change described above reduces the
observed failure rate to less that 0.1% on both ext3conv and
data_journal with what should be limited impact on users sensitive to
the overhead caused by retries.
A per filesystem percpu counter exported via sysfs is added to allow
users or developers to track the number of times the retry limit is
exceeded without resorting to debugging methods. This should provide
some insight into worst case retry behavior.
Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Link: https://lore.kernel.org/r/20210218151132.19678-1-enwlinux@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull pstore fixes from Kees Cook:
- Rate-limit ECC warnings (Dmitry Osipenko)
- Fix error path check for NULL (Tetsuo Handa)
* tag 'pstore-v5.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
pstore/ram: Rate-limit "uncorrectable error in header" message
pstore: Fix warning in pstore_kill_sb()
|
|
Pull io_uring fixes from Jens Axboe:
"A bit of a mix between fallout from the worker change, cleanups and
reductions now possible from that change, and fixes in general. In
detail:
- Fully serialize manager and worker creation, fixing races due to
that.
- Clean up some naming that had gone stale.
- SQPOLL fixes.
- Fix race condition around task_work rework that went into this
merge window.
- Implement unshare. Used for when the original task does unshare(2)
or setuid/seteuid and friends, drops the original workers and forks
new ones.
- Drop the only remaining piece of state shuffling we had left, which
was cred. Move it into issue instead, and we can drop all of that
code too.
- Kill f_op->flush() usage. That was such a nasty hack that we had
out of necessity, we no longer need it.
- Following from ->flush() removal, we can also drop various bits of
ctx state related to SQPOLL and cancelations.
- Fix an issue with IOPOLL retry, which originally was fallout from a
filemap change (removing iov_iter_revert()), but uncovered an issue
with iovec re-import too late.
- Fix an issue with system suspend.
- Use xchg() for fallback work, instead of cmpxchg().
- Properly destroy io-wq on exec.
- Add create_io_thread() core helper, and use that in io-wq and
io_uring. This allows us to remove various silly completion events
related to thread setup.
- A few error handling fixes.
This should be the grunt of fixes necessary for the new workers, next
week should be quieter. We've got a pending series from Pavel on
cancelations, and how tasks and rings are indexed. Outside of that,
should just be minor fixes. Even with these fixes, we're still killing
a net ~80 lines"
* tag 'io_uring-5.12-2021-03-05' of git://git.kernel.dk/linux-block: (41 commits)
io_uring: don't restrict issue_flags for io_openat
io_uring: make SQPOLL thread parking saner
io-wq: kill hashed waitqueue before manager exits
io_uring: clear IOCB_WAITQ for non -EIOCBQUEUED return
io_uring: don't keep looping for more events if we can't flush overflow
io_uring: move to using create_io_thread()
kernel: provide create_io_thread() helper
io_uring: reliably cancel linked timeouts
io_uring: cancel-match based on flags
io-wq: ensure all pending work is canceled on exit
io_uring: ensure that threads freeze on suspend
io_uring: remove extra in_idle wake up
io_uring: inline __io_queue_async_work()
io_uring: inline io_req_clean_work()
io_uring: choose right tctx->io_wq for try cancel
io_uring: fix -EAGAIN retry with IOPOLL
io-wq: fix error path leak of buffered write hash map
io_uring: remove sqo_task
io_uring: kill sqo_dead and sqo submission halting
io_uring: ignore double poll add on the same waitqueue head
...
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:
"More regression fixes and stabilization.
Regressions:
- zoned mode
- count zone sizes in wider int types
- fix space accounting for read-only block groups
- subpage: fix page tail zeroing
Fixes:
- fix spurious warning when remounting with free space tree
- fix warning when creating a directory with smack enabled
- ioctl checks for qgroup inheritance when creating a snapshot
- qgroup
- fix missing unlock on error path in zero range
- fix amount of released reservation on error
- fix flushing from unsafe context with open transaction,
potentially deadlocking
- minor build warning fixes"
* tag 'for-5.12-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: zoned: do not account freed region of read-only block group as zone_unusable
btrfs: zoned: use sector_t for zone sectors
btrfs: subpage: fix the false data csum mismatch error
btrfs: fix warning when creating a directory with smack enabled
btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
btrfs: export and rename qgroup_reserve_meta
btrfs: free correct amount of space in btrfs_delayed_inode_reserve_metadata
btrfs: fix spurious free_space_tree remount warning
btrfs: validate qgroup inherit for SNAP_CREATE_V2 ioctl
btrfs: unlock extents in btrfs_zero_range in case of quota reservation errors
btrfs: ref-verify: use 'inline void' keyword ordering
|
|
Commit 384d87ef2c95 ("block: Do not discard buffers under a mounted
filesystem") made paths issuing discard or zeroout requests to the
underlying device try to grab block device in exclusive mode. If that
failed we returned EBUSY to userspace. This however caused unexpected
fallout in userspace where e.g. FUSE filesystems issue discard requests
from userspace daemons although the device is open exclusively by the
kernel. Also shrinking of logical volume by LVM issues discard requests
to a device which may be claimed exclusively because there's another LV
on the same PV. So to avoid these userspace regressions, fall back to
invalidate_inode_pages2_range() instead of returning EBUSY to userspace
and return EBUSY only of that call fails as well (meaning that there's
indeed someone using the particular device range we are trying to
discard).
Link: https://bugzilla.kernel.org/show_bug.cgi?id=211167
Fixes: 384d87ef2c95 ("block: Do not discard buffers under a mounted filesystem")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
45d189c606292 ("io_uring: replace force_nonblock with flags") did
something strange for io_openat() slicing all issue_flags but
IO_URING_F_NONBLOCK. Not a bug for now, but better to just forward the
flags.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
We have this weird true/false return from parking, and then some of the
callers decide to look at that. It can lead to unbalanced parks and
sqd locking. Have the callers check the thread status once it's parked.
We know we have the lock at that point, so it's either valid or it's NULL.
Fix race with parking on thread exit. We need to be careful here with
ordering of the sdq->lock and the IO_SQ_THREAD_SHOULD_PARK bit.
Rename sqd->completion to sqd->parked to reflect that this is the only
thing this completion event doesn.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
If we race with shutting down the io-wq context and someone queueing
a hashed entry, then we can exit the manager with it armed. If it then
triggers after the manager has exited, we can have a use-after-free where
io_wqe_hash_wake() attempts to wake a now gone manager process.
Move the killing of the hashed write queue into the manager itself, so
that we know we've killed it before the task exits.
Fixes: e941894eae31 ("io-wq: make buffered file write hashed work map per-ctx")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
The callback can only be armed, if we get -EIOCBQUEUED returned. It's
important that we clear the WAITQ bit for other cases, otherwise we can
queue for async retry and filemap will assume that we're armed and
return -EAGAIN instead of just blocking for the IO.
Cc: stable@vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
It doesn't make sense to wait for more events to come in, if we can't
even flush the overflow we already have to the ring. Return -EBUSY for
that condition, just like we do for attempts to submit with overflow
pending.
Cc: stable@vger.kernel.org # 5.11
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
This allows us to do task creation and setup without needing to use
completions to try and synchronize with the starting thread. Get rid of
the old io_wq_fork_thread() wrapper, and the 'wq' and 'worker' startup
completion events - we can now do setup before the task is running.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Right now "mount -t virtiofs -o dax myfs /mnt/virtiofs" succeeds even
if filesystem deivce does not have a cache window and hence DAX can't
be supported.
This gives a false sense to user that they are using DAX with virtiofs
but fact of the matter is that they are not.
Fix this by returning error if dax can't be supported and user has asked
for it.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
|
|
Linked timeouts are fired asynchronously (i.e. soft-irq), and use
generic cancellation paths to do its stuff, including poking into io-wq.
The problem is that it's racy to access tctx->io_wq, as
io_uring_task_cancel() and others may be happening at this exact moment.
Mark linked timeouts with REQ_F_INLIFGHT for now, making sure there are
no timeouts before io-wq destraction.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Instead of going into request internals, like checking req->file->f_op,
do match them based on REQ_F_INFLIGHT, it's set only when we want it to
be reliably cancelled.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
zone_unusable
We migrate zone unusable bytes to read-only bytes when a block group is
set to read-only, and account all the free region as bytes_readonly.
Thus, we should not increase block_group->zone_unusable when the block
group is read-only.
Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
We need to use sector_t for zone_sectors, or it would set the zone size
to zero when the size >= 4GB (= 2^24 sectors) by shifting the
zone_sectors value by SECTOR_SHIFT. We're assuming zones sizes up to
8GiB.
Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
If we race on shutting down the io-wq, then we should ensure that any
work that was queued after workers shutdown is canceled. Harden the
add work check a bit too, checking for IO_WQ_BIT_EXIT and cancel if
it's set.
Add a WARN_ON() for having any work before we kill the io-wq context.
Reported-by: syzbot+91b4b56ead187d35c9d3@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
Alex reports that his system fails to suspend using 5.12-rc1, with the
following dump:
[ 240.650300] PM: suspend entry (deep)
[ 240.650748] Filesystems sync: 0.000 seconds
[ 240.725605] Freezing user space processes ...
[ 260.739483] Freezing of tasks failed after 20.013 seconds (3 tasks refusing to freeze, wq_busy=0):
[ 260.739497] task:iou-mgr-446 state:S stack: 0 pid: 516 ppid: 439 flags:0x00004224
[ 260.739504] Call Trace:
[ 260.739507] ? sysvec_apic_timer_interrupt+0xb/0x81
[ 260.739515] ? pick_next_task_fair+0x197/0x1cde
[ 260.739519] ? sysvec_reschedule_ipi+0x2f/0x6a
[ 260.739522] ? asm_sysvec_reschedule_ipi+0x12/0x20
[ 260.739525] ? __schedule+0x57/0x6d6
[ 260.739529] ? del_timer_sync+0xb9/0x115
[ 260.739533] ? schedule+0x63/0xd5
[ 260.739536] ? schedule_timeout+0x219/0x356
[ 260.739540] ? __next_timer_interrupt+0xf1/0xf1
[ 260.739544] ? io_wq_manager+0x73/0xb1
[ 260.739549] ? io_wq_create+0x262/0x262
[ 260.739553] ? ret_from_fork+0x22/0x30
[ 260.739557] task:iou-mgr-517 state:S stack: 0 pid: 522 ppid: 439 flags:0x00004224
[ 260.739561] Call Trace:
[ 260.739563] ? sysvec_apic_timer_interrupt+0xb/0x81
[ 260.739566] ? pick_next_task_fair+0x16f/0x1cde
[ 260.739569] ? sysvec_apic_timer_interrupt+0xb/0x81
[ 260.739571] ? asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 260.739574] ? __schedule+0x5b7/0x6d6
[ 260.739578] ? del_timer_sync+0x70/0x115
[ 260.739581] ? schedule_timeout+0x211/0x356
[ 260.739585] ? __next_timer_interrupt+0xf1/0xf1
[ 260.739588] ? io_wq_check_workers+0x15/0x11f
[ 260.739592] ? io_wq_manager+0x69/0xb1
[ 260.739596] ? io_wq_create+0x262/0x262
[ 260.739600] ? ret_from_fork+0x22/0x30
[ 260.739603] task:iou-wrk-517 state:S stack: 0 pid: 523 ppid: 439 flags:0x00004224
[ 260.739607] Call Trace:
[ 260.739609] ? __schedule+0x5b7/0x6d6
[ 260.739614] ? schedule+0x63/0xd5
[ 260.739617] ? schedule_timeout+0x219/0x356
[ 260.739621] ? __next_timer_interrupt+0xf1/0xf1
[ 260.739624] ? task_thread.isra.0+0x148/0x3af
[ 260.739628] ? task_thread_unbound+0xa/0xa
[ 260.739632] ? task_thread_bound+0x7/0x7
[ 260.739636] ? ret_from_fork+0x22/0x30
[ 260.739647] OOM killer enabled.
[ 260.739648] Restarting tasks ... done.
[ 260.740077] PM: suspend exit
Play nice and ensure that any thread we create will call try_to_freeze()
at an opportune time so that memory suspend can proceed. For the io-wq
worker threads, mark them as PF_NOFREEZE. They could potentially be
blocked for a long time.
Reported-by: Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Tested-by: Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
|
io_dismantle_req() is always followed by io_put_task(), which already do
proper in_idle wake ups, so we can skip waking the owner task in
io_dismantle_req(). The rules are simpler now, do io_put_task() shortly
after ending a request, and it will be fine.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|