summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig24
-rw-r--r--fs/Makefile2
-rw-r--r--fs/attr.c50
-rw-r--r--fs/block_dev.c1695
-rw-r--r--fs/btrfs/disk-io.c48
-rw-r--r--fs/btrfs/ioctl.c15
-rw-r--r--fs/btrfs/misc.h2
-rw-r--r--fs/btrfs/ordered-data.c8
-rw-r--r--fs/btrfs/volumes.c48
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/cache.h6
-rw-r--r--fs/ceph/caps.c266
-rw-r--r--fs/ceph/file.c32
-rw-r--r--fs/ceph/inode.c11
-rw-r--r--fs/ceph/mds_client.c218
-rw-r--r--fs/ceph/mds_client.h5
-rw-r--r--fs/ceph/mdsmap.c12
-rw-r--r--fs/ceph/metric.c4
-rw-r--r--fs/ceph/snap.c59
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.h9
-rw-r--r--fs/ceph/xattr.c19
-rw-r--r--fs/coredump.c15
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/eventpoll.c23
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/super.c3
-rw-r--r--fs/filesystems.c27
-rw-r--r--fs/fuse/control.c10
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/file.c45
-rw-r--r--fs/fuse/fuse_i.h20
-rw-r--r--fs/fuse/inode.c203
-rw-r--r--fs/fuse/virtio_fs.c12
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/internal.h2
-rw-r--r--fs/io-wq.c445
-rw-r--r--fs/io_uring.c94
-rw-r--r--fs/ksmbd/ndr.c383
-rw-r--r--fs/ksmbd/oplock.c6
-rw-r--r--fs/ksmbd/smb2pdu.c69
-rw-r--r--fs/ksmbd/smb_common.c4
-rw-r--r--fs/ksmbd/smb_common.h1
-rw-r--r--fs/ksmbd/smbacl.c79
-rw-r--r--fs/ksmbd/smbacl.h25
-rw-r--r--fs/ksmbd/transport_rdma.c2
-rw-r--r--fs/ksmbd/vfs.c47
-rw-r--r--fs/ksmbd/vfs.h3
-rw-r--r--fs/ksmbd/vfs_cache.c16
-rw-r--r--fs/ksmbd/vfs_cache.h1
-rw-r--r--fs/locks.c6
-rw-r--r--fs/nilfs2/sysfs.c26
-rw-r--r--fs/nilfs2/the_nilfs.c9
-rw-r--r--fs/notify/mark.c1
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/array.c18
-rw-r--r--fs/proc/base.c5
-rw-r--r--fs/select.c4
-rw-r--r--fs/xfs/xfs_super.c16
61 files changed, 1639 insertions, 2536 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index d8207a1b8c44..47af46a573ba 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,7 +43,7 @@ source "fs/f2fs/Kconfig"
source "fs/zonefs/Kconfig"
config FS_DAX
- bool "Direct Access (DAX) support"
+ bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
@@ -53,8 +53,23 @@ config FS_DAX
Direct Access (DAX) can be used on memory-backed block devices.
If the block device supports DAX and the filesystem supports DAX,
then you can avoid using the pagecache to buffer I/Os. Turning
- on this option will compile in support for DAX; you will need to
- mount the filesystem using the -o dax option.
+ on this option will compile in support for DAX.
+
+ For a DAX device to support file system access it needs to have
+ struct pages. For the nfit based NVDIMMs this can be enabled
+ using the ndctl utility:
+
+ # ndctl create-namespace --force --reconfig=namespace0.0 \
+ --mode=fsdax --map=mem
+
+ See the 'create-namespace' man page for details on the overhead of
+ --map=mem:
+ https://docs.pmem.io/ndctl-user-guide/ndctl-man-pages/ndctl-create-namespace
+
+ For ndctl to work CONFIG_DEV_DAX needs to be enabled as well. For most
+ file systems DAX support needs to be manually enabled globally or
+ per-inode using a mount option as well. See the file documentation in
+ Documentation/filesystems/dax.rst for details.
If you do not have a block device that is capable of using this,
or if unsure, say N. Saying Y will increase the size of the kernel
@@ -219,8 +234,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
- ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Makefile b/fs/Makefile
index 2f21300851ae..9e712aa5e254 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \
kernel_read_file.o remap_range.o
ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o block_dev.o direct-io.o mpage.o
+obj-y += buffer.o direct-io.o mpage.o
else
obj-y += no-block.o
endif
diff --git a/fs/attr.c b/fs/attr.c
index 87ef39db1c34..473d21b3a86d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -249,6 +249,34 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
}
EXPORT_SYMBOL(setattr_copy);
+int may_setattr(struct user_namespace *mnt_userns, struct inode *inode,
+ unsigned int ia_valid)
+{
+ int error;
+
+ if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+ }
+
+ /*
+ * If utimes(2) and friends are called with times == NULL (or both
+ * times are UTIME_NOW), then we need to check for write permission
+ */
+ if (ia_valid & ATTR_TOUCH) {
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ if (!inode_owner_or_capable(mnt_userns, inode)) {
+ error = inode_permission(mnt_userns, inode, MAY_WRITE);
+ if (error)
+ return error;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(may_setattr);
+
/**
* notify_change - modify attributes of a filesytem object
* @mnt_userns: user namespace of the mount the inode was found from
@@ -290,25 +318,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
WARN_ON_ONCE(!inode_is_locked(inode));
- if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- }
-
- /*
- * If utimes(2) and friends are called with times == NULL (or both
- * times are UTIME_NOW), then we need to check for write permission
- */
- if (ia_valid & ATTR_TOUCH) {
- if (IS_IMMUTABLE(inode))
- return -EPERM;
-
- if (!inode_owner_or_capable(mnt_userns, inode)) {
- error = inode_permission(mnt_userns, inode, MAY_WRITE);
- if (error)
- return error;
- }
- }
+ error = may_setattr(mnt_userns, inode, ia_valid);
+ if (error)
+ return error;
if ((ia_valid & ATTR_MODE)) {
umode_t amode = attr->ia_mode;
diff --git a/fs/block_dev.c b/fs/block_dev.c
deleted file mode 100644
index 45df6cbccf12..000000000000
--- a/fs/block_dev.c
+++ /dev/null
@@ -1,1695 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright (C) 2016 - 2020 Christoph Hellwig
- */
-
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/fcntl.h>
-#include <linux/slab.h>
-#include <linux/kmod.h>
-#include <linux/major.h>
-#include <linux/device_cgroup.h>
-#include <linux/highmem.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/module.h>
-#include <linux/blkpg.h>
-#include <linux/magic.h>
-#include <linux/buffer_head.h>
-#include <linux/swap.h>
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-#include <linux/mpage.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/uio.h>
-#include <linux/namei.h>
-#include <linux/log2.h>
-#include <linux/cleancache.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/falloc.h>
-#include <linux/part_stat.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-#include "internal.h"
-#include "../block/blk.h"
-
-struct bdev_inode {
- struct block_device bdev;
- struct inode vfs_inode;
-};
-
-static const struct address_space_operations def_blk_aops;
-
-static inline struct bdev_inode *BDEV_I(struct inode *inode)
-{
- return container_of(inode, struct bdev_inode, vfs_inode);
-}
-
-struct block_device *I_BDEV(struct inode *inode)
-{
- return &BDEV_I(inode)->bdev;
-}
-EXPORT_SYMBOL(I_BDEV);
-
-static void bdev_write_inode(struct block_device *bdev)
-{
- struct inode *inode = bdev->bd_inode;
- int ret;
-
- spin_lock(&inode->i_lock);
- while (inode->i_state & I_DIRTY) {
- spin_unlock(&inode->i_lock);
- ret = write_inode_now(inode, true);
- if (ret) {
- char name[BDEVNAME_SIZE];
- pr_warn_ratelimited("VFS: Dirty inode writeback failed "
- "for block device %s (err=%d).\n",
- bdevname(bdev, name), ret);
- }
- spin_lock(&inode->i_lock);
- }
- spin_unlock(&inode->i_lock);
-}
-
-/* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
-{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
- if (mapping_empty(mapping))
- return;
-
- invalidate_bh_lrus();
- truncate_inode_pages(mapping, 0);
-}
-
-/* Invalidate clean unused buffers and pagecache. */
-void invalidate_bdev(struct block_device *bdev)
-{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
- if (mapping->nrpages) {
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
- }
- /* 99% of the time, we don't need to flush the cleancache on the bdev.
- * But, for the strange corners, lets be cautious
- */
- cleancache_invalidate_inode(mapping);
-}
-EXPORT_SYMBOL(invalidate_bdev);
-
-/*
- * Drop all buffers & page cache for given bdev range. This function bails
- * with error if bdev has other exclusive owner (such as filesystem).
- */
-int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
- loff_t lstart, loff_t lend)
-{
- /*
- * If we don't hold exclusive handle for the device, upgrade to it
- * while we discard the buffer cache to avoid discarding buffers
- * under live filesystem.
- */
- if (!(mode & FMODE_EXCL)) {
- int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
- if (err)
- goto invalidate;
- }
-
- truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
- if (!(mode & FMODE_EXCL))
- bd_abort_claiming(bdev, truncate_bdev_range);
- return 0;
-
-invalidate:
- /*
- * Someone else has handle exclusively open. Try invalidating instead.
- * The 'end' argument is inclusive so the rounding is safe.
- */
- return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
- lstart >> PAGE_SHIFT,
- lend >> PAGE_SHIFT);
-}
-
-static void set_init_blocksize(struct block_device *bdev)
-{
- unsigned int bsize = bdev_logical_block_size(bdev);
- loff_t size = i_size_read(bdev->bd_inode);
-
- while (bsize < PAGE_SIZE) {
- if (size & bsize)
- break;
- bsize <<= 1;
- }
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
-}
-
-int set_blocksize(struct block_device *bdev, int size)
-{
- /* Size must be a power of two, and between 512 and PAGE_SIZE */
- if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
- return -EINVAL;
-
- /* Size cannot be smaller than the size supported by the device */
- if (size < bdev_logical_block_size(bdev))
- return -EINVAL;
-
- /* Don't change the size if it is same as current */
- if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
- sync_blockdev(bdev);
- bdev->bd_inode->i_blkbits = blksize_bits(size);
- kill_bdev(bdev);
- }
- return 0;
-}
-
-EXPORT_SYMBOL(set_blocksize);
-
-int sb_set_blocksize(struct super_block *sb, int size)
-{
- if (set_blocksize(sb->s_bdev, size))
- return 0;
- /* If we get here, we know size is power of two
- * and it's value is between 512 and PAGE_SIZE */
- sb->s_blocksize = size;
- sb->s_blocksize_bits = blksize_bits(size);
- return sb->s_blocksize;
-}
-
-EXPORT_SYMBOL(sb_set_blocksize);
-
-int sb_min_blocksize(struct super_block *sb, int size)
-{
- int minsize = bdev_logical_block_size(sb->s_bdev);
- if (size < minsize)
- size = minsize;
- return sb_set_blocksize(sb, size);
-}
-
-EXPORT_SYMBOL(sb_min_blocksize);
-
-static int
-blkdev_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- set_buffer_mapped(bh);
- return 0;
-}
-
-static struct inode *bdev_file_inode(struct file *file)
-{
- return file->f_mapping->host;
-}
-
-static unsigned int dio_bio_write_op(struct kiocb *iocb)
-{
- unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-
- /* avoid the need for a I/O completion work item */
- if (iocb->ki_flags & IOCB_DSYNC)
- op |= REQ_FUA;
- return op;
-}
-
-#define DIO_INLINE_BIO_VECS 4
-
-static void blkdev_bio_end_io_simple(struct bio *bio)
-{
- struct task_struct *waiter = bio->bi_private;
-
- WRITE_ONCE(bio->bi_private, NULL);
- blk_wake_io_task(waiter);
-}
-
-static ssize_t
-__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
-{
- struct file *file = iocb->ki_filp;
- struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
- loff_t pos = iocb->ki_pos;
- bool should_dirty = false;
- struct bio bio;
- ssize_t ret;
- blk_qc_t qc;
-
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
-
- if (nr_pages <= DIO_INLINE_BIO_VECS)
- vecs = inline_vecs;
- else {
- vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
- if (!vecs)
- return -ENOMEM;
- }
-
- bio_init(&bio, vecs, nr_pages);
- bio_set_dev(&bio, bdev);
- bio.bi_iter.bi_sector = pos >> 9;
- bio.bi_write_hint = iocb->ki_hint;
- bio.bi_private = current;
- bio.bi_end_io = blkdev_bio_end_io_simple;
- bio.bi_ioprio = iocb->ki_ioprio;
-
- ret = bio_iov_iter_get_pages(&bio, iter);
- if (unlikely(ret))
- goto out;
- ret = bio.bi_iter.bi_size;
-
- if (iov_iter_rw(iter) == READ) {
- bio.bi_opf = REQ_OP_READ;
- if (iter_is_iovec(iter))
- should_dirty = true;
- } else {
- bio.bi_opf = dio_bio_write_op(iocb);
- task_io_account_write(ret);
- }
- if (iocb->ki_flags & IOCB_NOWAIT)
- bio.bi_opf |= REQ_NOWAIT;
- if (iocb->ki_flags & IOCB_HIPRI)
- bio_set_polled(&bio, iocb);
-
- qc = submit_bio(&bio);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!READ_ONCE(bio.bi_private))
- break;
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc, true))
- blk_io_schedule();
- }
- __set_current_state(TASK_RUNNING);
-
- bio_release_pages(&bio, should_dirty);
- if (unlikely(bio.bi_status))
- ret = blk_status_to_errno(bio.bi_status);
-
-out:
- if (vecs != inline_vecs)
- kfree(vecs);
-
- bio_uninit(&bio);
-
- return ret;
-}
-
-struct blkdev_dio {
- union {
- struct kiocb *iocb;
- struct task_struct *waiter;
- };
- size_t size;
- atomic_t ref;
- bool multi_bio : 1;
- bool should_dirty : 1;
- bool is_sync : 1;
- struct bio bio;
-};
-
-static struct bio_set blkdev_dio_pool;
-
-static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
-{
- struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
- struct request_queue *q = bdev_get_queue(bdev);
-
- return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
-}
-
-static void blkdev_bio_end_io(struct bio *bio)
-{
- struct blkdev_dio *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
-
- if (bio->bi_status && !dio->bio.bi_status)
- dio->bio.bi_status = bio->bi_status;
-
- if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
- if (!dio->is_sync) {
- struct kiocb *iocb = dio->iocb;
- ssize_t ret;
-
- if (likely(!dio->bio.bi_status)) {
- ret = dio->size;
- iocb->ki_pos += ret;
- } else {
- ret = blk_status_to_errno(dio->bio.bi_status);
- }
-
- dio->iocb->ki_complete(iocb, ret, 0);
- if (dio->multi_bio)
- bio_put(&dio->bio);
- } else {
- struct task_struct *waiter = dio->waiter;
-
- WRITE_ONCE(dio->waiter, NULL);
- blk_wake_io_task(waiter);
- }
- }
-
- if (should_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- bio_release_pages(bio, false);
- bio_put(bio);
- }
-}
-
-static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = bdev_file_inode(file);
- struct block_device *bdev = I_BDEV(inode);
- struct blk_plug plug;
- struct blkdev_dio *dio;
- struct bio *bio;
- bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
- bool is_read = (iov_iter_rw(iter) == READ), is_sync;
- loff_t pos = iocb->ki_pos;
- blk_qc_t qc = BLK_QC_T_NONE;
- int ret = 0;
-
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
-
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
-
- dio = container_of(bio, struct blkdev_dio, bio);
- dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync) {
- dio->waiter = current;
- bio_get(bio);
- } else {
- dio->iocb = iocb;
- }
-
- dio->size = 0;
- dio->multi_bio = false;
- dio->should_dirty = is_read && iter_is_iovec(iter);
-
- /*
- * Don't plug for HIPRI/polled IO, as those should go straight
- * to issue
- */
- if (!is_poll)
- blk_start_plug(&plug);
-
- for (;;) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = pos >> 9;
- bio->bi_write_hint = iocb->ki_hint;
- bio->bi_private = dio;
- bio->bi_end_io = blkdev_bio_end_io;
- bio->bi_ioprio = iocb->ki_ioprio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
- break;
- }
-
- if (is_read) {
- bio->bi_opf = REQ_OP_READ;
- if (dio->should_dirty)
- bio_set_pages_dirty(bio);
- } else {
- bio->bi_opf = dio_bio_write_op(iocb);
- task_io_account_write(bio->bi_iter.bi_size);
- }
- if (iocb->ki_flags & IOCB_NOWAIT)
- bio->bi_opf |= REQ_NOWAIT;
-
- dio->size += bio->bi_iter.bi_size;
- pos += bio->bi_iter.bi_size;
-
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
- if (!nr_pages) {
- bool polled = false;
-
- if (iocb->ki_flags & IOCB_HIPRI) {
- bio_set_polled(bio, iocb);
- polled = true;
- }
-
- qc = submit_bio(bio);
-
- if (polled)
- WRITE_ONCE(iocb->ki_cookie, qc);
- break;
- }
-
- if (!dio->multi_bio) {
- /*
- * AIO needs an extra reference to ensure the dio
- * structure which is embedded into the first bio
- * stays around.
- */
- if (!is_sync)
- bio_get(bio);
- dio->multi_bio = true;
- atomic_set(&dio->ref, 2);
- } else {
- atomic_inc(&dio->ref);
- }
-
- submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
- }
-
- if (!is_poll)
- blk_finish_plug(&plug);
-
- if (!is_sync)
- return -EIOCBQUEUED;
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!READ_ONCE(dio->waiter))
- break;
-
- if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(bdev), qc, true))
- blk_io_schedule();
- }
- __set_current_state(TASK_RUNNING);
-
- if (!ret)
- ret = blk_status_to_errno(dio->bio.bi_status);
- if (likely(!ret))
- ret = dio->size;
-
- bio_put(&dio->bio);
- return ret;
-}
-
-static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- unsigned int nr_pages;
-
- if (!iov_iter_count(iter))
- return 0;
-
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
-}
-
-static __init int blkdev_init(void)
-{
- return bioset_init(&blkdev_dio_pool, 4,
- offsetof(struct blkdev_dio, bio),
- BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
-}
-module_init(blkdev_init);
-
-int __sync_blockdev(struct block_device *bdev, int wait)
-{
- if (!bdev)
- return 0;
- if (!wait)
- return filemap_flush(bdev->bd_inode->i_mapping);
- return filemap_write_and_wait(bdev->bd_inode->i_mapping);
-}
-
-/*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping. Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
- return __sync_blockdev(bdev, 1);
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device. Filesystem data as well as the underlying block
- * device. Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
- struct super_block *sb = get_super(bdev);
- if (sb) {
- int res = sync_filesystem(sb);
- drop_super(sb);
- return res;
- }
- return sync_blockdev(bdev);
-}
-EXPORT_SYMBOL(fsync_bdev);
-
-/**
- * freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev: blockdevice to lock
- *
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-int freeze_bdev(struct block_device *bdev)
-{
- struct super_block *sb;
- int error = 0;
-
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1)
- goto done;
-
- sb = get_active_super(bdev);
- if (!sb)
- goto sync;
- if (sb->s_op->freeze_super)
- error = sb->s_op->freeze_super(sb);
- else
- error = freeze_super(sb);
- deactivate_super(sb);
-
- if (error) {
- bdev->bd_fsfreeze_count--;
- goto done;
- }
- bdev->bd_fsfreeze_sb = sb;
-
-sync:
- sync_blockdev(bdev);
-done:
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev -- unlock filesystem
- * @bdev: blockdevice to unlock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev)
-{
- struct super_block *sb;
- int error = -EINVAL;
-
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count)
- goto out;
-
- error = 0;
- if (--bdev->bd_fsfreeze_count > 0)
- goto out;
-
- sb = bdev->bd_fsfreeze_sb;
- if (!sb)
- goto out;
-
- if (sb->s_op->thaw_super)
- error = sb->s_op->thaw_super(sb);
- else
- error = thaw_super(sb);
- if (error)
- bdev->bd_fsfreeze_count++;
- else
- bdev->bd_fsfreeze_sb = NULL;
-out:
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, blkdev_get_block, wbc);
-}
-
-static int blkdev_readpage(struct file * file, struct page * page)
-{
- return block_read_full_page(page, blkdev_get_block);
-}
-
-static void blkdev_readahead(struct readahead_control *rac)
-{
- mpage_readahead(rac, blkdev_get_block);
-}
-
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- return block_write_begin(mapping, pos, len, flags, pagep,
- blkdev_get_block);
-}
-
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- int ret;
- ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- unlock_page(page);
- put_page(page);
-
- return ret;
-}
-
-/*
- * private llseek:
- * for a block special file file_inode(file)->i_size is zero
- * so we compute the size by hand (just as in block_read/write above)
- */
-static loff_t block_llseek(struct file *file, loff_t offset, int whence)
-{
- struct inode *bd_inode = bdev_file_inode(file);
- loff_t retval;
-
- inode_lock(bd_inode);
- retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
- inode_unlock(bd_inode);
- return retval;
-}
-
-static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *bd_inode = bdev_file_inode(filp);
- struct block_device *bdev = I_BDEV(bd_inode);
- int error;
-
- error = file_write_and_wait_range(filp, start, end);
- if (error)
- return error;
-
- /*
- * There is no need to serialise calls to blkdev_issue_flush with
- * i_mutex and doing so causes performance issues with concurrent
- * O_SYNC writers to a block device.
- */
- error = blkdev_issue_flush(bdev);
- if (error == -EOPNOTSUPP)
- error = 0;
-
- return error;
-}
-
-/**
- * bdev_read_page() - Start reading a page from a block device
- * @bdev: The device to read the page from
- * @sector: The offset on the device to read the page to (need not be aligned)
- * @page: The page to read
- *
- * On entry, the page should be locked. It will be unlocked when the page
- * has been read. If the block driver implements rw_page synchronously,
- * that will be true on exit from this function, but it need not be.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to read this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_read_page(struct block_device *bdev, sector_t sector,
- struct page *page)
-{
- const struct block_device_operations *ops = bdev->bd_disk->fops;
- int result = -EOPNOTSUPP;
-
- if (!ops->rw_page || bdev_get_integrity(bdev))
- return result;
-
- result = blk_queue_enter(bdev->bd_disk->queue, 0);
- if (result)
- return result;
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
- REQ_OP_READ);
- blk_queue_exit(bdev->bd_disk->queue);
- return result;
-}
-
-/**
- * bdev_write_page() - Start writing a page to a block device
- * @bdev: The device to write the page to
- * @sector: The offset on the device to write the page to (need not be aligned)
- * @page: The page to write
- * @wbc: The writeback_control for the write
- *
- * On entry, the page should be locked and not currently under writeback.
- * On exit, if the write started successfully, the page will be unlocked and
- * under writeback. If the write failed already (eg the driver failed to
- * queue the page to the device), the page will still be locked. If the
- * caller is a ->writepage implementation, it will need to unlock the page.
- *
- * Errors returned by this function are usually "soft", eg out of memory, or
- * queue full; callers should try a different route to write this page rather
- * than propagate an error back up the stack.
- *
- * Return: negative errno if an error occurs, 0 if submission was successful.
- */
-int bdev_write_page(struct block_device *bdev, sector_t sector,
- struct page *page, struct writeback_control *wbc)
-{
- int result;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
-
- if (!ops->rw_page || bdev_get_integrity(bdev))
- return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_disk->queue, 0);
- if (result)
- return result;
-
- set_page_writeback(page);
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
- REQ_OP_WRITE);
- if (result) {
- end_page_writeback(page);
- } else {
- clean_page_buffers(page);
- unlock_page(page);
- }
- blk_queue_exit(bdev->bd_disk->queue);
- return result;
-}
-
-/*
- * pseudo-fs
- */
-
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
-static struct kmem_cache * bdev_cachep __read_mostly;
-
-static struct inode *bdev_alloc_inode(struct super_block *sb)
-{
- struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
-
- if (!ei)
- return NULL;
- memset(&ei->bdev, 0, sizeof(ei->bdev));
- return &ei->vfs_inode;
-}
-
-static void bdev_free_inode(struct inode *inode)
-{
- struct block_device *bdev = I_BDEV(inode);
-
- free_percpu(bdev->bd_stats);
- kfree(bdev->bd_meta_info);
-
- if (!bdev_is_partition(bdev)) {
- if (bdev->bd_disk && bdev->bd_disk->bdi)
- bdi_put(bdev->bd_disk->bdi);
- kfree(bdev->bd_disk);
- }
-
- if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
- blk_free_ext_minor(MINOR(bdev->bd_dev));
-
- kmem_cache_free(bdev_cachep, BDEV_I(inode));
-}
-
-static void init_once(void *data)
-{
- struct bdev_inode *ei = data;
-
- inode_init_once(&ei->vfs_inode);
-}
-
-static void bdev_evict_inode(struct inode *inode)
-{
- truncate_inode_pages_final(&inode->i_data);
- invalidate_inode_buffers(inode); /* is it needed here? */
- clear_inode(inode);
-}
-
-static const struct super_operations bdev_sops = {
- .statfs = simple_statfs,
- .alloc_inode = bdev_alloc_inode,
- .free_inode = bdev_free_inode,
- .drop_inode = generic_delete_inode,
- .evict_inode = bdev_evict_inode,
-};
-
-static int bd_init_fs_context(struct fs_context *fc)
-{
- struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
- if (!ctx)
- return -ENOMEM;
- fc->s_iflags |= SB_I_CGROUPWB;
- ctx->ops = &bdev_sops;
- return 0;
-}
-
-static struct file_system_type bd_type = {
- .name = "bdev",
- .init_fs_context = bd_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-struct super_block *blockdev_superblock __read_mostly;
-EXPORT_SYMBOL_GPL(blockdev_superblock);
-
-void __init bdev_cache_init(void)
-{
- int err;
- static struct vfsmount *bd_mnt;
-
- bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
- 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
- init_once);
- err = register_filesystem(&bd_type);
- if (err)
- panic("Cannot register bdev pseudo-fs");
- bd_mnt = kern_mount(&bd_type);
- if (IS_ERR(bd_mnt))
- panic("Cannot create bdev pseudo-fs");
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
-}
-
-struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
-{
- struct block_device *bdev;
- struct inode *inode;
-
- inode = new_inode(blockdev_superblock);
- if (!inode)
- return NULL;
- inode->i_mode = S_IFBLK;
- inode->i_rdev = 0;
- inode->i_data.a_ops = &def_blk_aops;
- mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-
- bdev = I_BDEV(inode);
- mutex_init(&bdev->bd_fsfreeze_mutex);
- spin_lock_init(&bdev->bd_size_lock);
- bdev->bd_disk = disk;
- bdev->bd_partno = partno;
- bdev->bd_inode = inode;
- bdev->bd_stats = alloc_percpu(struct disk_stats);
- if (!bdev->bd_stats) {
- iput(inode);
- return NULL;
- }
- return bdev;
-}
-
-void bdev_add(struct block_device *bdev, dev_t dev)
-{
- bdev->bd_dev = dev;
- bdev->bd_inode->i_rdev = dev;
- bdev->bd_inode->i_ino = dev;
- insert_inode_hash(bdev->bd_inode);
-}
-
-long nr_blockdev_pages(void)
-{
- struct inode *inode;
- long ret = 0;
-
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
- ret += inode->i_mapping->nrpages;
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
-
- return ret;
-}
-
-/**
- * bd_may_claim - test whether a block device can be claimed
- * @bdev: block device of interest
- * @whole: whole block device containing @bdev, may equal @bdev
- * @holder: holder trying to claim @bdev
- *
- * Test whether @bdev can be claimed by @holder.
- *
- * CONTEXT:
- * spin_lock(&bdev_lock).
- *
- * RETURNS:
- * %true if @bdev can be claimed, %false otherwise.
- */
-static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
- void *holder)
-{
- if (bdev->bd_holder == holder)
- return true; /* already a holder */
- else if (bdev->bd_holder != NULL)
- return false; /* held by someone else */
- else if (whole == bdev)
- return true; /* is a whole device which isn't held */
-
- else if (whole->bd_holder == bd_may_claim)
- return true; /* is a partition of a device that is being partitioned */
- else if (whole->bd_holder != NULL)
- return false; /* is a partition of a held device */
- else
- return true; /* is a partition of an un-held device */
-}
-
-/**
- * bd_prepare_to_claim - claim a block device
- * @bdev: block device of interest
- * @holder: holder trying to claim @bdev
- *
- * Claim @bdev. This function fails if @bdev is already claimed by another
- * holder and waits if another claiming is in progress. return, the caller
- * has ownership of bd_claiming and bd_holder[s].
- *
- * RETURNS:
- * 0 if @bdev can be claimed, -EBUSY otherwise.
- */
-int bd_prepare_to_claim(struct block_device *bdev, void *holder)
-{
- struct block_device *whole = bdev_whole(bdev);
-
- if (WARN_ON_ONCE(!holder))
- return -EINVAL;
-retry:
- spin_lock(&bdev_lock);
- /* if someone else claimed, fail */
- if (!bd_may_claim(bdev, whole, holder)) {
- spin_unlock(&bdev_lock);
- return -EBUSY;
- }
-
- /* if claiming is already in progress, wait for it to finish */
- if (whole->bd_claiming) {
- wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
- DEFINE_WAIT(wait);
-
- prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&bdev_lock);
- schedule();
- finish_wait(wq, &wait);
- goto retry;
- }
-
- /* yay, all mine */
- whole->bd_claiming = holder;
- spin_unlock(&bdev_lock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
-
-static void bd_clear_claiming(struct block_device *whole, void *holder)
-{
- lockdep_assert_held(&bdev_lock);
- /* tell others that we're done */
- BUG_ON(whole->bd_claiming != holder);
- whole->bd_claiming = NULL;
- wake_up_bit(&whole->bd_claiming, 0);
-}
-
-/**
- * bd_finish_claiming - finish claiming of a block device
- * @bdev: block device of interest
- * @holder: holder that has claimed @bdev
- *
- * Finish exclusive open of a block device. Mark the device as exlusively
- * open by the holder and wake up all waiters for exclusive open to finish.
- */
-static void bd_finish_claiming(struct block_device *bdev, void *holder)
-{
- struct block_device *whole = bdev_whole(bdev);
-
- spin_lock(&bdev_lock);
- BUG_ON(!bd_may_claim(bdev, whole, holder));
- /*
- * Note that for a whole device bd_holders will be incremented twice,
- * and bd_holder will be set to bd_may_claim before being set to holder
- */
- whole->bd_holders++;
- whole->bd_holder = bd_may_claim;
- bdev->bd_holders++;
- bdev->bd_holder = holder;
- bd_clear_claiming(whole, holder);
- spin_unlock(&bdev_lock);
-}
-
-/**
- * bd_abort_claiming - abort claiming of a block device
- * @bdev: block device of interest
- * @holder: holder that has claimed @bdev
- *
- * Abort claiming of a block device when the exclusive open failed. This can be
- * also used when exclusive open is not actually desired and we just needed
- * to block other exclusive openers for a while.
- */
-void bd_abort_claiming(struct block_device *bdev, void *holder)
-{
- spin_lock(&bdev_lock);
- bd_clear_claiming(bdev_whole(bdev), holder);
- spin_unlock(&bdev_lock);
-}
-EXPORT_SYMBOL(bd_abort_claiming);
-
-static void blkdev_flush_mapping(struct block_device *bdev)
-{
- WARN_ON_ONCE(bdev->bd_holders);
- sync_blockdev(bdev);
- kill_bdev(bdev);
- bdev_write_inode(bdev);
-}
-
-static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
-{
- struct gendisk *disk = bdev->bd_disk;
- int ret = 0;
-
- if (disk->fops->open) {
- ret = disk->fops->open(bdev, mode);
- if (ret) {
- /* avoid ghost partitions on a removed medium */
- if (ret == -ENOMEDIUM &&
- test_bit(GD_NEED_PART_SCAN, &disk->state))
- bdev_disk_changed(disk, true);
- return ret;
- }
- }
-
- if (!bdev->bd_openers)
- set_init_blocksize(bdev);
- if (test_bit(GD_NEED_PART_SCAN, &disk->state))
- bdev_disk_changed(disk, false);
- bdev->bd_openers++;
- return 0;;
-}
-
-static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
-{
- if (!--bdev->bd_openers)
- blkdev_flush_mapping(bdev);
- if (bdev->bd_disk->fops->release)
- bdev->bd_disk->fops->release(bdev->bd_disk, mode);
-}
-
-static int blkdev_get_part(struct block_device *part, fmode_t mode)
-{
- struct gendisk *disk = part->bd_disk;
- int ret;
-
- if (part->bd_openers)
- goto done;
-
- ret = blkdev_get_whole(bdev_whole(part), mode);
- if (ret)
- return ret;
-
- ret = -ENXIO;
- if (!bdev_nr_sectors(part))
- goto out_blkdev_put;
-
- disk->open_partitions++;
- set_init_blocksize(part);
-done:
- part->bd_openers++;
- return 0;
-
-out_blkdev_put:
- blkdev_put_whole(bdev_whole(part), mode);
- return ret;
-}
-
-static void blkdev_put_part(struct block_device *part, fmode_t mode)
-{
- struct block_device *whole = bdev_whole(part);
-
- if (--part->bd_openers)
- return;
- blkdev_flush_mapping(part);
- whole->bd_disk->open_partitions--;
- blkdev_put_whole(whole, mode);
-}
-
-struct block_device *blkdev_get_no_open(dev_t dev)
-{
- struct block_device *bdev;
- struct inode *inode;
-
- inode = ilookup(blockdev_superblock, dev);
- if (!inode) {
- blk_request_module(dev);
- inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
- }
-
- /* switch from the inode reference to a device mode one: */
- bdev = &BDEV_I(inode)->bdev;
- if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
- bdev = NULL;
- iput(inode);
-
- if (!bdev)
- return NULL;
- if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
- !try_module_get(bdev->bd_disk->fops->owner)) {
- put_device(&bdev->bd_device);
- return NULL;
- }
-
- return bdev;
-}
-
-void blkdev_put_no_open(struct block_device *bdev)
-{
- module_put(bdev->bd_disk->fops->owner);
- put_device(&bdev->bd_device);
-}
-
-/**
- * blkdev_get_by_dev - open a block device by device number
- * @dev: device number of block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the block device described by device number @dev. If @mode includes
- * %FMODE_EXCL, the block device is opened with exclusive access. Specifying
- * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
- * the same @holder.
- *
- * Use this interface ONLY if you really do not have anything better - i.e. when
- * you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
-{
- bool unblock_events = true;
- struct block_device *bdev;
- struct gendisk *disk;
- int ret;
-
- ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
- MAJOR(dev), MINOR(dev),
- ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
- ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
- if (ret)
- return ERR_PTR(ret);
-
- bdev = blkdev_get_no_open(dev);
- if (!bdev)
- return ERR_PTR(-ENXIO);
- disk = bdev->bd_disk;
-
- if (mode & FMODE_EXCL) {
- ret = bd_prepare_to_claim(bdev, holder);
- if (ret)
- goto put_blkdev;
- }
-
- disk_block_events(disk);
-
- mutex_lock(&disk->open_mutex);
- ret = -ENXIO;
- if (!disk_live(disk))
- goto abort_claiming;
- if (bdev_is_partition(bdev))
- ret = blkdev_get_part(bdev, mode);
- else
- ret = blkdev_get_whole(bdev, mode);
- if (ret)
- goto abort_claiming;
- if (mode & FMODE_EXCL) {
- bd_finish_claiming(bdev, holder);
-
- /*
- * Block event polling for write claims if requested. Any write
- * holder makes the write_holder state stick until all are
- * released. This is good enough and tracking individual
- * writeable reference is too fragile given the way @mode is
- * used in blkdev_get/put().
- */
- if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- unblock_events = false;
- }
- }
- mutex_unlock(&disk->open_mutex);
-
- if (unblock_events)
- disk_unblock_events(disk);
- return bdev;
-
-abort_claiming:
- if (mode & FMODE_EXCL)
- bd_abort_claiming(bdev, holder);
- mutex_unlock(&disk->open_mutex);
- disk_unblock_events(disk);
-put_blkdev:
- blkdev_put_no_open(bdev);
- return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-/**
- * blkdev_get_by_path - open a block device by name
- * @path: path to the block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the block device described by the device file at @path. If @mode
- * includes %FMODE_EXCL, the block device is opened with exclusive access.
- * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may
- * nest for the same @holder.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
- void *holder)
-{
- struct block_device *bdev;
- dev_t dev;
- int error;
-
- error = lookup_bdev(path, &dev);
- if (error)
- return ERR_PTR(error);
-
- bdev = blkdev_get_by_dev(dev, mode, holder);
- if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, mode);
- return ERR_PTR(-EACCES);
- }
-
- return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
-
-static int blkdev_open(struct inode * inode, struct file * filp)
-{
- struct block_device *bdev;
-
- /*
- * Preserve backwards compatibility and allow large file access
- * even if userspace doesn't ask for it explicitly. Some mkfs
- * binary needs it. We might want to drop this workaround
- * during an unstable branch.
- */
- filp->f_flags |= O_LARGEFILE;
-
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
-
- if (filp->f_flags & O_NDELAY)
- filp->f_mode |= FMODE_NDELAY;
- if (filp->f_flags & O_EXCL)
- filp->f_mode |= FMODE_EXCL;
- if ((filp->f_flags & O_ACCMODE) == 3)
- filp->f_mode |= FMODE_WRITE_IOCTL;
-
- bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- filp->f_mapping = bdev->bd_inode->i_mapping;
- filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
- return 0;
-}
-
-void blkdev_put(struct block_device *bdev, fmode_t mode)
-{
- struct gendisk *disk = bdev->bd_disk;
-
- /*
- * Sync early if it looks like we're the last one. If someone else
- * opens the block device between now and the decrement of bd_openers
- * then we did a sync that we didn't need to, but that's not the end
- * of the world and we want to avoid long (could be several minute)
- * syncs while holding the mutex.
- */
- if (bdev->bd_openers == 1)
- sync_blockdev(bdev);
-
- mutex_lock(&disk->open_mutex);
- if (mode & FMODE_EXCL) {
- struct block_device *whole = bdev_whole(bdev);
- bool bdev_free;
-
- /*
- * Release a claim on the device. The holder fields
- * are protected with bdev_lock. open_mutex is to
- * synchronize disk_holder unlinking.
- */
- spin_lock(&bdev_lock);
-
- WARN_ON_ONCE(--bdev->bd_holders < 0);
- WARN_ON_ONCE(--whole->bd_holders < 0);
-
- if ((bdev_free = !bdev->bd_holders))
- bdev->bd_holder = NULL;
- if (!whole->bd_holders)
- whole->bd_holder = NULL;
-
- spin_unlock(&bdev_lock);
-
- /*
- * If this was the last claim, remove holder link and
- * unblock evpoll if it was a write holder.
- */
- if (bdev_free && bdev->bd_write_holder) {
- disk_unblock_events(disk);
- bdev->bd_write_holder = false;
- }
- }
-
- /*
- * Trigger event checking and tell drivers to flush MEDIA_CHANGE
- * event. This is to ensure detection of media removal commanded
- * from userland - e.g. eject(1).
- */
- disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
-
- if (bdev_is_partition(bdev))
- blkdev_put_part(bdev, mode);
- else
- blkdev_put_whole(bdev, mode);
- mutex_unlock(&disk->open_mutex);
-
- blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-static int blkdev_close(struct inode * inode, struct file * filp)
-{
- struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
- blkdev_put(bdev, filp->f_mode);
- return 0;
-}
-
-static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- fmode_t mode = file->f_mode;
-
- /*
- * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
- * to updated it before every ioctl.
- */
- if (file->f_flags & O_NDELAY)
- mode |= FMODE_NDELAY;
- else
- mode &= ~FMODE_NDELAY;
-
- return blkdev_ioctl(bdev, mode, cmd, arg);
-}
-
-/*
- * Write data to the block device. Only intended for the block device itself
- * and the raw driver which basically is a fake block device.
- *
- * Does not take i_mutex for the write and thus is not for general purpose
- * use.
- */
-static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *bd_inode = bdev_file_inode(file);
- loff_t size = i_size_read(bd_inode);
- struct blk_plug plug;
- size_t shorted = 0;
- ssize_t ret;
-
- if (bdev_read_only(I_BDEV(bd_inode)))
- return -EPERM;
-
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
- return -ETXTBSY;
-
- if (!iov_iter_count(from))
- return 0;
-
- if (iocb->ki_pos >= size)
- return -ENOSPC;
-
- if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
- return -EOPNOTSUPP;
-
- size -= iocb->ki_pos;
- if (iov_iter_count(from) > size) {
- shorted = iov_iter_count(from) - size;
- iov_iter_truncate(from, size);
- }
-
- blk_start_plug(&plug);
- ret = __generic_file_write_iter(iocb, from);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
- iov_iter_reexpand(from, iov_iter_count(from) + shorted);
- blk_finish_plug(&plug);
- return ret;
-}
-
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
- struct file *file = iocb->ki_filp;
- struct inode *bd_inode = bdev_file_inode(file);
- loff_t size = i_size_read(bd_inode);
- loff_t pos = iocb->ki_pos;
- size_t shorted = 0;
- ssize_t ret;
-
- if (pos >= size)
- return 0;
-
- size -= pos;
- if (iov_iter_count(to) > size) {
- shorted = iov_iter_count(to) - size;
- iov_iter_truncate(to, size);
- }
-
- ret = generic_file_read_iter(iocb, to);
- iov_iter_reexpand(to, iov_iter_count(to) + shorted);
- return ret;
-}
-
-static int blkdev_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return generic_writepages(mapping, wbc);
-}
-
-static const struct address_space_operations def_blk_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
- .readpage = blkdev_readpage,
- .readahead = blkdev_readahead,
- .writepage = blkdev_writepage,
- .write_begin = blkdev_write_begin,
- .write_end = blkdev_write_end,
- .writepages = blkdev_writepages,
- .direct_IO = blkdev_direct_IO,
- .migratepage = buffer_migrate_page_norefs,
- .is_dirty_writeback = buffer_check_dirty_writeback,
-};
-
-#define BLKDEV_FALLOC_FL_SUPPORTED \
- (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
-
-static long blkdev_fallocate(struct file *file, int mode, loff_t start,
- loff_t len)
-{
- struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- loff_t end = start + len - 1;
- loff_t isize;
- int error;
-
- /* Fail if we don't recognize the flags. */
- if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
- return -EOPNOTSUPP;
-
- /* Don't go off the end of the device. */
- isize = i_size_read(bdev->bd_inode);
- if (start >= isize)
- return -EINVAL;
- if (end >= isize) {
- if (mode & FALLOC_FL_KEEP_SIZE) {
- len = isize - start;
- end = start + len - 1;
- } else
- return -EINVAL;
- }
-
- /*
- * Don't allow IO that isn't aligned to logical block size.
- */
- if ((start | len) & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
-
- /* Invalidate the page cache, including dirty pages. */
- error = truncate_bdev_range(bdev, file->f_mode, start, end);
- if (error)
- return error;
-
- switch (mode) {
- case FALLOC_FL_ZERO_RANGE:
- case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
- break;
- case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
- break;
- case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
- break;
- default:
- return -EOPNOTSUPP;
- }
- if (error)
- return error;
-
- /*
- * Invalidate the page cache again; if someone wandered in and dirtied
- * a page, we just discard it - userspace has no way of knowing whether
- * the write happened before or after discard completing...
- */
- return truncate_bdev_range(bdev, file->f_mode, start, end);
-}
-
-const struct file_operations def_blk_fops = {
- .open = blkdev_open,
- .release = blkdev_close,
- .llseek = block_llseek,
- .read_iter = blkdev_read_iter,
- .write_iter = blkdev_write_iter,
- .iopoll = blkdev_iopoll,
- .mmap = generic_file_mmap,
- .fsync = blkdev_fsync,
- .unlocked_ioctl = block_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = compat_blkdev_ioctl,
-#endif
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fallocate = blkdev_fallocate,
-};
-
-/**
- * lookup_bdev - lookup a struct block_device by name
- * @pathname: special file representing the block device
- * @dev: return value of the block device's dev_t
- *
- * Get a reference to the blockdevice at @pathname in the current
- * namespace if possible and return it. Return ERR_PTR(error)
- * otherwise.
- */
-int lookup_bdev(const char *pathname, dev_t *dev)
-{
- struct inode *inode;
- struct path path;
- int error;
-
- if (!pathname || !*pathname)
- return -EINVAL;
-
- error = kern_path(pathname, LOOKUP_FOLLOW, &path);
- if (error)
- return error;
-
- inode = d_backing_inode(path.dentry);
- error = -ENOTBLK;
- if (!S_ISBLK(inode->i_mode))
- goto out_path_put;
- error = -EACCES;
- if (!may_open_dev(&path))
- goto out_path_put;
-
- *dev = inode->i_rdev;
- error = 0;
-out_path_put:
- path_put(&path);
- return error;
-}
-EXPORT_SYMBOL(lookup_bdev);
-
-int __invalidate_device(struct block_device *bdev, bool kill_dirty)
-{
- struct super_block *sb = get_super(bdev);
- int res = 0;
-
- if (sb) {
- /*
- * no need to lock the super, get_super holds the
- * read mutex so the filesystem cannot go away
- * under us (->put_super runs with the write lock
- * hold).
- */
- shrink_dcache_sb(sb);
- res = invalidate_inodes(sb, kill_dirty);
- drop_super(sb);
- }
- invalidate_bdev(bdev);
- return res;
-}
-EXPORT_SYMBOL(__invalidate_device);
-
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
-{
- struct inode *inode, *old_inode = NULL;
-
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
- struct address_space *mapping = inode->i_mapping;
- struct block_device *bdev;
-
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
- mapping->nrpages == 0) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
- /*
- * We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the
- * s_inode_list_lock We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it under
- * s_inode_list_lock. So we keep the reference and iput it
- * later.
- */
- iput(old_inode);
- old_inode = inode;
- bdev = I_BDEV(inode);
-
- mutex_lock(&bdev->bd_disk->open_mutex);
- if (bdev->bd_openers)
- func(bdev, arg);
- mutex_unlock(&bdev->bd_disk->open_mutex);
-
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- }
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
- iput(old_inode);
-}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f9515dccce0..355ea88d5c5f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
+ fs_info->stripesize = stripesize;
+
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
@@ -3341,30 +3365,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
btrfs_info(fs_info, "has skinny extents");
/*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
-
- nodesize = btrfs_super_nodesize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = sectorsize;
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
-
- /* Cache block sizes */
- fs_info->nodesize = nodesize;
- fs_info->sectorsize = sectorsize;
- fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
- fs_info->stripesize = stripesize;
-
- /*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
*/
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 41524f9aeac3..cc61813213d8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3223,6 +3223,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
int ret;
bool cancel = false;
@@ -3255,9 +3257,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
/* Exclusive operation is now claimed */
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
+ ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
btrfs_exclop_finish(fs_info);
@@ -3273,6 +3275,8 @@ out:
kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
+ if (bdev)
+ blkdev_put(bdev, mode);
return ret;
}
@@ -3281,6 +3285,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
int ret;
bool cancel;
@@ -3302,7 +3308,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
@@ -3311,7 +3317,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
kfree(vol_args);
out_drop_write:
mnt_drop_write_file(file);
-
+ if (bdev)
+ blkdev_put(bdev, mode);
return ret;
}
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 6461ebc3a1c1..340f995652f2 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -5,7 +5,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
#include <linux/rbtree.h>
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index edb65abf0393..6b51fd2ec5ac 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1049,6 +1049,7 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
u64 len)
{
struct inode *inode = ordered->inode;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 file_offset = ordered->file_offset + pos;
u64 disk_bytenr = ordered->disk_bytenr + pos;
u64 num_bytes = len;
@@ -1066,6 +1067,13 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
else
type = __ffs(flags_masked);
+ /*
+ * The splitting extent is already counted and will be added again
+ * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
+ * double counting.
+ */
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
+ fs_info->delalloc_batch);
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
WARN_ON_ONCE(1);
ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ec3a874165de..464485aa7318 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -558,6 +558,8 @@ static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *device, *tmp_device;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
if (path)
ret = -ENOENT;
@@ -988,11 +990,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *orig_dev;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
- mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
@@ -1024,10 +1027,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
@@ -1869,15 +1870,17 @@ out:
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
*/
-static void update_dev_time(const char *path_name)
+static void update_dev_time(struct block_device *bdev)
{
- struct file *filp;
+ struct inode *inode = bdev->bd_inode;
+ struct timespec64 now;
- filp = filp_open(path_name, O_RDWR, 0);
- if (IS_ERR(filp))
+ /* Shouldn't happen but just in case. */
+ if (!inode)
return;
- file_update_time(filp);
- filp_close(filp, NULL);
+
+ now = current_time(inode);
+ generic_update_time(inode, &now, S_MTIME | S_CTIME);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -2053,11 +2056,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
+ update_dev_time(bdev);
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid, struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2171,15 +2174,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_devices->device_list_mutex);
/*
- * at this point, the device is zero sized and detached from
- * the devices list. All that's left is to zero out the old
- * supers and free the device.
+ * At this point, the device is zero sized and detached from the
+ * devices list. All that's left is to zero out the old supers and
+ * free the device.
+ *
+ * We cannot call btrfs_close_bdev() here because we're holding the sb
+ * write lock, and blkdev_put() will pull in the ->open_mutex on the
+ * block device and it's dependencies. Instead just flush the device
+ * and let the caller do the final blkdev_put.
*/
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
device->name->str);
+ if (device->bdev) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+ }
- btrfs_close_bdev(device);
+ *bdev = device->bdev;
+ *mode = device->mode;
synchronize_rcu();
btrfs_free_device(device);
@@ -2706,7 +2720,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(device_path);
+ update_dev_time(bdev);
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b082250b42e0..2183361db614 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -472,7 +472,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u8 *uuid);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid);
+ const char *device_path, u64 devid,
+ struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7e7a897ae0d3..99b80b5c7a93 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1281,8 +1281,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
- /* zero the stale part of the page if we did a short copy */
if (!PageUptodate(page)) {
+ /* just return that nothing was copied on a short copy */
if (copied < len) {
copied = 0;
goto out;
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 1409d6149281..058ea2a04376 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -26,12 +26,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);
-int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
-int ceph_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages);
-
static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
ci->fscache = NULL;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 39db97f149b9..6c0e52fd0743 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -703,29 +703,12 @@ void ceph_add_cap(struct inode *inode,
*/
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
- if (realm) {
- struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
- if (oldrealm) {
- spin_lock(&oldrealm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&oldrealm->inodes_with_caps_lock);
- }
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- ci->i_snap_realm = realm;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = inode;
- spin_unlock(&realm->inodes_with_caps_lock);
-
- if (oldrealm)
- ceph_put_snap_realm(mdsc, oldrealm);
- } else {
- pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
- realmino);
- WARN_ON(!realm);
- }
+ if (realm)
+ ceph_change_snap_realm(inode, realm);
+ else
+ WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
+ __func__, realmino, ci->i_vino.ino,
+ ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
}
__check_cap_issue(ci, cap, issued);
@@ -1112,20 +1095,6 @@ int ceph_is_any_caps(struct inode *inode)
return ret;
}
-static void drop_inode_snap_realm(struct ceph_inode_info *ci)
-{
- struct ceph_snap_realm *realm = ci->i_snap_realm;
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm_counter++;
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
- realm);
-}
-
/*
* Remove a cap. Take steps to deal with a racing iterate_session_caps.
*
@@ -1145,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
return;
}
+ lockdep_assert_held(&ci->i_ceph_lock);
+
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap) {
- WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
- !mdsc->fsc->blocklisted);
+ if (ci->i_auth_cap == cap)
ci->i_auth_cap = NULL;
- }
/* remove from session list */
spin_lock(&session->s_cap_lock);
@@ -1201,12 +1169,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- drop_inode_snap_realm(ci);
+ ceph_change_snap_realm(&ci->vfs_inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
}
+void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_fs_client *fsc;
+
+ /* 'ci' being NULL means the remove have already occurred */
+ if (!ci) {
+ dout("%s: cap inode is NULL\n", __func__);
+ return;
+ }
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ WARN_ON_ONCE(ci->i_auth_cap == cap &&
+ !list_empty(&ci->i_dirty_item) &&
+ !fsc->blocklisted &&
+ READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
+
+ __ceph_remove_cap(cap, queue_release);
+}
+
struct cap_msg_args {
struct ceph_mds_session *session;
u64 ino, cid, follows;
@@ -1335,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
- __ceph_remove_cap(cap, true);
+ ceph_remove_cap(cap, true);
}
spin_unlock(&ci->i_ceph_lock);
}
@@ -1746,6 +1736,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
struct ceph_cap_flush *cf;
cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ if (!cf)
+ return NULL;
+
cf->is_capsnap = false;
return cf;
}
@@ -1856,6 +1849,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
* try to invalidate mapping pages without blocking.
*/
static int try_nonblocking_invalidate(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
@@ -2219,6 +2214,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
*/
static int unsafe_request_wait(struct inode *inode)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2238,6 +2234,81 @@ static int unsafe_request_wait(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
+ /*
+ * Trigger to flush the journal logs in all the relevant MDSes
+ * manually, or in the worst case we must wait at most 5 seconds
+ * to wait the journal logs to be flushed by the MDSes periodically.
+ */
+ if (req1 || req2) {
+ struct ceph_mds_session **sessions = NULL;
+ struct ceph_mds_session *s;
+ struct ceph_mds_request *req;
+ unsigned int max;
+ int i;
+
+ /*
+ * The mdsc->max_sessions is unlikely to be changed
+ * mostly, here we will retry it by reallocating the
+ * sessions arrary memory to get rid of the mdsc->mutex
+ * lock.
+ */
+retry:
+ max = mdsc->max_sessions;
+ sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
+ if (!sessions)
+ return -ENOMEM;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (req1) {
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
+ r_unsafe_dir_item) {
+ s = req->r_session;
+ if (unlikely(s->s_mds > max)) {
+ spin_unlock(&ci->i_unsafe_lock);
+ goto retry;
+ }
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ if (req2) {
+ list_for_each_entry(req, &ci->i_unsafe_iops,
+ r_unsafe_target_item) {
+ s = req->r_session;
+ if (unlikely(s->s_mds > max)) {
+ spin_unlock(&ci->i_unsafe_lock);
+ goto retry;
+ }
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ spin_unlock(&ci->i_unsafe_lock);
+
+ /* the auth MDS */
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap) {
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* send flush mdlog request to MDSes */
+ for (i = 0; i < max; i++) {
+ s = sessions[i];
+ if (s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(s);
+ }
+ }
+ kfree(sessions);
+ }
+
dout("unsafe_request_wait %p wait on tid %llu %llu\n",
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
@@ -3008,7 +3079,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
}
/* see comment in __ceph_remove_cap() */
if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
- drop_inode_snap_realm(ci);
+ ceph_change_snap_realm(inode, NULL);
}
}
if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
@@ -3114,7 +3185,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
break;
}
}
- BUG_ON(!found);
+
+ if (!found) {
+ /*
+ * The capsnap should already be removed when removing
+ * auth cap in the case of a forced unmount.
+ */
+ WARN_ON_ONCE(ci->i_auth_cap);
+ goto unlock;
+ }
+
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
complete_capsnap = true;
@@ -3136,6 +3216,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
complete_capsnap ? " (complete capsnap)" : "");
}
+unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
@@ -3606,6 +3687,43 @@ out:
iput(inode);
}
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ bool ret;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+
+ list_del_init(&capsnap->ci_item);
+ ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
+ if (wake_ci)
+ *wake_ci = ret;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
+ if (wake_mdsc)
+ *wake_mdsc = ret;
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
+ __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
+}
+
/*
* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
* throw away our cap_snap.
@@ -3643,23 +3761,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
capsnap, capsnap->follows);
}
}
- if (flushed) {
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- list_del(&capsnap->ci_item);
- wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
-
- spin_lock(&mdsc->cap_dirty_lock);
-
- if (list_empty(&ci->i_cap_flush_list))
- list_del_init(&ci->i_flushing_item);
-
- wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
- &capsnap->cap_flush);
- spin_unlock(&mdsc->cap_dirty_lock);
- }
+ if (flushed)
+ ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
+
if (flushed) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
@@ -3743,7 +3848,7 @@ retry:
goto out_unlock;
if (target < 0) {
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3778,7 +3883,7 @@ retry:
change_auth_cap_ses(ci, tcap->session);
}
}
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
} else if (tsession) {
/* add placeholder for the export tagert */
@@ -3795,7 +3900,7 @@ retry:
spin_unlock(&mdsc->cap_dirty_lock);
}
- __ceph_remove_cap(cap, false);
+ ceph_remove_cap(cap, false);
goto out_unlock;
}
@@ -3906,7 +4011,7 @@ retry:
ocap->mseq, mds, le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
*old_issued = issued;
@@ -4134,8 +4239,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
done:
mutex_unlock(&session->s_mutex);
done_unlocked:
- ceph_put_string(extra_info.pool_ns);
iput(inode);
+out:
+ ceph_put_string(extra_info.pool_ns);
return;
flush_cap_releases:
@@ -4150,7 +4256,7 @@ flush_cap_releases:
bad:
pr_err("ceph_handle_caps: corrupt message\n");
ceph_msg_dump(msg);
- return;
+ goto out;
}
/*
@@ -4225,33 +4331,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
dout("flush_dirty_caps done\n");
}
-static void iterate_sessions(struct ceph_mds_client *mdsc,
- void (*cb)(struct ceph_mds_session *))
-{
- int mds;
-
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ++mds) {
- struct ceph_mds_session *s;
-
- if (!mdsc->sessions[mds])
- continue;
-
- s = ceph_get_mds_session(mdsc->sessions[mds]);
- if (!s)
- continue;
-
- mutex_unlock(&mdsc->mutex);
- cb(s);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- iterate_sessions(mdsc, flush_dirty_session_caps);
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
void __ceph_touch_fmode(struct ceph_inode_info *ci,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e1d605a02d4a..d16fd2d5fd42 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1722,32 +1722,26 @@ retry_snap:
goto out;
}
- err = file_remove_privs(file);
- if (err)
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
+ err = -ENOSPC;
goto out;
+ }
- err = file_update_time(file);
+ err = file_remove_privs(file);
if (err)
goto out;
- inode_inc_iversion_raw(inode);
-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
err = ceph_uninline_data(file, NULL);
if (err < 0)
goto out;
}
- down_read(&osdc->lock);
- map_flags = osdc->osdmap->flags;
- pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
- up_read(&osdc->lock);
- if ((map_flags & CEPH_OSDMAP_FULL) ||
- (pool_flags & CEPH_POOL_FLAG_FULL)) {
- err = -ENOSPC;
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -1759,6 +1753,12 @@ retry_snap:
if (err < 0)
goto out;
+ err = file_update_time(file);
+ if (err)
+ goto out_caps;
+
+ inode_inc_iversion_raw(inode);
+
dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
@@ -1842,6 +1842,8 @@ retry_snap:
}
goto out_unlocked;
+out_caps:
+ ceph_put_cap_refs(ci, got);
out:
if (direct_lock)
ceph_end_io_direct(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1bd2cc015913..2df1e1284451 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -581,16 +581,9 @@ void ceph_evict_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
- realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm = NULL;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ ci->i_snap_realm);
+ ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
ci->i_snap_realm = NULL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0b69aec23e5c..7cad180d6deb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/bits.h>
#include <linux/ktime.h>
+#include <linux/bitmap.h>
#include "super.h"
#include "mds_client.h"
@@ -652,14 +653,9 @@ const char *ceph_session_state_name(int s)
struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
{
- if (refcount_inc_not_zero(&s->s_ref)) {
- dout("mdsc get_session %p %d -> %d\n", s,
- refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
+ if (refcount_inc_not_zero(&s->s_ref))
return s;
- } else {
- dout("mdsc get_session %p 0 -- FAIL\n", s);
- return NULL;
- }
+ return NULL;
}
void ceph_put_mds_session(struct ceph_mds_session *s)
@@ -667,8 +663,6 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
if (IS_ERR_OR_NULL(s))
return;
- dout("mdsc put_session %p %d -> %d\n", s,
- refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
@@ -743,8 +737,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
- s->s_ttl = 0;
- s->s_seq = 0;
mutex_init(&s->s_mutex);
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
@@ -753,17 +745,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
- s->s_renew_requested = 0;
- s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
- s->s_nr_caps = 0;
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
xa_init(&s->s_delegated_inos);
- s->s_num_cap_releases = 0;
- s->s_cap_reconnect = 0;
- s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
@@ -811,6 +797,33 @@ static void put_request_session(struct ceph_mds_request *req)
}
}
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state)
+{
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+ struct ceph_mds_session *s;
+
+ s = __ceph_lookup_mds_session(mdsc, mds);
+ if (!s)
+ continue;
+
+ if (check_state && !check_session_state(s)) {
+ ceph_put_mds_session(s);
+ continue;
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ cb(s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
void ceph_mdsc_release_request(struct kref *kref)
{
struct ceph_mds_request *req = container_of(kref,
@@ -1155,7 +1168,7 @@ random:
/*
* session messages
*/
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1163,7 +1176,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session %s msg\n",
+ ceph_session_op_name(op));
return NULL;
}
h = msg->front.iov_base;
@@ -1294,7 +1308,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session open msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1583,14 +1597,39 @@ out:
return ret;
}
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_snap *capsnap;
+ int capsnap_release = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+
+ while (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ capsnap_release++;
+ }
+ wake_up_all(&ci->i_cap_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
+ return capsnap_release;
+}
+
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
LIST_HEAD(to_remove);
bool dirty_dropped = false;
bool invalidate = false;
+ int capsnap_release = 0;
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
@@ -1598,7 +1637,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
- struct ceph_mds_client *mdsc = fsc->mdsc;
if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
if (inode->i_data.nrpages > 0)
@@ -1662,6 +1700,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
+
+ if (!list_empty(&ci->i_cap_snaps))
+ capsnap_release = remove_capsnaps(mdsc, inode);
}
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
@@ -1678,6 +1719,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_queue_invalidate(inode);
if (dirty_dropped)
iput(inode);
+ while (capsnap_release--)
+ iput(inode);
return 0;
}
@@ -1803,8 +1846,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1818,7 +1861,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state), seq);
- msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1870,7 +1913,8 @@ static int request_close_session(struct ceph_mds_session *session)
dout("request_close_session mds%d state %s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state),
session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+ session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1965,7 +2009,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if (oissued) {
/* we aren't the only cap.. just remove us */
- __ceph_remove_cap(cap, true);
+ ceph_remove_cap(cap, true);
(*remaining)--;
} else {
struct dentry *dentry;
@@ -4150,13 +4194,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
struct ceph_mdsmap *newmap,
struct ceph_mdsmap *oldmap)
{
- int i;
+ int i, j, err;
int oldstate, newstate;
struct ceph_mds_session *s;
+ unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
+ if (newmap->m_info) {
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
+ set_bit(newmap->m_info[i].export_targets[j], targets);
+ }
+ }
+
for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
if (!mdsc->sessions[i])
continue;
@@ -4210,6 +4262,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
newstate >= CEPH_MDS_STATE_RECONNECT) {
mutex_unlock(&mdsc->mutex);
+ clear_bit(i, targets);
send_mds_reconnect(mdsc, s);
mutex_lock(&mdsc->mutex);
}
@@ -4232,6 +4285,51 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
+ /*
+ * Only open and reconnect sessions that don't exist yet.
+ */
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ /*
+ * In case the import MDS is crashed just after
+ * the EImportStart journal is flushed, so when
+ * a standby MDS takes over it and is replaying
+ * the EImportStart journal the new MDS daemon
+ * will wait the client to reconnect it, but the
+ * client may never register/open the session yet.
+ *
+ * Will try to reconnect that MDS daemon if the
+ * rank number is in the export targets array and
+ * is the up:reconnect state.
+ */
+ newstate = ceph_mdsmap_get_state(newmap, i);
+ if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
+ continue;
+
+ /*
+ * The session maybe registered and opened by some
+ * requests which were choosing random MDSes during
+ * the mdsc->mutex's unlock/lock gap below in rare
+ * case. But the related MDS daemon will just queue
+ * that requests and be still waiting for the client's
+ * reconnection request in up:reconnect state.
+ */
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (likely(!s)) {
+ s = __open_export_target_session(mdsc, i);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ pr_err("failed to open export target session, err %d\n",
+ err);
+ continue;
+ }
+ }
+ dout("send reconnect to export target mds.%d\n", i);
+ mutex_unlock(&mdsc->mutex);
+ send_mds_reconnect(mdsc, s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+
for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
@@ -4409,24 +4507,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * lock unlock sessions, to wait ongoing session activities
+ * lock unlock the session, to wait ongoing session activities
*/
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
{
- int i;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
}
static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4448,6 +4534,8 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
+ struct ceph_fs_client *fsc = s->s_mdsc->fsc;
+
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4456,8 +4544,9 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when we're unmounting */
- WARN_ON_ONCE(s->s_ttl);
+ /* Should never reach this when not force unmounting */
+ WARN_ON_ONCE(s->s_ttl &&
+ READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
@@ -4584,21 +4673,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_completion(&mdsc->safe_umount_waiters);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
- mdsc->sessions = NULL;
- atomic_set(&mdsc->num_sessions, 0);
- mdsc->max_sessions = 0;
- mdsc->stopping = 0;
- atomic64_set(&mdsc->quotarealms_count, 0);
mdsc->quotarealms_inodes = RB_ROOT;
mutex_init(&mdsc->quotarealms_inodes_mutex);
- mdsc->last_snap_seq = 0;
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
- mdsc->num_snap_realms = 0;
spin_lock_init(&mdsc->snap_empty_lock);
- mdsc->last_tid = 0;
- mdsc->oldest_tid = 0;
mdsc->request_tree = RB_ROOT;
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
@@ -4610,11 +4690,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
mdsc->last_cap_flush_tid = 1;
INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
- mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
- atomic_set(&mdsc->cap_reclaim_pending, 0);
err = ceph_metric_init(&mdsc->metric);
if (err)
goto err_mdsmap;
@@ -4676,6 +4754,30 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests done\n");
}
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+ struct ceph_msg *msg;
+
+ /*
+ * Pre-luminous MDS crashes when it sees an unknown session request
+ */
+ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+ return;
+
+ mutex_lock(&s->s_mutex);
+ dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
+ ceph_session_state_name(s->s_state), s->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+ s->s_seq);
+ if (!msg) {
+ pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ } else {
+ ceph_con_send(&s->s_con, msg);
+ }
+ mutex_unlock(&s->s_mutex);
+}
+
/*
* called before mount is ro, and before dentries are torn down.
* (hmm, does this still race with new lookups?)
@@ -4685,7 +4787,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
dout("pre_umount\n");
mdsc->stopping = 1;
- lock_unlock_sessions(mdsc);
+ ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
+ ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
@@ -4912,7 +5015,6 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
ceph_metric_destroy(&mdsc->metric);
- flush_delayed_work(&mdsc->metric.delayed_work);
fsc->mdsc = NULL;
kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 20e42d8b66c6..97c7f7bfa55f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -522,6 +522,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern void send_flush_mdlog(struct ceph_mds_session *s);
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state);
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 3c444b9cb17b..61d67cbcb367 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -122,6 +122,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
int err;
u8 mdsmap_v;
u16 mdsmap_ev;
+ u32 target;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (!m)
@@ -260,9 +261,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
sizeof(u32), GFP_NOFS);
if (!info->export_targets)
goto nomem;
- for (j = 0; j < num_export_targets; j++)
- info->export_targets[j] =
- ceph_decode_32(&pexport_targets);
+ for (j = 0; j < num_export_targets; j++) {
+ target = ceph_decode_32(&pexport_targets);
+ if (target >= m->possible_max_rank) {
+ err = -EIO;
+ goto corrupt;
+ }
+ info->export_targets[j] = target;
+ }
} else {
info->export_targets = NULL;
}
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 5ac151eb0d49..04d5df29bbbf 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -302,6 +302,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
if (!m)
return;
+ cancel_delayed_work_sync(&m->delayed_work);
+
percpu_counter_destroy(&m->total_inodes);
percpu_counter_destroy(&m->opened_inodes);
percpu_counter_destroy(&m->i_caps_mis);
@@ -309,8 +311,6 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
percpu_counter_destroy(&m->d_lease_mis);
percpu_counter_destroy(&m->d_lease_hit);
- cancel_delayed_work_sync(&m->delayed_work);
-
ceph_put_mds_session(m->session);
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 15105f9da3fd..b41e6724c591 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -849,6 +849,43 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
dout("flush_snaps done\n");
}
+/**
+ * ceph_change_snap_realm - change the snap_realm for an inode
+ * @inode: inode to move to new snap realm
+ * @realm: new realm to move inode into (may be NULL)
+ *
+ * Detach an inode from its old snaprealm (if any) and attach it to
+ * the new snaprealm (if any). The old snap realm reference held by
+ * the inode is put. If realm is non-NULL, then the caller's reference
+ * to it is taken over by the inode.
+ */
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (oldrealm) {
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ if (oldrealm->ino == ci->i_vino.ino)
+ oldrealm->inode = NULL;
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, oldrealm);
+ }
+
+ ci->i_snap_realm = realm;
+
+ if (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps);
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ }
+}
/*
* Handle a snap notification from the MDS.
@@ -935,7 +972,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
};
struct inode *inode = ceph_find_inode(sb, vino);
struct ceph_inode_info *ci;
- struct ceph_snap_realm *oldrealm;
if (!inode)
continue;
@@ -960,27 +996,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
}
dout(" will move %p to split realm %llx %p\n",
inode, realm->ino, realm);
- /*
- * Move the inode to the new realm
- */
- oldrealm = ci->i_snap_realm;
- spin_lock(&oldrealm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&oldrealm->inodes_with_caps_lock);
-
- spin_lock(&realm->inodes_with_caps_lock);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- ci->i_snap_realm = realm;
- if (realm->ino == ci->i_vino.ino)
- realm->inode = inode;
- spin_unlock(&realm->inodes_with_caps_lock);
-
- spin_unlock(&ci->i_ceph_lock);
ceph_get_snap_realm(mdsc, realm);
- ceph_put_snap_realm(mdsc, oldrealm);
-
+ ceph_change_snap_realm(inode, realm);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
continue;
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4a79f3632260..573bb9556fb5 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
case CEPH_SESSION_FORCE_RO: return "force_ro";
case CEPH_SESSION_REJECT: return "reject";
+ case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
}
return "???";
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c30258f95e37..a40eb14c282a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -418,7 +418,6 @@ struct ceph_inode_info {
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
};
- int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
struct timespec64 i_btime;
@@ -929,6 +928,7 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
void *p, void *e, bool deletion,
struct ceph_snap_realm **realm_ret);
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
@@ -1138,6 +1138,7 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
@@ -1163,6 +1164,12 @@ extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
+extern void __ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
+extern void ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1242db8d3444..159a1ffa4f4b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -340,6 +340,18 @@ static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val,
ceph_cap_string(issued), issued);
}
+static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = ceph_fmt_xattr(val, size, "%d",
+ ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1);
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -473,6 +485,13 @@ static struct ceph_vxattr ceph_common_vxattrs[] = {
.exists_cb = NULL,
.flags = VXATTR_FLAG_READONLY,
},
+ {
+ .name = "ceph.auth_mds",
+ .name_size = sizeof("ceph.auth_mds"),
+ .getxattr_cb = ceph_vxattrcb_auth_mds,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
diff --git a/fs/coredump.c b/fs/coredump.c
index 07afb5ddb1c4..3224dee44d30 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -782,10 +782,17 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* filesystem.
*/
mnt_userns = file_mnt_user_ns(cprm.file);
- if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid()))
+ if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
+ current_fsuid())) {
+ pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
+ cn.corename);
goto close_fail;
- if ((inode->i_mode & 0677) != 0600)
+ }
+ if ((inode->i_mode & 0677) != 0600) {
+ pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n",
+ cn.corename);
goto close_fail;
+ }
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
@@ -1127,8 +1134,10 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count))
+ if (WARN_ON(i != *vma_count)) {
+ kvfree(*vma_meta);
return -EFAULT;
+ }
*vma_data_size_ptr = vma_data_size;
return 0;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index a8d49e8fc83a..11b88559f8bf 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -546,7 +546,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
if (test_opt(ctx, DAX_ALWAYS) &&
- !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
+ !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
clear_opt(ctx, DAX_ALWAYS);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e596e1d0bba..06f4c5ae1451 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
*/
call_rcu(&epi->rcu, epi_rcu_free);
- atomic_long_dec(&ep->user->epoll_watches);
+ percpu_counter_dec(&ep->user->epoll_watches);
return 0;
}
@@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
{
int error, pwake = 0;
__poll_t revents;
- long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
struct eventpoll *tep = NULL;
@@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
lockdep_assert_irqs_enabled();
- user_watches = atomic_long_read(&ep->user->epoll_watches);
- if (unlikely(user_watches >= max_user_watches))
+ if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
+ max_user_watches) >= 0))
return -ENOSPC;
- if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
+ percpu_counter_inc(&ep->user->epoll_watches);
+
+ if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+ percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
+ }
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
@@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
mutex_lock_nested(&tep->mtx, 1);
/* Add the current item to the list of active epoll hook for this file */
if (unlikely(attach_epitem(tfile, epi) < 0)) {
- kmem_cache_free(epi_cache, epi);
if (tep)
mutex_unlock(&tep->mtx);
+ kmem_cache_free(epi_cache, epi);
+ percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}
if (full_check && !tep)
list_file(tfile);
- atomic_long_inc(&ep->user->epoll_watches);
-
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
@@ -1684,8 +1686,8 @@ static int ep_send_events(struct eventpoll *ep,
if (!revents)
continue;
- if (__put_user(revents, &events->events) ||
- __put_user(epi->event.data, &events->data)) {
+ events = epoll_put_uevent(revents, epi->event.data, events);
+ if (!events) {
list_add(&epi->rdllink, &txlist);
ep_pm_stay_awake(epi);
if (!res)
@@ -1693,7 +1695,6 @@ static int ep_send_events(struct eventpoll *ep,
break;
}
res++;
- events++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 987bcf32ed46..d8d580b609ba 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -946,7 +946,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (test_opt(sb, DAX)) {
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
+ bdev_nr_sectors(sb->s_bdev))) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
clear_opt(sbi->s_mount_opt, DAX);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 136940af00b8..0775950ee84e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4287,7 +4287,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (bdev_dax_supported(sb->s_bdev, blocksize))
+ if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
+ bdev_nr_sectors(sb->s_bdev)))
set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 90b8d879fbaf..58b9067b2391 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -209,21 +209,28 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
}
#endif
-int __init get_filesystem_list(char *buf)
+int __init list_bdev_fs_names(char *buf, size_t size)
{
- int len = 0;
- struct file_system_type * tmp;
+ struct file_system_type *p;
+ size_t len;
+ int count = 0;
read_lock(&file_systems_lock);
- tmp = file_systems;
- while (tmp && len < PAGE_SIZE - 80) {
- len += sprintf(buf+len, "%s\t%s\n",
- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
- tmp->name);
- tmp = tmp->next;
+ for (p = file_systems; p; p = p->next) {
+ if (!(p->fs_flags & FS_REQUIRES_DEV))
+ continue;
+ len = strlen(p->name) + 1;
+ if (len > size) {
+ pr_warn("%s: truncating file system list\n", __func__);
+ break;
+ }
+ memcpy(buf, p->name, len);
+ buf += len;
+ size -= len;
+ count++;
}
read_unlock(&file_systems_lock);
- return len;
+ return count;
}
#ifdef CONFIG_PROC_FS
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index cc7e94d73c6c..000d2e5627e9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -328,7 +328,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
drop_nlink(d_inode(fuse_control_sb->s_root));
}
-static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
+static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
{
static const struct tree_descr empty_descr = {""};
struct fuse_conn *fc;
@@ -354,18 +354,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx)
return 0;
}
-static int fuse_ctl_get_tree(struct fs_context *fc)
+static int fuse_ctl_get_tree(struct fs_context *fsc)
{
- return get_tree_single(fc, fuse_ctl_fill_super);
+ return get_tree_single(fsc, fuse_ctl_fill_super);
}
static const struct fs_context_operations fuse_ctl_context_ops = {
.get_tree = fuse_ctl_get_tree,
};
-static int fuse_ctl_init_fs_context(struct fs_context *fc)
+static int fuse_ctl_init_fs_context(struct fs_context *fsc)
{
- fc->ops = &fuse_ctl_context_ops;
+ fsc->ops = &fuse_ctl_context_ops;
return 0;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1c8f79b3dd06..dde341a6388a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -288,10 +288,10 @@ void fuse_request_end(struct fuse_req *req)
/*
* test_and_set_bit() implies smp_mb() between bit
- * changing and below intr_entry check. Pairs with
+ * changing and below FR_INTERRUPTED check. Pairs with
* smp_mb() from queue_interrupt().
*/
- if (!list_empty(&req->intr_entry)) {
+ if (test_bit(FR_INTERRUPTED, &req->flags)) {
spin_lock(&fiq->lock);
list_del_init(&req->intr_entry);
spin_unlock(&fiq->lock);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 621a662c19fb..11404f8c21c7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
+ truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
+ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+ invalidate_inode_pages2(inode->i_mapping);
}
+
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@@ -389,6 +392,7 @@ struct fuse_writepage_args {
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
+ struct fuse_sync_bucket *bucket;
};
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@ -1608,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
struct fuse_args_pages *ap = &wpa->ia.ap;
int i;
+ if (wpa->bucket)
+ fuse_sync_bucket_dec(wpa->bucket);
+
for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);
@@ -1813,8 +1820,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_writepage_free(wpa);
}
-static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
- struct fuse_inode *fi)
+static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
struct fuse_file *ff = NULL;
@@ -1829,22 +1835,20 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
return ff;
}
-static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
- struct fuse_inode *fi)
+static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
- struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+ struct fuse_file *ff = __fuse_write_file_get(fi);
WARN_ON(!ff);
return ff;
}
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff;
int err;
- ff = __fuse_write_file_get(fc, fi);
+ ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
fuse_file_put(ff, false, false);
@@ -1871,6 +1875,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
}
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+ struct fuse_writepage_args *wpa)
+{
+ if (!fc->sync_fs)
+ return;
+
+ rcu_read_lock();
+ /* Prevent resurrection of dead bucket in unlikely race with syncfs */
+ do {
+ wpa->bucket = rcu_dereference(fc->curr_bucket);
+ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+ rcu_read_unlock();
+}
+
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1894,10 +1912,11 @@ static int fuse_writepage_locked(struct page *page)
goto err_free;
error = -EIO;
- wpa->ia.ff = fuse_write_file_get(fc, fi);
+ wpa->ia.ff = fuse_write_file_get(fi);
if (!wpa->ia.ff)
goto err_nofile;
+ fuse_writepage_add_to_bucket(fc, wpa);
fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
copy_highpage(tmp_page, page);
@@ -2113,7 +2132,7 @@ static int fuse_writepages_fill(struct page *page,
if (!data->ff) {
err = -EIO;
- data->ff = fuse_write_file_get(fc, fi);
+ data->ff = fuse_write_file_get(fi);
if (!data->ff)
goto out_unlock;
}
@@ -2148,6 +2167,8 @@ static int fuse_writepages_fill(struct page *page,
__free_page(tmp_page);
goto out_unlock;
}
+ fuse_writepage_add_to_bucket(fc, wpa);
+
data->max_pages = 1;
ap = &wpa->ia.ap;
@@ -2881,7 +2902,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
- int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, -1);
if (!err)
fuse_sync_writes(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3d18556a01ad..319596df5dc6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -482,6 +482,7 @@ struct fuse_dev {
struct fuse_fs_context {
int fd;
+ struct file *file;
unsigned int rootmode;
kuid_t user_id;
kgid_t group_id;
@@ -508,6 +509,13 @@ struct fuse_fs_context {
void **fudptr;
};
+struct fuse_sync_bucket {
+ /* count is a possible scalability bottleneck */
+ atomic_t count;
+ wait_queue_head_t waitq;
+ struct rcu_head rcu;
+};
+
/**
* A Fuse connection.
*
@@ -800,6 +808,9 @@ struct fuse_conn {
/** List of filesystems using this connection */
struct list_head mounts;
+
+ /* New writepages go into this bucket */
+ struct fuse_sync_bucket __rcu *curr_bucket;
};
/*
@@ -903,6 +914,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
descs[i].length = PAGE_SIZE - descs[i].offset;
}
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+ /* Need RCU protection to prevent use after free after the decrement */
+ rcu_read_lock();
+ if (atomic_dec_and_test(&bucket->count))
+ wake_up(&bucket->waitq);
+ rcu_read_unlock();
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e07e429f32e1..36cd03114b6d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -137,12 +137,12 @@ static void fuse_evict_inode(struct inode *inode)
}
}
-static int fuse_reconfigure(struct fs_context *fc)
+static int fuse_reconfigure(struct fs_context *fsc)
{
- struct super_block *sb = fc->root->d_sb;
+ struct super_block *sb = fsc->root->d_sb;
sync_filesystem(sb);
- if (fc->sb_flags & SB_MANDLOCK)
+ if (fsc->sb_flags & SB_MANDLOCK)
return -EINVAL;
return 0;
@@ -505,6 +505,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
return err;
}
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+ struct fuse_sync_bucket *bucket;
+
+ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+ if (bucket) {
+ init_waitqueue_head(&bucket->waitq);
+ /* Initial active count */
+ atomic_set(&bucket->count, 1);
+ }
+ return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+ struct fuse_sync_bucket *bucket, *new_bucket;
+ int count;
+
+ new_bucket = fuse_sync_bucket_alloc();
+ spin_lock(&fc->lock);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ count = atomic_read(&bucket->count);
+ WARN_ON(count < 1);
+ /* No outstanding writes? */
+ if (count == 1) {
+ spin_unlock(&fc->lock);
+ kfree(new_bucket);
+ return;
+ }
+
+ /*
+ * Completion of new bucket depends on completion of this bucket, so add
+ * one more count.
+ */
+ atomic_inc(&new_bucket->count);
+ rcu_assign_pointer(fc->curr_bucket, new_bucket);
+ spin_unlock(&fc->lock);
+ /*
+ * Drop initial active count. At this point if all writes in this and
+ * ancestor buckets complete, the count will go to zero and this task
+ * will be woken up.
+ */
+ atomic_dec(&bucket->count);
+
+ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+ /* Drop temp count on descendant bucket */
+ fuse_sync_bucket_dec(new_bucket);
+ kfree_rcu(bucket, rcu);
+}
+
static int fuse_sync_fs(struct super_block *sb, int wait)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
@@ -527,6 +578,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait)
if (!fc->sync_fs)
return 0;
+ fuse_sync_fs_writes(fc);
+
memset(&inarg, 0, sizeof(inarg));
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
@@ -572,38 +625,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
{}
};
-static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
+static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
{
struct fs_parse_result result;
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
- if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
/*
* Ignore options coming from mount(MS_REMOUNT) for backward
* compatibility.
*/
- if (fc->oldapi)
+ if (fsc->oldapi)
return 0;
- return invalfc(fc, "No changes allowed in reconfigure");
+ return invalfc(fsc, "No changes allowed in reconfigure");
}
- opt = fs_parse(fc, fuse_fs_parameters, param, &result);
+ opt = fs_parse(fsc, fuse_fs_parameters, param, &result);
if (opt < 0)
return opt;
switch (opt) {
case OPT_SOURCE:
- if (fc->source)
- return invalfc(fc, "Multiple sources specified");
- fc->source = param->string;
+ if (fsc->source)
+ return invalfc(fsc, "Multiple sources specified");
+ fsc->source = param->string;
param->string = NULL;
break;
case OPT_SUBTYPE:
if (ctx->subtype)
- return invalfc(fc, "Multiple subtypes specified");
+ return invalfc(fsc, "Multiple subtypes specified");
ctx->subtype = param->string;
param->string = NULL;
return 0;
@@ -615,22 +668,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_ROOTMODE:
if (!fuse_valid_type(result.uint_32))
- return invalfc(fc, "Invalid rootmode");
+ return invalfc(fsc, "Invalid rootmode");
ctx->rootmode = result.uint_32;
ctx->rootmode_present = true;
break;
case OPT_USER_ID:
- ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+ ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
if (!uid_valid(ctx->user_id))
- return invalfc(fc, "Invalid user_id");
+ return invalfc(fsc, "Invalid user_id");
ctx->user_id_present = true;
break;
case OPT_GROUP_ID:
- ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+ ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
if (!gid_valid(ctx->group_id))
- return invalfc(fc, "Invalid group_id");
+ return invalfc(fsc, "Invalid group_id");
ctx->group_id_present = true;
break;
@@ -648,7 +701,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
case OPT_BLKSIZE:
if (!ctx->is_bdev)
- return invalfc(fc, "blksize only supported for fuseblk");
+ return invalfc(fsc, "blksize only supported for fuseblk");
ctx->blksize = result.uint_32;
break;
@@ -659,9 +712,9 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}
-static void fuse_free_fc(struct fs_context *fc)
+static void fuse_free_fsc(struct fs_context *fsc)
{
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
if (ctx) {
kfree(ctx->subtype);
@@ -762,6 +815,7 @@ void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
@@ -769,6 +823,11 @@ void fuse_conn_put(struct fuse_conn *fc)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
+ }
fc->release(fc);
}
}
@@ -1417,6 +1476,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_flags & SB_MANDLOCK)
goto err;
+ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
fuse_sb_defaults(sb);
if (ctx->is_bdev) {
@@ -1508,34 +1568,33 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common);
static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
{
struct fuse_fs_context *ctx = fsc->fs_private;
- struct file *file;
int err;
struct fuse_conn *fc;
struct fuse_mount *fm;
- err = -EINVAL;
- file = fget(ctx->fd);
- if (!file)
- goto err;
+ if (!ctx->file || !ctx->rootmode_present ||
+ !ctx->user_id_present || !ctx->group_id_present)
+ return -EINVAL;
/*
* Require mount to happen from the same user namespace which
* opened /dev/fuse to prevent potential attacks.
*/
- if ((file->f_op != &fuse_dev_operations) ||
- (file->f_cred->user_ns != sb->s_user_ns))
- goto err_fput;
- ctx->fudptr = &file->private_data;
+ err = -EINVAL;
+ if ((ctx->file->f_op != &fuse_dev_operations) ||
+ (ctx->file->f_cred->user_ns != sb->s_user_ns))
+ goto err;
+ ctx->fudptr = &ctx->file->private_data;
fc = kmalloc(sizeof(*fc), GFP_KERNEL);
err = -ENOMEM;
if (!fc)
- goto err_fput;
+ goto err;
fm = kzalloc(sizeof(*fm), GFP_KERNEL);
if (!fm) {
kfree(fc);
- goto err_fput;
+ goto err;
}
fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
@@ -1546,12 +1605,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
err = fuse_fill_super_common(sb, ctx);
if (err)
goto err_put_conn;
- /*
- * atomic_dec_and_test() in fput() provides the necessary
- * memory barrier for file->private_data to be visible on all
- * CPUs after this
- */
- fput(file);
+ /* file->private_data shall be visible on all CPUs after this */
+ smp_mb();
fuse_send_init(get_fuse_mount_super(sb));
return 0;
@@ -1559,30 +1614,68 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
fuse_conn_put(fc);
kfree(fm);
sb->s_fs_info = NULL;
- err_fput:
- fput(file);
err:
return err;
}
-static int fuse_get_tree(struct fs_context *fc)
+/*
+ * This is the path where user supplied an already initialized fuse dev. In
+ * this case never create a new super if the old one is gone.
+ */
+static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc)
{
- struct fuse_fs_context *ctx = fc->fs_private;
+ return -ENOTCONN;
+}
- if (!ctx->fd_present || !ctx->rootmode_present ||
- !ctx->user_id_present || !ctx->group_id_present)
- return -EINVAL;
+static int fuse_test_super(struct super_block *sb, struct fs_context *fsc)
+{
-#ifdef CONFIG_BLOCK
- if (ctx->is_bdev)
- return get_tree_bdev(fc, fuse_fill_super);
-#endif
+ return fsc->sget_key == get_fuse_conn_super(sb);
+}
+
+static int fuse_get_tree(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_dev *fud;
+ struct super_block *sb;
+ int err;
- return get_tree_nodev(fc, fuse_fill_super);
+ if (ctx->fd_present)
+ ctx->file = fget(ctx->fd);
+
+ if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
+ err = get_tree_bdev(fsc, fuse_fill_super);
+ goto out_fput;
+ }
+ /*
+ * While block dev mount can be initialized with a dummy device fd
+ * (found by device name), normal fuse mounts can't
+ */
+ if (!ctx->file)
+ return -EINVAL;
+
+ /*
+ * Allow creating a fuse mount with an already initialized fuse
+ * connection
+ */
+ fud = READ_ONCE(ctx->file->private_data);
+ if (ctx->file->f_op == &fuse_dev_operations && fud) {
+ fsc->sget_key = fud->fc;
+ sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
+ err = PTR_ERR_OR_ZERO(sb);
+ if (!IS_ERR(sb))
+ fsc->root = dget(sb->s_root);
+ } else {
+ err = get_tree_nodev(fsc, fuse_fill_super);
+ }
+out_fput:
+ if (ctx->file)
+ fput(ctx->file);
+ return err;
}
static const struct fs_context_operations fuse_context_ops = {
- .free = fuse_free_fc,
+ .free = fuse_free_fsc,
.parse_param = fuse_parse_param,
.reconfigure = fuse_reconfigure,
.get_tree = fuse_get_tree,
@@ -1591,7 +1684,7 @@ static const struct fs_context_operations fuse_context_ops = {
/*
* Set up the filesystem mount context.
*/
-static int fuse_init_fs_context(struct fs_context *fc)
+static int fuse_init_fs_context(struct fs_context *fsc)
{
struct fuse_fs_context *ctx;
@@ -1604,14 +1697,14 @@ static int fuse_init_fs_context(struct fs_context *fc)
ctx->legacy_opts_show = true;
#ifdef CONFIG_BLOCK
- if (fc->fs_type == &fuseblk_fs_type) {
+ if (fsc->fs_type == &fuseblk_fs_type) {
ctx->is_bdev = true;
ctx->destroy = true;
}
#endif
- fc->fs_private = ctx;
- fc->ops = &fuse_context_ops;
+ fsc->fs_private = ctx;
+ fsc->ops = &fuse_context_ops;
return 0;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 8f52cdaa8445..0ad89c6629d7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -97,14 +97,14 @@ static const struct fs_parameter_spec virtio_fs_parameters[] = {
{}
};
-static int virtio_fs_parse_param(struct fs_context *fc,
+static int virtio_fs_parse_param(struct fs_context *fsc,
struct fs_parameter *param)
{
struct fs_parse_result result;
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
- opt = fs_parse(fc, virtio_fs_parameters, param, &result);
+ opt = fs_parse(fsc, virtio_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -119,9 +119,9 @@ static int virtio_fs_parse_param(struct fs_context *fc,
return 0;
}
-static void virtio_fs_free_fc(struct fs_context *fc)
+static void virtio_fs_free_fsc(struct fs_context *fsc)
{
- struct fuse_fs_context *ctx = fc->fs_private;
+ struct fuse_fs_context *ctx = fsc->fs_private;
kfree(ctx);
}
@@ -1488,7 +1488,7 @@ out_err:
}
static const struct fs_context_operations virtio_fs_context_ops = {
- .free = virtio_fs_free_fc,
+ .free = virtio_fs_free_fsc,
.parse_param = virtio_fs_parse_param,
.get_tree = virtio_fs_get_tree,
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e15434b23ac..3130f85d2b3f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1985,8 +1985,8 @@ static int gfs2_setattr(struct user_namespace *mnt_userns,
if (error)
goto out;
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ error = may_setattr(&init_user_ns, inode, attr->ia_valid);
+ if (error)
goto error;
error = setattr_prepare(&init_user_ns, dentry, attr);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7d0c3dbb2898..d5c9d886cd9f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -381,6 +381,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
static const struct file_operations hostfs_file_fops = {
.llseek = generic_file_llseek,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
diff --git a/fs/internal.h b/fs/internal.h
index 68a2ae029a27..3cd065c8a66b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -18,7 +18,7 @@ struct user_namespace;
struct pipe_inode_info;
/*
- * block_dev.c
+ * block/bdev.c
*/
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cd9bd095fb1b..6c55362c1f99 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -23,8 +23,7 @@ enum {
IO_WORKER_F_UP = 1, /* up and active */
IO_WORKER_F_RUNNING = 2, /* account as running */
IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_FIXED = 8, /* static idle worker */
- IO_WORKER_F_BOUND = 16, /* is doing bounded work */
+ IO_WORKER_F_BOUND = 8, /* is doing bounded work */
};
enum {
@@ -32,7 +31,7 @@ enum {
};
enum {
- IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
+ IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
};
/*
@@ -55,7 +54,10 @@ struct io_worker {
struct callback_head create_work;
int create_index;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
};
#if BITS_PER_LONG == 64
@@ -71,25 +73,24 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ struct io_wq_work_list work_list;
+ unsigned long flags;
};
enum {
IO_WQ_ACCT_BOUND,
IO_WQ_ACCT_UNBOUND,
+ IO_WQ_ACCT_NR,
};
/*
* Per-node worker thread pool
*/
struct io_wqe {
- struct {
- raw_spinlock_t lock;
- struct io_wq_work_list work_list;
- unsigned flags;
- } ____cacheline_aligned_in_smp;
+ raw_spinlock_t lock;
+ struct io_wqe_acct acct[2];
int node;
- struct io_wqe_acct acct[2];
struct hlist_nulls_head free_list;
struct list_head all_list;
@@ -133,8 +134,11 @@ struct io_cb_cancel_data {
bool cancel_all;
};
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first);
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
static void io_wqe_dec_running(struct io_worker *worker);
+static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
+ struct io_wqe_acct *acct,
+ struct io_cb_cancel_data *match);
static bool io_worker_get(struct io_worker *worker)
{
@@ -195,11 +199,10 @@ static void io_worker_exit(struct io_worker *worker)
do_exit(0);
}
-static inline bool io_wqe_run_queue(struct io_wqe *wqe)
- __must_hold(wqe->lock)
+static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
- if (!wq_list_empty(&wqe->work_list) &&
- !(wqe->flags & IO_WQE_FLAG_STALLED))
+ if (!wq_list_empty(&acct->work_list) &&
+ !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
return true;
return false;
}
@@ -208,7 +211,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
* Check head of free list for an available worker. If one isn't available,
* caller must create one.
*/
-static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
+ struct io_wqe_acct *acct)
__must_hold(RCU)
{
struct hlist_nulls_node *n;
@@ -222,6 +226,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
if (!io_worker_get(worker))
continue;
+ if (io_wqe_get_acct(worker) != acct) {
+ io_worker_release(worker);
+ continue;
+ }
if (wake_up_process(worker->task)) {
io_worker_release(worker);
return true;
@@ -236,9 +244,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
* We need a worker. If we find a free one, we're good. If not, and we're
* below the max number of workers, create one.
*/
-static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
- bool ret;
+ bool do_create = false;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
@@ -247,27 +255,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (unlikely(!acct->max_workers))
pr_warn_once("io-wq is not configured for unbound workers");
- rcu_read_lock();
- ret = io_wqe_activate_free_worker(wqe);
- rcu_read_unlock();
-
- if (!ret) {
- bool do_create = false, first = false;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- if (!acct->nr_workers)
- first = true;
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index, first);
- }
+ raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ acct->nr_workers++;
+ do_create = true;
}
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ return create_io_worker(wqe->wq, wqe, acct->index);
+ }
+
+ return true;
}
static void io_wqe_inc_running(struct io_worker *worker)
@@ -283,7 +283,7 @@ static void create_worker_cb(struct callback_head *cb)
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;
- bool do_create = false, first = false;
+ bool do_create = false;
worker = container_of(cb, struct io_worker, create_work);
wqe = worker->wqe;
@@ -291,14 +291,12 @@ static void create_worker_cb(struct callback_head *cb)
acct = &wqe->acct[worker->create_index];
raw_spin_lock(&wqe->lock);
if (acct->nr_workers < acct->max_workers) {
- if (!acct->nr_workers)
- first = true;
acct->nr_workers++;
do_create = true;
}
raw_spin_unlock(&wqe->lock);
if (do_create) {
- create_io_worker(wq, wqe, worker->create_index, first);
+ create_io_worker(wq, wqe, worker->create_index);
} else {
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
@@ -307,9 +305,11 @@ static void create_worker_cb(struct callback_head *cb)
io_worker_release(worker);
}
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
- struct io_wqe_acct *acct)
+static bool io_queue_worker_create(struct io_worker *worker,
+ struct io_wqe_acct *acct,
+ task_work_func_t func)
{
+ struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
/* raced with exit, just ignore create call */
@@ -327,16 +327,17 @@ static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
test_and_set_bit_lock(0, &worker->create_state))
goto fail_release;
- init_task_work(&worker->create_work, create_worker_cb);
+ init_task_work(&worker->create_work, func);
worker->create_index = acct->index;
if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
- return;
+ return true;
clear_bit_unlock(0, &worker->create_state);
fail_release:
io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
+ return false;
}
static void io_wqe_dec_running(struct io_worker *worker)
@@ -348,10 +349,10 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
+ if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, worker, acct);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
}
@@ -363,29 +364,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
struct io_wq_work *work)
__must_hold(wqe->lock)
{
- bool worker_bound, work_bound;
-
- BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
-
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
hlist_nulls_del_init_rcu(&worker->nulls_node);
}
-
- /*
- * If worker is moving from bound to unbound (or vice versa), then
- * ensure we update the running accounting.
- */
- worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
- work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
- if (worker_bound != work_bound) {
- int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
- io_wqe_dec_running(worker);
- worker->flags ^= IO_WORKER_F_BOUND;
- wqe->acct[index].nr_workers--;
- wqe->acct[index ^ 1].nr_workers++;
- io_wqe_inc_running(worker);
- }
}
/*
@@ -413,7 +395,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
struct io_wq *wq = wqe->wq;
- spin_lock(&wq->hash->wait.lock);
+ spin_lock_irq(&wq->hash->wait.lock);
if (list_empty(&wqe->wait.entry)) {
__add_wait_queue(&wq->hash->wait, &wqe->wait);
if (!test_bit(hash, &wq->hash->map)) {
@@ -421,48 +403,26 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
list_del_init(&wqe->wait.entry);
}
}
- spin_unlock(&wq->hash->wait.lock);
-}
-
-/*
- * We can always run the work if the worker is currently the same type as
- * the work (eg both are bound, or both are unbound). If they are not the
- * same, only allow it if incrementing the worker count would be allowed.
- */
-static bool io_worker_can_run_work(struct io_worker *worker,
- struct io_wq_work *work)
-{
- struct io_wqe_acct *acct;
-
- if (!(worker->flags & IO_WORKER_F_BOUND) !=
- !(work->flags & IO_WQ_WORK_UNBOUND))
- return true;
-
- /* not the same type, check if we'd go over the limit */
- acct = io_work_get_acct(worker->wqe, work);
- return acct->nr_workers < acct->max_workers;
+ spin_unlock_irq(&wq->hash->wait.lock);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- struct io_worker *worker,
- bool *stalled)
+static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
+ struct io_worker *worker)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
unsigned int stall_hash = -1U;
+ struct io_wqe *wqe = worker->wqe;
- wq_list_for_each(node, prev, &wqe->work_list) {
+ wq_list_for_each(node, prev, &acct->work_list) {
unsigned int hash;
work = container_of(node, struct io_wq_work, list);
- if (!io_worker_can_run_work(worker, work))
- break;
-
/* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) {
- wq_list_del(&wqe->work_list, node, prev);
+ wq_list_del(&acct->work_list, node, prev);
return work;
}
@@ -473,7 +433,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
/* hashed, can run if not already running */
if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
wqe->hash_tail[hash] = NULL;
- wq_list_cut(&wqe->work_list, &tail->list, prev);
+ wq_list_cut(&acct->work_list, &tail->list, prev);
return work;
}
if (stall_hash == -1U)
@@ -483,10 +443,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
}
if (stall_hash != -1U) {
+ /*
+ * Set this before dropping the lock to avoid racing with new
+ * work being added and clearing the stalled bit.
+ */
+ set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
raw_spin_unlock(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
- *stalled = true;
}
return NULL;
@@ -520,13 +484,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
do {
struct io_wq_work *work;
- bool stalled;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -535,12 +499,9 @@ get_next:
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- stalled = false;
- work = io_get_next_work(wqe, worker, &stalled);
+ work = io_get_next_work(acct, worker);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (stalled)
- wqe->flags |= IO_WQE_FLAG_STALLED;
raw_spin_unlock(&wqe->lock);
if (!work)
@@ -572,10 +533,10 @@ get_next:
if (hash != -1U && !next_hashed) {
clear_bit(hash, &wq->hash->map);
+ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wq->hash->wait))
wake_up(&wq->hash->wait);
raw_spin_lock(&wqe->lock);
- wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
goto get_next;
@@ -590,8 +551,10 @@ get_next:
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
+ bool last_timeout = false;
char buf[TASK_COMM_LEN];
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
@@ -605,10 +568,17 @@ static int io_wqe_worker(void *data)
set_current_state(TASK_INTERRUPTIBLE);
loop:
raw_spin_lock(&wqe->lock);
- if (io_wqe_run_queue(wqe)) {
+ if (io_acct_run_queue(acct)) {
io_worker_handle_work(worker);
goto loop;
}
+ /* timed out, exit unless we're the last worker */
+ if (last_timeout && acct->nr_workers > 1) {
+ raw_spin_unlock(&wqe->lock);
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+ last_timeout = false;
__io_worker_idle(wqe, worker);
raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
@@ -619,13 +589,11 @@ loop:
if (!get_signal(&ksig))
continue;
- break;
- }
- if (ret)
+ if (fatal_signal_pending(current))
+ break;
continue;
- /* timed out, exit unless we're the fixed worker */
- if (!(worker->flags & IO_WORKER_F_FIXED))
- break;
+ }
+ last_timeout = !ret;
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
@@ -676,51 +644,131 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
raw_spin_unlock(&worker->wqe->lock);
}
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
+static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
+ struct task_struct *tsk)
+{
+ tsk->pf_io_worker = worker;
+ worker->task = tsk;
+ set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
+ tsk->flags |= PF_NO_SETAFFINITY;
+
+ raw_spin_lock(&wqe->lock);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ list_add_tail_rcu(&worker->all_list, &wqe->all_list);
+ worker->flags |= IO_WORKER_F_FREE;
+ raw_spin_unlock(&wqe->lock);
+ wake_up_new_task(tsk);
+}
+
+static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
+{
+ return true;
+}
+
+static inline bool io_should_retry_thread(long err)
+{
+ switch (err) {
+ case -EAGAIN:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void create_worker_cont(struct callback_head *cb)
{
- struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
struct task_struct *tsk;
+ struct io_wqe *wqe;
- __set_current_state(TASK_RUNNING);
+ worker = container_of(cb, struct io_worker, create_work);
+ clear_bit_unlock(0, &worker->create_state);
+ wqe = worker->wqe;
+ tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+ if (!IS_ERR(tsk)) {
+ io_init_new_worker(wqe, worker, tsk);
+ io_worker_release(worker);
+ return;
+ } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
- if (!worker)
- goto fail;
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&wqe->lock);
+ acct->nr_workers--;
+ if (!acct->nr_workers) {
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_all,
+ .cancel_all = true,
+ };
- refcount_set(&worker->ref, 1);
- worker->nulls_node.pprev = NULL;
- worker->wqe = wqe;
- spin_lock_init(&worker->lock);
- init_completion(&worker->ref_done);
+ while (io_acct_cancel_pending_work(wqe, acct, &match))
+ raw_spin_lock(&wqe->lock);
+ }
+ raw_spin_unlock(&wqe->lock);
+ io_worker_ref_put(wqe->wq);
+ kfree(worker);
+ return;
+ }
- tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
- if (IS_ERR(tsk)) {
+ /* re-create attempts grab a new worker ref, drop the existing one */
+ io_worker_release(worker);
+ schedule_work(&worker->work);
+}
+
+static void io_workqueue_create(struct work_struct *work)
+{
+ struct io_worker *worker = container_of(work, struct io_worker, work);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+
+ if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
kfree(worker);
+ }
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+ struct io_wqe_acct *acct = &wqe->acct[index];
+ struct io_worker *worker;
+ struct task_struct *tsk;
+
+ __set_current_state(TASK_RUNNING);
+
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+ if (!worker) {
fail:
atomic_dec(&acct->nr_running);
raw_spin_lock(&wqe->lock);
acct->nr_workers--;
raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wq);
- return;
+ return false;
}
- tsk->pf_io_worker = worker;
- worker->task = tsk;
- set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
- tsk->flags |= PF_NO_SETAFFINITY;
+ refcount_set(&worker->ref, 1);
+ worker->wqe = wqe;
+ spin_lock_init(&worker->lock);
+ init_completion(&worker->ref_done);
- raw_spin_lock(&wqe->lock);
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
- list_add_tail_rcu(&worker->all_list, &wqe->all_list);
- worker->flags |= IO_WORKER_F_FREE;
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
- if (first && (worker->flags & IO_WORKER_F_BOUND))
- worker->flags |= IO_WORKER_F_FIXED;
- raw_spin_unlock(&wqe->lock);
- wake_up_new_task(tsk);
+
+ tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+ if (!IS_ERR(tsk)) {
+ io_init_new_worker(wqe, worker, tsk);
+ } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ kfree(worker);
+ goto fail;
+ } else {
+ INIT_WORK(&worker->work, io_workqueue_create);
+ schedule_work(&worker->work);
+ }
+
+ return true;
}
/*
@@ -755,11 +803,6 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
return false;
}
-static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
-{
- return true;
-}
-
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
struct io_wq *wq = wqe->wq;
@@ -773,12 +816,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
unsigned int hash;
struct io_wq_work *tail;
if (!io_wq_is_hashed(work)) {
append:
- wq_list_add_tail(&work->list, &wqe->work_list);
+ wq_list_add_tail(&work->list, &acct->work_list);
return;
}
@@ -788,13 +832,19 @@ append:
if (!tail)
goto append;
- wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+ wq_list_add_after(&work->list, &tail->list, &acct->work_list);
+}
+
+static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
+{
+ return work == data;
}
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- bool do_wake;
+ unsigned work_flags = work->flags;
+ bool do_create;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -808,13 +858,36 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
- wqe->flags &= ~IO_WQE_FLAG_STALLED;
- do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running);
+ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+
+ rcu_read_lock();
+ do_create = !io_wqe_activate_free_worker(wqe, acct);
+ rcu_read_unlock();
+
raw_spin_unlock(&wqe->lock);
- if (do_wake)
- io_wqe_wake_worker(wqe, acct);
+ if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running))) {
+ bool did_create;
+
+ did_create = io_wqe_create_worker(wqe, acct);
+ if (likely(did_create))
+ return;
+
+ raw_spin_lock(&wqe->lock);
+ /* fatal condition, failed to create the first worker */
+ if (!acct->nr_workers) {
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_item,
+ .data = work,
+ .cancel_all = false,
+ };
+
+ if (io_acct_cancel_pending_work(wqe, acct, &match))
+ raw_spin_lock(&wqe->lock);
+ }
+ raw_spin_unlock(&wqe->lock);
+ }
}
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
@@ -859,6 +932,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
struct io_wq_work *work,
struct io_wq_work_node *prev)
{
+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
unsigned int hash = io_get_work_hash(work);
struct io_wq_work *prev_work = NULL;
@@ -870,18 +944,18 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
else
wqe->hash_tail[hash] = NULL;
}
- wq_list_del(&wqe->work_list, &work->list, prev);
+ wq_list_del(&acct->work_list, &work->list, prev);
}
-static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
+static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
+ struct io_wqe_acct *acct,
+ struct io_cb_cancel_data *match)
+ __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
-retry:
- raw_spin_lock(&wqe->lock);
- wq_list_for_each(node, prev, &wqe->work_list) {
+ wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
@@ -889,11 +963,27 @@ retry:
raw_spin_unlock(&wqe->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
- if (!match->cancel_all)
- return;
-
/* not safe to continue after unlock */
- goto retry;
+ return true;
+ }
+
+ return false;
+}
+
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match)
+{
+ int i;
+retry:
+ raw_spin_lock(&wqe->lock);
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+ struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
+
+ if (io_acct_cancel_pending_work(wqe, acct, match)) {
+ if (match->cancel_all)
+ goto retry;
+ return;
+ }
}
raw_spin_unlock(&wqe->lock);
}
@@ -954,18 +1044,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
+ int i;
list_del_init(&wait->entry);
rcu_read_lock();
- io_wqe_activate_free_worker(wqe);
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+ struct io_wqe_acct *acct = &wqe->acct[i];
+
+ if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
+ io_wqe_activate_free_worker(wqe, acct);
+ }
rcu_read_unlock();
return 1;
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
- int ret, node;
+ int ret, node, i;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
@@ -1000,18 +1096,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
wq->wqes[node] = wqe;
wqe->node = alloc_node;
- wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
- wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
- atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
- atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
- wqe->wait.func = io_wqe_hash_wake;
INIT_LIST_HEAD(&wqe->wait.entry);
+ wqe->wait.func = io_wqe_hash_wake;
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+ struct io_wqe_acct *acct = &wqe->acct[i];
+
+ acct->index = i;
+ atomic_set(&acct->nr_running, 0);
+ INIT_WQ_LIST(&acct->work_list);
+ }
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
- INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
INIT_LIST_HEAD(&wqe->all_list);
}
@@ -1038,7 +1136,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data)
{
struct io_worker *worker;
- if (cb->func != create_worker_cb)
+ if (cb->func != create_worker_cb && cb->func != create_worker_cont)
return false;
worker = container_of(cb, struct io_worker, create_work);
return worker->wqe->wq == data;
@@ -1059,9 +1157,14 @@ static void io_wq_exit_workers(struct io_wq *wq)
while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
struct io_worker *worker;
+ struct io_wqe_acct *acct;
worker = container_of(cb, struct io_worker, create_work);
- atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
+ acct = io_wqe_get_acct(worker);
+ atomic_dec(&acct->nr_running);
+ raw_spin_lock(&worker->wqe->lock);
+ acct->nr_workers--;
+ raw_spin_unlock(&worker->wqe->lock);
io_worker_ref_put(wq);
clear_bit_unlock(0, &worker->create_state);
io_worker_release(worker);
@@ -1193,7 +1296,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
for_each_node(node) {
struct io_wqe_acct *acct;
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
acct = &wq->wqes[node]->acct[i];
prev = max_t(int, acct->max_workers, prev);
if (new_count[i])
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f35b1285865..16fb7436043c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1021,6 +1021,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_WRITE] = {
.needs_file = 1,
+ .hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
@@ -1481,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
struct io_timeout_data *io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ if (status)
+ req_set_fail(req);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
@@ -1618,8 +1621,11 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
+ /* see waitqueue_active() comment */
+ smp_mb();
+
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (wq_has_sleeper(&ctx->cq_wait))
+ if (waitqueue_active(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
}
if (io_should_trigger_evfd(ctx))
@@ -1851,6 +1857,17 @@ static void io_req_complete_failed(struct io_kiocb *req, long res)
io_req_complete_post(req, res, 0);
}
+static void io_req_complete_fail_submit(struct io_kiocb *req)
+{
+ /*
+ * We don't submit, fail them all, for that replace hardlinks with
+ * normal links. Extra REQ_F_LINK is tolerated.
+ */
+ req->flags &= ~REQ_F_HARDLINK;
+ req->flags |= REQ_F_LINK;
+ io_req_complete_failed(req, req->result);
+}
+
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
@@ -2119,6 +2136,9 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node;
+ if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ io_submit_flush_completions(ctx);
+
spin_lock_irq(&tctx->task_lock);
node = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
@@ -2673,7 +2693,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -3410,6 +3430,12 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
return -EINVAL;
}
+static bool need_read_all(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ISREG ||
+ S_ISBLK(file_inode(req->file)->i_mode);
+}
+
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3459,12 +3485,13 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
} else if (ret <= 0 || ret == io_size || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
+ (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@@ -3598,6 +3625,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
@@ -5249,7 +5277,7 @@ static void io_poll_remove_double(struct io_kiocb *req)
}
}
-static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
__must_hold(&req->ctx->completion_lock)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5271,10 +5299,19 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
if (flags & IORING_CQE_F_MORE)
ctx->cq_extra++;
- io_commit_cqring(ctx);
return !(flags & IORING_CQE_F_MORE);
}
+static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+ __must_hold(&req->ctx->completion_lock)
+{
+ bool done;
+
+ done = __io_poll_complete(req, mask);
+ io_commit_cqring(req->ctx);
+ return done;
+}
+
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5285,7 +5322,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
} else {
bool done;
- done = io_poll_complete(req, req->result);
+ done = __io_poll_complete(req, req->result);
if (done) {
io_poll_remove_double(req);
hash_del(&req->hash_node);
@@ -5293,6 +5330,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
req->result = 0;
add_wait_queue(req->poll.head, &req->poll.wait);
}
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -6398,6 +6436,11 @@ static bool io_drain_req(struct io_kiocb *req)
int ret;
u32 seq;
+ if (req->flags & REQ_F_FAIL) {
+ io_req_complete_fail_submit(req);
+ return true;
+ }
+
/*
* If we need to drain a request in the middle of a link, drain the
* head request and the next request/link after the current link.
@@ -6914,7 +6957,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
__io_queue_sqe(req);
} else if (req->flags & REQ_F_FAIL) {
- io_req_complete_failed(req, req->result);
+ io_req_complete_fail_submit(req);
} else {
int ret = io_req_prep_async(req);
@@ -10498,26 +10541,53 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
void __user *arg)
{
- struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_task *tctx = NULL;
+ struct io_sq_data *sqd = NULL;
__u32 new_count[2];
int i, ret;
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
if (copy_from_user(new_count, arg, sizeof(new_count)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i] > INT_MAX)
return -EINVAL;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ sqd = ctx->sq_data;
+ if (sqd) {
+ /*
+ * Observe the correct sqd->lock -> ctx->uring_lock
+ * ordering. Fine to drop uring_lock here, we hold
+ * a ref to the ctx.
+ */
+ mutex_unlock(&ctx->uring_lock);
+ mutex_lock(&sqd->lock);
+ mutex_lock(&ctx->uring_lock);
+ tctx = sqd->thread->io_uring;
+ }
+ } else {
+ tctx = current->io_uring;
+ }
+
+ ret = -EINVAL;
+ if (!tctx || !tctx->io_wq)
+ goto err;
+
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
- return ret;
+ goto err;
+
+ if (sqd)
+ mutex_unlock(&sqd->lock);
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
return 0;
+err:
+ if (sqd)
+ mutex_unlock(&sqd->lock);
+ return ret;
}
static bool io_register_op_must_quiesce(int op)
@@ -10795,7 +10865,7 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
- BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
+ BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
index 2243a2c64b37..8317f7ca402b 100644
--- a/fs/ksmbd/ndr.c
+++ b/fs/ksmbd/ndr.c
@@ -28,37 +28,60 @@ static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz)
return 0;
}
-static void ndr_write_int16(struct ndr *n, __u16 value)
+static int ndr_write_int16(struct ndr *n, __u16 value)
{
- if (n->length <= n->offset + sizeof(value))
- try_to_realloc_ndr_blob(n, sizeof(value));
+ if (n->length <= n->offset + sizeof(value)) {
+ int ret;
+
+ ret = try_to_realloc_ndr_blob(n, sizeof(value));
+ if (ret)
+ return ret;
+ }
*(__le16 *)ndr_get_field(n) = cpu_to_le16(value);
n->offset += sizeof(value);
+ return 0;
}
-static void ndr_write_int32(struct ndr *n, __u32 value)
+static int ndr_write_int32(struct ndr *n, __u32 value)
{
- if (n->length <= n->offset + sizeof(value))
- try_to_realloc_ndr_blob(n, sizeof(value));
+ if (n->length <= n->offset + sizeof(value)) {
+ int ret;
+
+ ret = try_to_realloc_ndr_blob(n, sizeof(value));
+ if (ret)
+ return ret;
+ }
*(__le32 *)ndr_get_field(n) = cpu_to_le32(value);
n->offset += sizeof(value);
+ return 0;
}
-static void ndr_write_int64(struct ndr *n, __u64 value)
+static int ndr_write_int64(struct ndr *n, __u64 value)
{
- if (n->length <= n->offset + sizeof(value))
- try_to_realloc_ndr_blob(n, sizeof(value));
+ if (n->length <= n->offset + sizeof(value)) {
+ int ret;
+
+ ret = try_to_realloc_ndr_blob(n, sizeof(value));
+ if (ret)
+ return ret;
+ }
*(__le64 *)ndr_get_field(n) = cpu_to_le64(value);
n->offset += sizeof(value);
+ return 0;
}
static int ndr_write_bytes(struct ndr *n, void *value, size_t sz)
{
- if (n->length <= n->offset + sz)
- try_to_realloc_ndr_blob(n, sz);
+ if (n->length <= n->offset + sz) {
+ int ret;
+
+ ret = try_to_realloc_ndr_blob(n, sz);
+ if (ret)
+ return ret;
+ }
memcpy(ndr_get_field(n), value, sz);
n->offset += sz;
@@ -70,8 +93,13 @@ static int ndr_write_string(struct ndr *n, char *value)
size_t sz;
sz = strlen(value) + 1;
- if (n->length <= n->offset + sz)
- try_to_realloc_ndr_blob(n, sz);
+ if (n->length <= n->offset + sz) {
+ int ret;
+
+ ret = try_to_realloc_ndr_blob(n, sz);
+ if (ret)
+ return ret;
+ }
memcpy(ndr_get_field(n), value, sz);
n->offset += sz;
@@ -81,9 +109,14 @@ static int ndr_write_string(struct ndr *n, char *value)
static int ndr_read_string(struct ndr *n, void *value, size_t sz)
{
- int len = strnlen(ndr_get_field(n), sz);
+ int len;
- memcpy(value, ndr_get_field(n), len);
+ if (n->offset + sz > n->length)
+ return -EINVAL;
+
+ len = strnlen(ndr_get_field(n), sz);
+ if (value)
+ memcpy(value, ndr_get_field(n), len);
len++;
n->offset += len;
n->offset = ALIGN(n->offset, 2);
@@ -92,41 +125,52 @@ static int ndr_read_string(struct ndr *n, void *value, size_t sz)
static int ndr_read_bytes(struct ndr *n, void *value, size_t sz)
{
- memcpy(value, ndr_get_field(n), sz);
+ if (n->offset + sz > n->length)
+ return -EINVAL;
+
+ if (value)
+ memcpy(value, ndr_get_field(n), sz);
n->offset += sz;
return 0;
}
-static __u16 ndr_read_int16(struct ndr *n)
+static int ndr_read_int16(struct ndr *n, __u16 *value)
{
- __u16 ret;
+ if (n->offset + sizeof(__u16) > n->length)
+ return -EINVAL;
- ret = le16_to_cpu(*(__le16 *)ndr_get_field(n));
+ if (value)
+ *value = le16_to_cpu(*(__le16 *)ndr_get_field(n));
n->offset += sizeof(__u16);
- return ret;
+ return 0;
}
-static __u32 ndr_read_int32(struct ndr *n)
+static int ndr_read_int32(struct ndr *n, __u32 *value)
{
- __u32 ret;
+ if (n->offset + sizeof(__u32) > n->length)
+ return 0;
- ret = le32_to_cpu(*(__le32 *)ndr_get_field(n));
+ if (value)
+ *value = le32_to_cpu(*(__le32 *)ndr_get_field(n));
n->offset += sizeof(__u32);
- return ret;
+ return 0;
}
-static __u64 ndr_read_int64(struct ndr *n)
+static int ndr_read_int64(struct ndr *n, __u64 *value)
{
- __u64 ret;
+ if (n->offset + sizeof(__u64) > n->length)
+ return -EINVAL;
- ret = le64_to_cpu(*(__le64 *)ndr_get_field(n));
+ if (value)
+ *value = le64_to_cpu(*(__le64 *)ndr_get_field(n));
n->offset += sizeof(__u64);
- return ret;
+ return 0;
}
int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da)
{
char hex_attr[12] = {0};
+ int ret;
n->offset = 0;
n->length = 1024;
@@ -136,97 +180,161 @@ int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da)
if (da->version == 3) {
snprintf(hex_attr, 10, "0x%x", da->attr);
- ndr_write_string(n, hex_attr);
+ ret = ndr_write_string(n, hex_attr);
} else {
- ndr_write_string(n, "");
+ ret = ndr_write_string(n, "");
}
- ndr_write_int16(n, da->version);
- ndr_write_int32(n, da->version);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int16(n, da->version);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, da->version);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, da->flags);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, da->attr);
+ if (ret)
+ return ret;
- ndr_write_int32(n, da->flags);
- ndr_write_int32(n, da->attr);
if (da->version == 3) {
- ndr_write_int32(n, da->ea_size);
- ndr_write_int64(n, da->size);
- ndr_write_int64(n, da->alloc_size);
+ ret = ndr_write_int32(n, da->ea_size);
+ if (ret)
+ return ret;
+ ret = ndr_write_int64(n, da->size);
+ if (ret)
+ return ret;
+ ret = ndr_write_int64(n, da->alloc_size);
} else {
- ndr_write_int64(n, da->itime);
+ ret = ndr_write_int64(n, da->itime);
}
- ndr_write_int64(n, da->create_time);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int64(n, da->create_time);
+ if (ret)
+ return ret;
+
if (da->version == 3)
- ndr_write_int64(n, da->change_time);
- return 0;
+ ret = ndr_write_int64(n, da->change_time);
+ return ret;
}
int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da)
{
- char *hex_attr;
- int version2;
-
- hex_attr = kzalloc(n->length, GFP_KERNEL);
- if (!hex_attr)
- return -ENOMEM;
+ char hex_attr[12];
+ unsigned int version2;
+ int ret;
n->offset = 0;
- ndr_read_string(n, hex_attr, n->length);
- kfree(hex_attr);
- da->version = ndr_read_int16(n);
+ ret = ndr_read_string(n, hex_attr, sizeof(hex_attr));
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int16(n, &da->version);
+ if (ret)
+ return ret;
if (da->version != 3 && da->version != 4) {
pr_err("v%d version is not supported\n", da->version);
return -EINVAL;
}
- version2 = ndr_read_int32(n);
+ ret = ndr_read_int32(n, &version2);
+ if (ret)
+ return ret;
+
if (da->version != version2) {
pr_err("ndr version mismatched(version: %d, version2: %d)\n",
da->version, version2);
return -EINVAL;
}
- ndr_read_int32(n);
- da->attr = ndr_read_int32(n);
+ ret = ndr_read_int32(n, NULL);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int32(n, &da->attr);
+ if (ret)
+ return ret;
+
if (da->version == 4) {
- da->itime = ndr_read_int64(n);
- da->create_time = ndr_read_int64(n);
+ ret = ndr_read_int64(n, &da->itime);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int64(n, &da->create_time);
} else {
- ndr_read_int32(n);
- ndr_read_int64(n);
- ndr_read_int64(n);
- da->create_time = ndr_read_int64(n);
- ndr_read_int64(n);
+ ret = ndr_read_int32(n, NULL);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int64(n, NULL);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int64(n, NULL);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int64(n, &da->create_time);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int64(n, NULL);
}
- return 0;
+ return ret;
}
static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl)
{
- int i;
+ int i, ret;
+
+ ret = ndr_write_int32(n, acl->count);
+ if (ret)
+ return ret;
- ndr_write_int32(n, acl->count);
n->offset = ALIGN(n->offset, 8);
- ndr_write_int32(n, acl->count);
- ndr_write_int32(n, 0);
+ ret = ndr_write_int32(n, acl->count);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, 0);
+ if (ret)
+ return ret;
for (i = 0; i < acl->count; i++) {
n->offset = ALIGN(n->offset, 8);
- ndr_write_int16(n, acl->entries[i].type);
- ndr_write_int16(n, acl->entries[i].type);
+ ret = ndr_write_int16(n, acl->entries[i].type);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int16(n, acl->entries[i].type);
+ if (ret)
+ return ret;
if (acl->entries[i].type == SMB_ACL_USER) {
n->offset = ALIGN(n->offset, 8);
- ndr_write_int64(n, acl->entries[i].uid);
+ ret = ndr_write_int64(n, acl->entries[i].uid);
} else if (acl->entries[i].type == SMB_ACL_GROUP) {
n->offset = ALIGN(n->offset, 8);
- ndr_write_int64(n, acl->entries[i].gid);
+ ret = ndr_write_int64(n, acl->entries[i].gid);
}
+ if (ret)
+ return ret;
/* push permission */
- ndr_write_int32(n, acl->entries[i].perm);
+ ret = ndr_write_int32(n, acl->entries[i].perm);
}
- return 0;
+ return ret;
}
int ndr_encode_posix_acl(struct ndr *n,
@@ -235,7 +343,8 @@ int ndr_encode_posix_acl(struct ndr *n,
struct xattr_smb_acl *acl,
struct xattr_smb_acl *def_acl)
{
- int ref_id = 0x00020000;
+ unsigned int ref_id = 0x00020000;
+ int ret;
n->offset = 0;
n->length = 1024;
@@ -245,35 +354,46 @@ int ndr_encode_posix_acl(struct ndr *n,
if (acl) {
/* ACL ACCESS */
- ndr_write_int32(n, ref_id);
+ ret = ndr_write_int32(n, ref_id);
ref_id += 4;
} else {
- ndr_write_int32(n, 0);
+ ret = ndr_write_int32(n, 0);
}
+ if (ret)
+ return ret;
if (def_acl) {
/* DEFAULT ACL ACCESS */
- ndr_write_int32(n, ref_id);
+ ret = ndr_write_int32(n, ref_id);
ref_id += 4;
} else {
- ndr_write_int32(n, 0);
+ ret = ndr_write_int32(n, 0);
}
-
- ndr_write_int64(n, from_kuid(user_ns, inode->i_uid));
- ndr_write_int64(n, from_kgid(user_ns, inode->i_gid));
- ndr_write_int32(n, inode->i_mode);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode)));
+ if (ret)
+ return ret;
+ ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode)));
+ if (ret)
+ return ret;
+ ret = ndr_write_int32(n, inode->i_mode);
+ if (ret)
+ return ret;
if (acl) {
- ndr_encode_posix_acl_entry(n, acl);
- if (def_acl)
- ndr_encode_posix_acl_entry(n, def_acl);
+ ret = ndr_encode_posix_acl_entry(n, acl);
+ if (def_acl && !ret)
+ ret = ndr_encode_posix_acl_entry(n, def_acl);
}
- return 0;
+ return ret;
}
int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
{
- int ref_id = 0x00020004;
+ unsigned int ref_id = 0x00020004;
+ int ret;
n->offset = 0;
n->length = 2048;
@@ -281,36 +401,65 @@ int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
if (!n->data)
return -ENOMEM;
- ndr_write_int16(n, acl->version);
- ndr_write_int32(n, acl->version);
- ndr_write_int16(n, 2);
- ndr_write_int32(n, ref_id);
+ ret = ndr_write_int16(n, acl->version);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, acl->version);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int16(n, 2);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int32(n, ref_id);
+ if (ret)
+ return ret;
/* push hash type and hash 64bytes */
- ndr_write_int16(n, acl->hash_type);
- ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
- ndr_write_bytes(n, acl->desc, acl->desc_len);
- ndr_write_int64(n, acl->current_time);
- ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+ ret = ndr_write_int16(n, acl->hash_type);
+ if (ret)
+ return ret;
- /* push ndr for security descriptor */
- ndr_write_bytes(n, acl->sd_buf, acl->sd_size);
+ ret = ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
+ if (ret)
+ return ret;
- return 0;
+ ret = ndr_write_bytes(n, acl->desc, acl->desc_len);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_int64(n, acl->current_time);
+ if (ret)
+ return ret;
+
+ ret = ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+ if (ret)
+ return ret;
+
+ /* push ndr for security descriptor */
+ ret = ndr_write_bytes(n, acl->sd_buf, acl->sd_size);
+ return ret;
}
int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
{
- int version2;
+ unsigned int version2;
+ int ret;
n->offset = 0;
- acl->version = ndr_read_int16(n);
+ ret = ndr_read_int16(n, &acl->version);
+ if (ret)
+ return ret;
if (acl->version != 4) {
pr_err("v%d version is not supported\n", acl->version);
return -EINVAL;
}
- version2 = ndr_read_int32(n);
+ ret = ndr_read_int32(n, &version2);
+ if (ret)
+ return ret;
if (acl->version != version2) {
pr_err("ndr version mismatched(version: %d, version2: %d)\n",
acl->version, version2);
@@ -318,11 +467,22 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
}
/* Read Level */
- ndr_read_int16(n);
+ ret = ndr_read_int16(n, NULL);
+ if (ret)
+ return ret;
+
/* Read Ref Id */
- ndr_read_int32(n);
- acl->hash_type = ndr_read_int16(n);
- ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
+ ret = ndr_read_int32(n, NULL);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_int16(n, &acl->hash_type);
+ if (ret)
+ return ret;
+
+ ret = ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE);
+ if (ret)
+ return ret;
ndr_read_bytes(n, acl->desc, 10);
if (strncmp(acl->desc, "posix_acl", 9)) {
@@ -331,15 +491,20 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl)
}
/* Read Time */
- ndr_read_int64(n);
+ ret = ndr_read_int64(n, NULL);
+ if (ret)
+ return ret;
+
/* Read Posix ACL hash */
- ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+ ret = ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE);
+ if (ret)
+ return ret;
+
acl->sd_size = n->length - n->offset;
acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL);
if (!acl->sd_buf)
return -ENOMEM;
- ndr_read_bytes(n, acl->sd_buf, acl->sd_size);
-
- return 0;
+ ret = ndr_read_bytes(n, acl->sd_buf, acl->sd_size);
+ return ret;
}
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 6ace6c2f22dc..16b6236d1bd2 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1614,9 +1614,11 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->nlink = cpu_to_le32(inode->i_nlink);
buf->reparse_tag = cpu_to_le32(fp->volatile_id);
buf->mode = cpu_to_le32(inode->i_mode);
- id_to_sid(from_kuid(user_ns, inode->i_uid),
+ id_to_sid(from_kuid_munged(&init_user_ns,
+ i_uid_into_mnt(user_ns, inode)),
SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
- id_to_sid(from_kgid(user_ns, inode->i_gid),
+ id_to_sid(from_kgid_munged(&init_user_ns,
+ i_gid_into_mnt(user_ns, inode)),
SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
}
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index d329ea49fa14..c86164dc70bb 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -2381,10 +2381,12 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work,
le32_to_cpu(sd_buf->ccontext.DataLength), true);
}
-static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct inode *inode)
+static void ksmbd_acls_fattr(struct smb_fattr *fattr,
+ struct user_namespace *mnt_userns,
+ struct inode *inode)
{
- fattr->cf_uid = inode->i_uid;
- fattr->cf_gid = inode->i_gid;
+ fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode);
+ fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode);
fattr->cf_mode = inode->i_mode;
fattr->cf_acls = NULL;
fattr->cf_dacls = NULL;
@@ -2893,7 +2895,7 @@ int smb2_open(struct ksmbd_work *work)
struct smb_ntsd *pntsd;
int pntsd_size, ace_num = 0;
- ksmbd_acls_fattr(&fattr, inode);
+ ksmbd_acls_fattr(&fattr, user_ns, inode);
if (fattr.cf_acls)
ace_num = fattr.cf_acls->a_count;
if (fattr.cf_dacls)
@@ -3324,7 +3326,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level)
*/
static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
struct ksmbd_dir_info *d_info,
- struct user_namespace *user_ns,
struct ksmbd_kstat *ksmbd_kstat)
{
int next_entry_offset = 0;
@@ -3478,9 +3479,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
S_ISDIR(ksmbd_kstat->kstat->mode) ? ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
posix_info->DosAttributes |= ATTR_HIDDEN_LE;
- id_to_sid(from_kuid(user_ns, ksmbd_kstat->kstat->uid),
+ id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
- id_to_sid(from_kgid(user_ns, ksmbd_kstat->kstat->gid),
+ id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
memcpy(posix_info->name, conv_name, conv_len);
posix_info->name_len = cpu_to_le32(conv_len);
@@ -3543,9 +3544,9 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
return -EINVAL;
lock_dir(priv->dir_fp);
- dent = lookup_one_len(priv->d_info->name,
- priv->dir_fp->filp->f_path.dentry,
- priv->d_info->name_len);
+ dent = lookup_one(user_ns, priv->d_info->name,
+ priv->dir_fp->filp->f_path.dentry,
+ priv->d_info->name_len);
unlock_dir(priv->dir_fp);
if (IS_ERR(dent)) {
@@ -3571,7 +3572,6 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
rc = smb2_populate_readdir_entry(priv->work->conn,
priv->info_level,
priv->d_info,
- user_ns,
&ksmbd_kstat);
dput(dent);
if (rc)
@@ -5008,7 +5008,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
user_ns = file_mnt_user_ns(fp->filp);
inode = file_inode(fp->filp);
- ksmbd_acls_fattr(&fattr, inode);
+ ksmbd_acls_fattr(&fattr, user_ns, inode);
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR))
@@ -5246,7 +5246,9 @@ int smb2_echo(struct ksmbd_work *work)
return 0;
}
-static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
+static int smb2_rename(struct ksmbd_work *work,
+ struct ksmbd_file *fp,
+ struct user_namespace *user_ns,
struct smb2_file_rename_info *file_info,
struct nls_table *local_nls)
{
@@ -5310,7 +5312,7 @@ static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
if (rc)
goto out;
- rc = ksmbd_vfs_setxattr(file_mnt_user_ns(fp->filp),
+ rc = ksmbd_vfs_setxattr(user_ns,
fp->filp->f_path.dentry,
xattr_stream_name,
NULL, 0, 0);
@@ -5438,11 +5440,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
{
struct smb2_file_all_info *file_info;
struct iattr attrs;
- struct iattr temp_attrs;
+ struct timespec64 ctime;
struct file *filp;
struct inode *inode;
struct user_namespace *user_ns;
- int rc;
+ int rc = 0;
if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE))
return -EACCES;
@@ -5462,11 +5464,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
}
if (file_info->ChangeTime) {
- temp_attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
- attrs.ia_ctime = temp_attrs.ia_ctime;
+ attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
+ ctime = attrs.ia_ctime;
attrs.ia_valid |= ATTR_CTIME;
} else {
- temp_attrs.ia_ctime = inode->i_ctime;
+ ctime = inode->i_ctime;
}
if (file_info->LastWriteTime) {
@@ -5505,13 +5507,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
rc = 0;
}
- /*
- * HACK : set ctime here to avoid ctime changed
- * when file_info->ChangeTime is zero.
- */
- attrs.ia_ctime = temp_attrs.ia_ctime;
- attrs.ia_valid |= ATTR_CTIME;
-
if (attrs.ia_valid) {
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = d_inode(dentry);
@@ -5519,17 +5514,15 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EACCES;
- rc = setattr_prepare(user_ns, dentry, &attrs);
- if (rc)
- return -EINVAL;
-
inode_lock(inode);
- setattr_copy(user_ns, inode, &attrs);
- attrs.ia_valid &= ~ATTR_CTIME;
rc = notify_change(user_ns, dentry, &attrs, NULL);
+ if (!rc) {
+ inode->i_ctime = ctime;
+ mark_inode_dirty(inode);
+ }
inode_unlock(inode);
}
- return 0;
+ return rc;
}
static int set_file_allocation_info(struct ksmbd_work *work,
@@ -5624,6 +5617,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
char *buf)
{
+ struct user_namespace *user_ns;
struct ksmbd_file *parent_fp;
struct dentry *parent;
struct dentry *dentry = fp->filp->f_path.dentry;
@@ -5634,11 +5628,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EACCES;
}
+ user_ns = file_mnt_user_ns(fp->filp);
if (ksmbd_stream_fd(fp))
goto next;
parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(parent, dentry);
+ ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
if (ret) {
dput(parent);
return ret;
@@ -5655,7 +5650,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
}
}
next:
- return smb2_rename(work, fp,
+ return smb2_rename(work, fp, user_ns,
(struct smb2_file_rename_info *)buf,
work->sess->conn->local_nls);
}
@@ -7116,8 +7111,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
netdev->ethtool_ops->get_link_ksettings(netdev, &cmd);
speed = cmd.base.speed;
} else {
- pr_err("%s %s\n", netdev->name,
- "speed is unknown, defaulting to 1Gb/sec");
+ ksmbd_debug(SMB, "%s %s\n", netdev->name,
+ "speed is unknown, defaulting to 1Gb/sec");
speed = SPEED_1000;
}
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index b108b918ec84..43d3123d8b62 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -291,7 +291,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
char *search_pattern,
int (*fn)(struct ksmbd_conn *, int,
struct ksmbd_dir_info *,
- struct user_namespace *,
struct ksmbd_kstat *))
{
int i, rc = 0;
@@ -322,8 +321,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
user_ns,
dir->filp->f_path.dentry->d_parent,
&ksmbd_kstat);
- rc = fn(conn, info_level, d_info,
- user_ns, &ksmbd_kstat);
+ rc = fn(conn, info_level, d_info, &ksmbd_kstat);
if (rc)
break;
if (d_info->out_buf_len <= 0)
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index eb667d85558e..57c667c1be06 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -511,7 +511,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
int (*fn)(struct ksmbd_conn *,
int,
struct ksmbd_dir_info *,
- struct user_namespace *,
struct ksmbd_kstat *));
int ksmbd_extract_shortname(struct ksmbd_conn *conn,
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 5456e3ad943e..0a95cdec8c80 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -274,24 +274,34 @@ static int sid_to_id(struct user_namespace *user_ns,
uid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- if (id > 0) {
- uid = make_kuid(user_ns, id);
- if (uid_valid(uid) && kuid_has_mapping(user_ns, uid)) {
- fattr->cf_uid = uid;
- rc = 0;
- }
+ /*
+ * Translate raw sid into kuid in the server's user
+ * namespace.
+ */
+ uid = make_kuid(&init_user_ns, id);
+
+ /* If this is an idmapped mount, apply the idmapping. */
+ uid = kuid_from_mnt(user_ns, uid);
+ if (uid_valid(uid)) {
+ fattr->cf_uid = uid;
+ rc = 0;
}
} else {
kgid_t gid;
gid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- if (id > 0) {
- gid = make_kgid(user_ns, id);
- if (gid_valid(gid) && kgid_has_mapping(user_ns, gid)) {
- fattr->cf_gid = gid;
- rc = 0;
- }
+ /*
+ * Translate raw sid into kgid in the server's user
+ * namespace.
+ */
+ gid = make_kgid(&init_user_ns, id);
+
+ /* If this is an idmapped mount, apply the idmapping. */
+ gid = kgid_from_mnt(user_ns, gid);
+ if (gid_valid(gid)) {
+ fattr->cf_gid = gid;
+ rc = 0;
}
}
@@ -587,14 +597,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns,
uid_t uid;
unsigned int sid_type = SIDOWNER;
- uid = from_kuid(user_ns, pace->e_uid);
+ uid = posix_acl_uid_translate(user_ns, pace);
if (!uid)
sid_type = SIDUNIX_USER;
id_to_sid(uid, sid_type, sid);
} else if (pace->e_tag == ACL_GROUP) {
gid_t gid;
- gid = from_kgid(user_ns, pace->e_gid);
+ gid = posix_acl_gid_translate(user_ns, pace);
id_to_sid(gid, SIDUNIX_GROUP, sid);
} else if (pace->e_tag == ACL_OTHER && !nt_aces_num) {
smb_copy_sid(sid, &sid_everyone);
@@ -653,12 +663,12 @@ posix_default_acl:
if (pace->e_tag == ACL_USER) {
uid_t uid;
- uid = from_kuid(user_ns, pace->e_uid);
+ uid = posix_acl_uid_translate(user_ns, pace);
id_to_sid(uid, SIDCREATOR_OWNER, sid);
} else if (pace->e_tag == ACL_GROUP) {
gid_t gid;
- gid = from_kgid(user_ns, pace->e_gid);
+ gid = posix_acl_gid_translate(user_ns, pace);
id_to_sid(gid, SIDCREATOR_GROUP, sid);
} else {
kfree(sid);
@@ -723,7 +733,7 @@ static void set_mode_dacl(struct user_namespace *user_ns,
}
/* owner RID */
- uid = from_kuid(user_ns, fattr->cf_uid);
+ uid = from_kuid(&init_user_ns, fattr->cf_uid);
if (uid)
sid = &server_conf.domain_sid;
else
@@ -739,7 +749,7 @@ static void set_mode_dacl(struct user_namespace *user_ns,
ace_size = fill_ace_for_sid(pace, &sid_unix_groups,
ACCESS_ALLOWED, 0, fattr->cf_mode, 0070);
pace->sid.sub_auth[pace->sid.num_subauth++] =
- cpu_to_le32(from_kgid(user_ns, fattr->cf_gid));
+ cpu_to_le32(from_kgid(&init_user_ns, fattr->cf_gid));
pace->size = cpu_to_le16(ace_size + 4);
size += le16_to_cpu(pace->size);
pace = (struct smb_ace *)((char *)pndace + size);
@@ -880,7 +890,7 @@ int build_sec_desc(struct user_namespace *user_ns,
if (!nowner_sid_ptr)
return -ENOMEM;
- uid = from_kuid(user_ns, fattr->cf_uid);
+ uid = from_kuid(&init_user_ns, fattr->cf_uid);
if (!uid)
sid_type = SIDUNIX_USER;
id_to_sid(uid, sid_type, nowner_sid_ptr);
@@ -891,7 +901,7 @@ int build_sec_desc(struct user_namespace *user_ns,
return -ENOMEM;
}
- gid = from_kgid(user_ns, fattr->cf_gid);
+ gid = from_kgid(&init_user_ns, fattr->cf_gid);
id_to_sid(gid, SIDUNIX_GROUP, ngroup_sid_ptr);
offset = sizeof(struct smb_ntsd);
@@ -1234,11 +1244,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
pa_entry = posix_acls->a_entries;
for (i = 0; i < posix_acls->a_count; i++, pa_entry++) {
if (pa_entry->e_tag == ACL_USER)
- id = from_kuid(user_ns,
- pa_entry->e_uid);
+ id = posix_acl_uid_translate(user_ns, pa_entry);
else if (pa_entry->e_tag == ACL_GROUP)
- id = from_kgid(user_ns,
- pa_entry->e_gid);
+ id = posix_acl_gid_translate(user_ns, pa_entry);
else
continue;
@@ -1322,22 +1330,31 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
newattrs.ia_valid |= ATTR_MODE;
newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777);
- inode_lock(inode);
- rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
- inode_unlock(inode);
- if (rc)
- goto out;
-
ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
/* Update posix acls */
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
rc = set_posix_acl(user_ns, inode,
ACL_TYPE_ACCESS, fattr.cf_acls);
- if (S_ISDIR(inode->i_mode) && fattr.cf_dacls)
+ if (rc < 0)
+ ksmbd_debug(SMB,
+ "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
+ rc);
+ if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) {
rc = set_posix_acl(user_ns, inode,
ACL_TYPE_DEFAULT, fattr.cf_dacls);
+ if (rc)
+ ksmbd_debug(SMB,
+ "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
+ rc);
+ }
}
+ inode_lock(inode);
+ rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
+ inode_unlock(inode);
+ if (rc)
+ goto out;
+
/* Check it only calling from SD BUFFER context */
if (type_check && !(le16_to_cpu(pntsd->type) & DACL_PRESENT))
goto out;
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index 940f686a1d95..73e08cad412b 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -209,4 +209,29 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
bool type_check);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
+
+static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
+ struct posix_acl_entry *pace)
+{
+ kuid_t kuid;
+
+ /* If this is an idmapped mount, apply the idmapping. */
+ kuid = kuid_into_mnt(mnt_userns, pace->e_uid);
+
+ /* Translate the kuid into a userspace id ksmbd would see. */
+ return from_kuid(&init_user_ns, kuid);
+}
+
+static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
+ struct posix_acl_entry *pace)
+{
+ kgid_t kgid;
+
+ /* If this is an idmapped mount, apply the idmapping. */
+ kgid = kgid_into_mnt(mnt_userns, pace->e_gid);
+
+ /* Translate the kgid into a userspace id ksmbd would see. */
+ return from_kgid(&init_user_ns, kgid);
+}
+
#endif /* _SMBACL_H */
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 58f530056ac0..52b2556e76b1 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -1168,7 +1168,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
pr_err("failed to map buffer\n");
ret = -ENOMEM;
goto err;
- } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES - 1) {
+ } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) {
pr_err("buffer not fitted into sges\n");
ret = -E2BIG;
ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt,
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index aee28ee6b19c..b047f2980d96 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -69,14 +69,15 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
*
* the reference count of @parent isn't incremented.
*/
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
+int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent,
+ struct dentry *child)
{
struct dentry *dentry;
int ret = 0;
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
- dentry = lookup_one_len(child->d_name.name, parent,
- child->d_name.len);
+ dentry = lookup_one(user_ns, child->d_name.name, parent,
+ child->d_name.len);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out_err;
@@ -102,7 +103,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns,
int ret;
parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(parent, dentry);
+ ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
if (ret) {
dput(parent);
return ret;
@@ -137,7 +138,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
*daccess |= FILE_EXECUTE_LE;
parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(parent, dentry);
+ ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
if (ret) {
dput(parent);
return ret;
@@ -197,6 +198,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
*/
int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
{
+ struct user_namespace *user_ns;
struct path path;
struct dentry *dentry;
int err;
@@ -210,16 +212,16 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
+ user_ns = mnt_user_ns(path.mnt);
mode |= S_IFDIR;
- err = vfs_mkdir(mnt_user_ns(path.mnt), d_inode(path.dentry),
- dentry, mode);
+ err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode);
if (err) {
goto out;
} else if (d_unhashed(dentry)) {
struct dentry *d;
- d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
+ d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
if (IS_ERR(d)) {
err = PTR_ERR(d);
goto out;
@@ -582,6 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
*/
int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
{
+ struct user_namespace *user_ns;
struct path path;
struct dentry *parent;
int err;
@@ -601,8 +604,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
return err;
}
+ user_ns = mnt_user_ns(path.mnt);
parent = dget_parent(path.dentry);
- err = ksmbd_vfs_lock_parent(parent, path.dentry);
+ err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry);
if (err) {
dput(parent);
path_put(&path);
@@ -616,14 +620,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
}
if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
- err = vfs_rmdir(mnt_user_ns(path.mnt), d_inode(parent),
- path.dentry);
+ err = vfs_rmdir(user_ns, d_inode(parent), path.dentry);
if (err && err != -ENOTEMPTY)
ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
err);
} else {
- err = vfs_unlink(mnt_user_ns(path.mnt), d_inode(parent),
- path.dentry, NULL);
+ err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL);
if (err)
ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
err);
@@ -748,7 +750,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work,
if (ksmbd_override_fsids(work))
return -ENOMEM;
- dst_dent = lookup_one_len(dst_name, dst_dent_parent, strlen(dst_name));
+ dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent,
+ strlen(dst_name));
err = PTR_ERR(dst_dent);
if (IS_ERR(dst_dent)) {
pr_err("lookup failed %s [%d]\n", dst_name, err);
@@ -779,6 +782,7 @@ out:
int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
char *newname)
{
+ struct user_namespace *user_ns;
struct path dst_path;
struct dentry *src_dent_parent, *dst_dent_parent;
struct dentry *src_dent, *trap_dent, *src_child;
@@ -808,8 +812,9 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
dget(src_dent);
dget(dst_dent_parent);
- src_child = lookup_one_len(src_dent->d_name.name, src_dent_parent,
- src_dent->d_name.len);
+ user_ns = file_mnt_user_ns(fp->filp);
+ src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent,
+ src_dent->d_name.len);
if (IS_ERR(src_child)) {
err = PTR_ERR(src_child);
goto out_lock;
@@ -823,7 +828,7 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
dput(src_child);
err = __ksmbd_vfs_rename(work,
- file_mnt_user_ns(fp->filp),
+ user_ns,
src_dent_parent,
src_dent,
mnt_user_ns(dst_path.mnt),
@@ -1109,7 +1114,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns,
{
int err = 0;
- err = ksmbd_vfs_lock_parent(dir, dentry);
+ err = ksmbd_vfs_lock_parent(user_ns, dir, dentry);
if (err)
return err;
dget(dentry);
@@ -1385,14 +1390,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac
switch (pa_entry->e_tag) {
case ACL_USER:
xa_entry->type = SMB_ACL_USER;
- xa_entry->uid = from_kuid(user_ns, pa_entry->e_uid);
+ xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry);
break;
case ACL_USER_OBJ:
xa_entry->type = SMB_ACL_USER_OBJ;
break;
case ACL_GROUP:
xa_entry->type = SMB_ACL_GROUP;
- xa_entry->gid = from_kgid(user_ns, pa_entry->e_gid);
+ xa_entry->gid = posix_acl_gid_translate(user_ns, pa_entry);
break;
case ACL_GROUP_OBJ:
xa_entry->type = SMB_ACL_GROUP_OBJ;
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index cb0cba0d5d07..85db50abdb24 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -107,7 +107,8 @@ struct ksmbd_kstat {
__le32 file_attributes;
};
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
+int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent,
+ struct dentry *child);
int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry);
int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
struct dentry *dentry, __le32 *daccess);
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 92d8c61ffd2a..29c1db66bd0f 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -666,22 +666,6 @@ void ksmbd_free_global_file_table(void)
ksmbd_destroy_file_table(&global_ft);
}
-int ksmbd_file_table_flush(struct ksmbd_work *work)
-{
- struct ksmbd_file *fp = NULL;
- unsigned int id;
- int ret;
-
- read_lock(&work->sess->file_table.lock);
- idr_for_each_entry(work->sess->file_table.idr, fp, id) {
- ret = ksmbd_vfs_fsync(work, fp->volatile_id, KSMBD_NO_FID);
- if (ret)
- break;
- }
- read_unlock(&work->sess->file_table.lock);
- return ret;
-}
-
int ksmbd_init_file_table(struct ksmbd_file_table *ft)
{
ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 70dfe6a99f13..448576fbe4b7 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -152,7 +152,6 @@ void ksmbd_close_session_fds(struct ksmbd_work *work);
int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
int ksmbd_init_global_file_table(void);
void ksmbd_free_global_file_table(void);
-int ksmbd_file_table_flush(struct ksmbd_work *work);
void ksmbd_set_fd_limit(unsigned long limit);
/*
diff --git a/fs/locks.c b/fs/locks.c
index 51a5b72ef302..3d6fb4ae847b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2941,12 +2941,10 @@ static int __init filelock_init(void)
int i;
flctx_cache = kmem_cache_create("file_lock_ctx",
- sizeof(struct file_lock_context), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
filelock_cache = kmem_cache_create("file_lock_cache",
- sizeof(struct file_lock), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 68e8d61e28dd..62f8a7ac19c8 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -51,11 +51,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \
#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
static void nilfs_##name##_attr_release(struct kobject *kobj) \
{ \
- struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
- struct the_nilfs *nilfs = container_of(kobj->parent, \
- struct the_nilfs, \
- ns_##parent_name##_kobj); \
- subgroups = nilfs->ns_##parent_name##_subgroups; \
+ struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \
+ struct nilfs_sysfs_##parent_name##_subgroups, \
+ sg_##name##_kobj); \
complete(&subgroups->sg_##name##_kobj_unregister); \
} \
static struct kobj_type nilfs_##name##_ktype = { \
@@ -81,12 +79,12 @@ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
#name); \
if (err) \
- return err; \
- return 0; \
+ kobject_put(kobj); \
+ return err; \
} \
static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
{ \
- kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
+ kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
}
/************************************************************************
@@ -197,14 +195,14 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
}
if (err)
- return err;
+ kobject_put(&root->snapshot_kobj);
- return 0;
+ return err;
}
void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
{
- kobject_del(&root->snapshot_kobj);
+ kobject_put(&root->snapshot_kobj);
}
/************************************************************************
@@ -986,7 +984,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
"%s", sb->s_id);
if (err)
- goto free_dev_subgroups;
+ goto cleanup_dev_kobject;
err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
if (err)
@@ -1023,9 +1021,7 @@ delete_mounted_snapshots_group:
nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
cleanup_dev_kobject:
- kobject_del(&nilfs->ns_dev_kobj);
-
-free_dev_subgroups:
+ kobject_put(&nilfs->ns_dev_kobj);
kfree(nilfs->ns_dev_subgroups);
failed_create_device_group:
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b7b01a380ce..c8bfc01da5d7 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -792,14 +792,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
void nilfs_put_root(struct nilfs_root *root)
{
- if (refcount_dec_and_test(&root->count)) {
- struct the_nilfs *nilfs = root->nilfs;
+ struct the_nilfs *nilfs = root->nilfs;
- nilfs_sysfs_delete_snapshot_group(root);
-
- spin_lock(&nilfs->ns_cptree_lock);
+ if (refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) {
rb_erase(&root->rb_node, &nilfs->ns_cptree);
spin_unlock(&nilfs->ns_cptree_lock);
+
+ nilfs_sysfs_delete_snapshot_group(root);
iput(root->ifile);
kfree(root);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 95006d1d29ab..fa1d99101f89 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -531,6 +531,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
/* Someone else created list structure for us */
if (inode)
fsnotify_put_inode_ref(inode);
+ fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 1fa1f52763f0..6d4342bad9f1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- return try_get_compound_head(buf->page, 1);
+ return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ee0ce8cecc4a..49be8c8ef555 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -98,27 +98,17 @@
void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
- char *buf;
- size_t size;
char tcomm[64];
- int ret;
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
else
__get_task_comm(tcomm, sizeof(tcomm), p);
- size = seq_get_buf(m, &buf);
- if (escape) {
- ret = string_escape_str(tcomm, buf, size,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- if (ret >= size)
- ret = -1;
- } else {
- ret = strscpy(buf, tcomm, size);
- }
-
- seq_commit(m, ret);
+ if (escape)
+ seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ else
+ seq_printf(m, "%.64s", tcomm);
}
/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e5b5f7709d48..533d5836eb9a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -95,6 +95,7 @@
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -1674,8 +1675,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- if (same_thread_group(current, p))
+ if (same_thread_group(current, p)) {
set_task_comm(p, buffer);
+ proc_comm_connector(p);
+ }
else
count = -EINVAL;
diff --git a/fs/select.c b/fs/select.c
index e83e563a351d..945896d0ac9e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ bits = kvmalloc(alloc_size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
@@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
- GFP_KERNEL_ACCOUNT);
+ GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9a86d3ec2cb6..c4e0cd1c1c8c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -330,6 +330,15 @@ xfs_set_inode_alloc(
return xfs_is_inode32(mp) ? maxagi : agcount;
}
+static bool
+xfs_buftarg_is_dax(
+ struct super_block *sb,
+ struct xfs_buftarg *bt)
+{
+ return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
+ bdev_nr_sectors(bt->bt_bdev));
+}
+
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -1588,11 +1597,10 @@ xfs_fs_fill_super(
xfs_warn(mp,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
- sb->s_blocksize);
+ datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
if (mp->m_rtdev_targp)
- rtdev_is_dax = bdev_dax_supported(
- mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+ rtdev_is_dax = xfs_buftarg_is_dax(sb,
+ mp->m_rtdev_targp);
if (!rtdev_is_dax && !datadev_is_dax) {
xfs_alert(mp,
"DAX unsupported by block device. Turning off DAX.");