diff options
-rw-r--r-- | Documentation/filesystems/porting.rst | 12 | ||||
-rw-r--r-- | block/Kconfig | 20 | ||||
-rw-r--r-- | block/bdev.c | 258 | ||||
-rw-r--r-- | drivers/md/dm.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/fs-ioctl.c | 4 | ||||
-rw-r--r-- | fs/bcachefs/super-io.c | 19 | ||||
-rw-r--r-- | fs/bcachefs/super_types.h | 1 | ||||
-rw-r--r-- | fs/btrfs/super.c | 2 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 4 | ||||
-rw-r--r-- | fs/ext4/super.c | 8 | ||||
-rw-r--r-- | fs/f2fs/file.c | 4 | ||||
-rw-r--r-- | fs/nilfs2/super.c | 8 | ||||
-rw-r--r-- | fs/super.c | 498 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 24 | ||||
-rw-r--r-- | include/linux/blk_types.h | 8 | ||||
-rw-r--r-- | include/linux/blkdev.h | 29 | ||||
-rw-r--r-- | include/linux/fs.h | 19 |
18 files changed, 531 insertions, 395 deletions
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 878e72b2f8b7..ced3a6761329 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1061,3 +1061,15 @@ export_operations ->encode_fh() no longer has a default implementation to encode FILEID_INO32_GEN* file handles. Filesystems that used the default implementation may use the generic helper generic_encode_ino32_fh() explicitly. + +--- + +**recommended** + +Block device freezing and thawing have been moved to holder operations. + +Before this change, get_active_super() would only be able to find the +superblock of the main block device, i.e., the one stored in sb->s_bdev. Block +device freezing now works for any block device owned by a given superblock, not +just the main block device. The get_active_super() helper and bd_fsfreeze_sb +pointer are gone. diff --git a/block/Kconfig b/block/Kconfig index 55ae2286a4de..1de4682d48cc 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10 select CRC_T10DIF select CRC64_ROCKSOFT +config BLK_DEV_WRITE_MOUNTED + bool "Allow writing to mounted block devices" + default y + help + When a block device is mounted, writing to its buffer cache is very + likely going to cause filesystem corruption. It is also rather easy to + crash the kernel in this way since the filesystem has no practical way + of detecting these writes to buffer cache and verifying its metadata + integrity. However there are some setups that need this capability + like running fsck on read-only mounted root device, modifying some + features on mounted ext4 filesystem, and similar. If you say N, the + kernel will prevent processes from writing to block devices that are + mounted by filesystems which provides some more protection from runaway + privileged processes and generally makes it much harder to crash + filesystem drivers. Note however that this does not prevent + underlying device(s) from being modified by other means, e.g. by + directly submitting SCSI commands or through access to lower layers of + storage stack. If in doubt, say Y. The configuration can be overridden + with the bdev_allow_write_mounted boot option. + config BLK_DEV_ZONED bool "Zoned block device support" select MQ_IOSCHED_DEADLINE diff --git a/block/bdev.c b/block/bdev.c index 750aec178b6a..e9f1b12bd75c 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -30,6 +30,9 @@ #include "../fs/internal.h" #include "blk.h" +/* Should we allow writing to mounted block devices? */ +static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); + struct bdev_inode { struct block_device bdev; struct inode vfs_inode; @@ -207,85 +210,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) EXPORT_SYMBOL(sync_blockdev_range); /** - * freeze_bdev - lock a filesystem and force it into a consistent state + * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * freeze requests arrive simultaneously. It counts up in bdev_freeze() and + * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. + * + * Return: On success zero is returned, negative error code on failure. */ -int freeze_bdev(struct block_device *bdev) +int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } - bdev->bd_fsfreeze_sb = sb; -sync: - sync_blockdev(bdev); -done: + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); + } + + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(freeze_bdev); +EXPORT_SYMBOL(bdev_freeze); /** - * thaw_bdev - unlock filesystem + * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. */ -int thaw_bdev(struct block_device *bdev) +int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. + */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) goto out; error = 0; - if (--bdev->bd_fsfreeze_count > 0) + if (nr_freeze > 0) goto out; - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; + atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(thaw_bdev); +EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs @@ -729,9 +735,60 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + +static bool bdev_writes_blocked(struct block_device *bdev) +{ + return bdev->bd_writers == -1; +} + +static void bdev_block_writes(struct block_device *bdev) +{ + bdev->bd_writers = -1; +} + +static void bdev_unblock_writes(struct block_device *bdev) +{ + bdev->bd_writers = 0; +} + +static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return true; + /* Writes blocked? */ + if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) + return false; + if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) + return false; + return true; +} + +static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Claim exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_block_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers++; +} + +static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Yield exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_unblock_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers--; +} + /** - * blkdev_get_by_dev - open a block device by device number + * bdev_open_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -743,32 +800,46 @@ void blkdev_put_no_open(struct block_device *bdev) * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). + * number. Everything else should use bdev_open_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) { - bool unblock_events = true; + struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), + GFP_KERNEL); struct block_device *bdev; + bool unblock_events = true; struct gendisk *disk; int ret; + if (!handle) + return ERR_PTR(-ENOMEM); + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - return ERR_PTR(ret); + goto free_handle; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { + ret = -EINVAL; + goto free_handle; + } bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); + if (!bdev) { + ret = -ENXIO; + goto free_handle; + } disk = bdev->bd_disk; if (holder) { @@ -791,12 +862,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; + ret = -EBUSY; + if (!bdev_may_open(bdev, mode)) + goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; + bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); @@ -817,7 +892,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - return bdev; + handle->bdev = bdev; + handle->holder = holder; + handle->mode = mode; + return handle; put_module: module_put(disk->fops->owner); abort_claiming: @@ -827,34 +905,14 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); +free_handle: + kfree(handle); return ERR_PTR(ret); } -EXPORT_SYMBOL(blkdev_get_by_dev); - -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) -{ - struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL); - struct block_device *bdev; - - if (!handle) - return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (IS_ERR(bdev)) { - kfree(handle); - return ERR_CAST(bdev); - } - handle->bdev = bdev; - handle->holder = holder; - if (holder) - mode |= BLK_OPEN_EXCL; - handle->mode = mode; - return handle; -} EXPORT_SYMBOL(bdev_open_by_dev); /** - * blkdev_get_by_path - open a block device by name + * bdev_open_by_path - open a block device by name * @path: path to the block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -868,29 +926,9 @@ EXPORT_SYMBOL(bdev_open_by_dev); * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) -{ - struct block_device *bdev; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, holder); - return ERR_PTR(-EACCES); - } - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_path); - struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { @@ -913,8 +951,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_open_by_path); -void blkdev_put(struct block_device *bdev, void *holder) +void bdev_release(struct bdev_handle *handle) { + struct block_device *bdev = handle->bdev; struct gendisk *disk = bdev->bd_disk; /* @@ -928,8 +967,10 @@ void blkdev_put(struct block_device *bdev, void *holder) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (holder) - bd_end_claim(bdev, holder); + bdev_yield_write_access(bdev, handle->mode); + + if (handle->holder) + bd_end_claim(bdev, handle->holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -946,12 +987,6 @@ void blkdev_put(struct block_device *bdev, void *holder) module_put(disk->fops->owner); blkdev_put_no_open(bdev); -} -EXPORT_SYMBOL(blkdev_put); - -void bdev_release(struct bdev_handle *handle) -{ - blkdev_put(handle->bdev, handle->holder); kfree(handle); } EXPORT_SYMBOL(bdev_release); @@ -1102,3 +1137,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) blkdev_put_no_open(bdev); } + +static int __init setup_bdev_allow_write_mounted(char *str) +{ + if (kstrtobool(str, &bdev_allow_write_mounted)) + pr_warn("Invalid option string for bdev_allow_write_mounted:" + " '%s'\n", str); + return 1; +} +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 23c32cd1f1d8..8dcabf84d866 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2675,7 +2675,7 @@ static int lock_fs(struct mapped_device *md) WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - r = freeze_bdev(md->disk->part0); + r = bdev_freeze(md->disk->part0); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; @@ -2685,7 +2685,7 @@ static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->disk->part0); + bdev_thaw(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 14d5cc6f90d7..94e5a567fa44 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -289,14 +289,14 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(c->vfs_sb->s_bdev); + ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) goto err; bch2_journal_flush(&c->journal); c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev); + bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c98d8cc2a79..78013deda9df 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -164,8 +164,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->holder); + if (!IS_ERR_OR_NULL(sb->bdev_handle)) + bdev_release(sb->bdev_handle); kfree(sb->holder); kfree(sb->sb_name); @@ -725,21 +725,22 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev) && - PTR_ERR(sb->bdev) == -EACCES && + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev_handle) && + PTR_ERR(sb->bdev_handle) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev)) + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev_handle)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev)) { - ret = PTR_ERR(sb->bdev); + if (IS_ERR(sb->bdev_handle)) { + ret = PTR_ERR(sb->bdev_handle); goto out; } + sb->bdev = sb->bdev_handle->bdev; ret = bch2_sb_realloc(sb, 0); if (ret) { diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 9c1fd4ca2b10..b2119686e2e1 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,6 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; + struct bdev_handle *bdev_handle; struct block_device *bdev; char *sb_name; struct bio *bio; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef256b944c72..2a3a5bf102dc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1406,6 +1406,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, return ERR_PTR(error); } + /* No support for restricting writes to btrfs devices yet... */ + mode &= ~BLK_OPEN_RESTRICT_WRITES; /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4f931f80cb34..aa6be510eb8f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c5fcf377ab1f..0980845c8b8f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5864,11 +5864,9 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, struct ext4_super_block *es; int errno; - /* see get_tree_bdev why this is needed and safe */ - up_write(&sb->s_umount); - bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, - sb, &fs_holder_ops); - down_write(&sb->s_umount); + bdev_handle = bdev_open_by_dev(j_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + sb, &fs_holder_ops); if (IS_ERR(bdev_handle)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e50363583f01..4580dfefd5e9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2239,11 +2239,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index a5d1fa4e7552..df8674173b22 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1314,15 +1314,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, return ERR_CAST(s); if (!s->s_root) { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * __invalidate_device()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - up_write(&s->s_umount); err = setup_bdev_super(s, flags, NULL); - down_write(&s->s_umount); if (!err) err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); diff --git a/fs/super.c b/fs/super.c index 6fe482371633..e35936000408 100644 --- a/fs/super.c +++ b/fs/super.c @@ -81,16 +81,13 @@ static inline void super_unlock_shared(struct super_block *sb) super_unlock(sb, false); } -static inline bool wait_born(struct super_block *sb) +static bool super_flags(const struct super_block *sb, unsigned int flags) { - unsigned int flags; - /* * Pairs with smp_store_release() in super_wake() and ensures - * that we see SB_BORN or SB_DYING after we're woken. + * that we see @flags after we're woken. */ - flags = smp_load_acquire(&sb->s_flags); - return flags & (SB_BORN | SB_DYING); + return smp_load_acquire(&sb->s_flags) & flags; } /** @@ -105,15 +102,21 @@ static inline bool wait_born(struct super_block *sb) * * The caller must have acquired a temporary reference on @sb->s_count. * - * Return: This returns true if SB_BORN was set, false if SB_DYING was - * set. The function acquires s_umount and returns with it held. + * Return: The function returns true if SB_BORN was set and with + * s_umount held. The function returns false if SB_DYING was + * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { - lockdep_assert_not_held(&sb->s_umount); -relock: + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); + + /* Don't pointlessly acquire s_umount. */ + if (super_flags(sb, SB_DYING)) + return false; + __super_lock(sb, excl); /* @@ -121,32 +124,22 @@ relock: * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ - if (sb->s_flags & SB_DYING) + if (sb->s_flags & SB_DYING) { + super_unlock(sb, excl); return false; + } - /* Has called ->get_tree() successfully. */ - if (sb->s_flags & SB_BORN) - return true; - - super_unlock(sb, excl); - - /* wait until the superblock is ready or dying */ - wait_var_event(&sb->s_flags, wait_born(sb)); - - /* - * Neither SB_BORN nor SB_DYING are ever unset so we never loop. - * Just reacquire @sb->s_umount for the caller. - */ - goto relock; + WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); + return true; } -/* wait and acquire read-side of @sb->s_umount */ +/* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } -/* wait and acquire write-side of @sb->s_umount */ +/* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); @@ -521,48 +514,7 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); /** - * grab_super - acquire an active reference - * @s: reference we are trying to make active - * - * Tries to acquire an active reference. grab_super() is used when we - * had just found a superblock in super_blocks or fs_type->fs_supers - * and want to turn it into a full-blown active reference. grab_super() - * is called with sb_lock held and drops it. Returns 1 in case of - * success, 0 if we had failed (superblock contents was already dead or - * dying when grab_super() had been called). Note that this is only - * called for superblocks not in rundown mode (== ones still on ->fs_supers - * of their type), so increment of ->s_count is OK here. - */ -static int grab_super(struct super_block *s) __releases(sb_lock) -{ - bool born; - - s->s_count++; - spin_unlock(&sb_lock); - born = super_lock_excl(s); - if (born && atomic_inc_not_zero(&s->s_active)) { - put_super(s); - return 1; - } - super_unlock_excl(s); - put_super(s); - return 0; -} - -static inline bool wait_dead(struct super_block *sb) -{ - unsigned int flags; - - /* - * Pairs with memory barrier in super_wake() and ensures - * that we see SB_DEAD after we're woken. - */ - flags = smp_load_acquire(&sb->s_flags); - return flags & SB_DEAD; -} - -/** - * grab_super_dead - acquire an active reference to a superblock + * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for @@ -573,17 +525,21 @@ static inline bool wait_dead(struct super_block *sb) * Return: This returns true if an active reference could be acquired, * false if not. */ -static bool grab_super_dead(struct super_block *sb) +static bool grab_super(struct super_block *sb) { + bool locked; sb->s_count++; - if (grab_super(sb)) { - put_super(sb); - lockdep_assert_held(&sb->s_umount); - return true; + spin_unlock(&sb_lock); + locked = super_lock_excl(sb); + if (locked) { + if (atomic_inc_not_zero(&sb->s_active)) { + put_super(sb); + return true; + } + super_unlock_excl(sb); } - wait_var_event(&sb->s_flags, wait_dead(sb)); - lockdep_assert_not_held(&sb->s_umount); + wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } @@ -834,7 +790,7 @@ share_extant_sb: warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; @@ -878,7 +834,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; @@ -930,8 +886,7 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - /* Pairs with memory marrier in super_wake(). */ - if (smp_load_acquire(&sb->s_flags) & SB_DYING) + if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -961,15 +916,17 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -997,15 +954,17 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -1019,34 +978,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_active_super - get an active reference to the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. Returns the superblock with an active - * reference or %NULL if none was found. - */ -struct super_block *get_active_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev == bdev) { - if (!grab_super(sb)) - return NULL; - super_unlock_excl(sb); - return sb; - } - } - spin_unlock(&sb_lock); - return NULL; -} - struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; @@ -1054,15 +985,17 @@ struct super_block *user_get_super(dev_t dev, bool excl) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ - born = super_lock(sb, excl); - if (born && sb->s_root) - return sb; - super_unlock(sb, excl); + locked = super_lock(sb, excl); + if (locked) { + if (sb->s_root) + return sb; + super_unlock(sb, excl); + } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -1173,9 +1106,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1186,7 +1119,8 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - super_unlock_excl(sb); + if (locked) + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1209,16 +1143,17 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root) { + if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - } else { - super_unlock_excl(sb); + return; } + if (locked) + super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) @@ -1428,11 +1363,11 @@ EXPORT_SYMBOL(sget_dev); * * The function must be called with bdev->bd_holder_lock and releases it. */ -static struct super_block *bdev_super_lock_shared(struct block_device *bdev) +static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; - bool born; + bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); @@ -1442,19 +1377,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_holder_lock); - born = super_lock_shared(sb); - if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { - super_unlock_shared(sb); - put_super(sb); - return NULL; - } + locked = super_lock(sb, excl); + /* - * The superblock is active and we hold s_umount, we can drop our - * temporary reference now. - */ + * If the superblock wasn't already SB_DYING then we hold + * s_umount and can safely drop our temporary reference. + */ put_super(sb); + + if (!locked) + return NULL; + + if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock(sb, excl); + return NULL; + } + return sb; } @@ -1462,7 +1403,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; @@ -1480,16 +1421,110 @@ static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; + sync_filesystem(sb); super_unlock_shared(sb); } +static struct super_block *get_bdev_super(struct block_device *bdev) +{ + bool active = false; + struct super_block *sb; + + sb = bdev_super_lock(bdev, true); + if (sb) { + active = atomic_inc_not_zero(&sb->s_active); + super_unlock_excl(sb); + } + if (!active) + return NULL; + return sb; +} + +/** + * fs_bdev_freeze - freeze owning filesystem of block device + * @bdev: block device + * + * Freeze the filesystem that owns this block device if it is still + * active. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. + */ +static int fs_bdev_freeze(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (!sb) + return -EINVAL; + + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + else + error = freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + if (!error) + error = sync_blockdev(bdev); + deactivate_super(sb); + return error; +} + +/** + * fs_bdev_thaw - thaw owning filesystem of block device + * @bdev: block device + * + * Thaw the filesystem that owns this block device. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the thaw was successful zero is returned. If the thaw + * failed a negative error code is returned. If this function + * returns zero it doesn't mean that the filesystem is unfrozen + * as it may have been frozen multiple times (kernel may hold a + * freeze or might be frozen from other block devices). + */ +static int fs_bdev_thaw(struct block_device *bdev) +{ + struct super_block *sb; + int error; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (WARN_ON_ONCE(!sb)) + return -EINVAL; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + else + error = thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + deactivate_super(sb); + return error; +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, + .freeze = fs_bdev_freeze, + .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1519,15 +1554,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, } /* - * Until SB_BORN flag is set, there can be no active superblock - * references and thus no filesystem freezing. get_active_super() will - * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. - * - * It is enough to check bdev was not frozen before we set s_bdev. + * It is enough to check bdev was not frozen before we set + * s_bdev as freezing will wait until SB_BORN is set. */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_release(bdev_handle); @@ -1540,7 +1570,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); - mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, @@ -1585,15 +1614,7 @@ int get_tree_bdev(struct fs_context *fc, return -EBUSY; } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); - __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { @@ -1637,15 +1658,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, return ERR_PTR(-EBUSY); } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); - __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { @@ -1914,6 +1927,47 @@ static int wait_for_partially_frozen(struct super_block *sb) return ret; } +#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) +#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) + +static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + ++sb->s_writers.freeze_kcount; + if (who & FREEZE_HOLDER_USERSPACE) + ++sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) + --sb->s_writers.freeze_kcount; + if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) + --sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_kcount == 0; + if (who & FREEZE_HOLDER_USERSPACE) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_ucount == 0; + return false; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1926,6 +1980,7 @@ static int wait_for_partially_frozen(struct super_block *sb) * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or @@ -1933,6 +1988,13 @@ static int wait_for_partially_frozen(struct super_block *sb) * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * + * A filesystem may hold multiple devices and thus a filesystems may be + * frozen through the block layer via multiple block devices. In this + * case the request is marked as being allowed to nest by passing + * FREEZE_MAY_NEST. The filesystem remains frozen until all block + * devices are unfrozen. If multiple freezes are attempted without + * FREEZE_MAY_NEST -EBUSY will be returned. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1957,31 +2019,29 @@ static int wait_for_partially_frozen(struct super_block *sb) * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while freezing!"); + return -EINVAL; + } atomic_inc(&sb->s_active); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (sb->s_writers.freeze_holders & who) { - deactivate_locked_super(sb); - return -EBUSY; - } - - WARN_ON(sb->s_writers.freeze_holders == 0); - - /* - * Someone else already holds this type of freeze; share the - * freeze and assign the active ref to the freeze. - */ - sb->s_writers.freeze_holders |= who; - super_unlock_excl(sb); - return 0; + if (may_freeze(sb, who)) + ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); + else + ret = -EBUSY; + /* All freezers share a single active reference. */ + deactivate_locked_super(sb); + return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { @@ -1994,14 +2054,9 @@ retry: goto retry; } - if (!(sb->s_flags & SB_BORN)) { - super_unlock_excl(sb); - return 0; /* sic - it's "nothing to do" */ - } - if (sb_rdonly(sb)) { /* Nothing to do really... */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); @@ -2012,8 +2067,7 @@ retry: /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); + __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -2049,7 +2103,7 @@ retry: * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); @@ -2066,34 +2120,22 @@ EXPORT_SYMBOL(freeze_super); */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { - int error; + int error = -EINVAL; - if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (!(sb->s_writers.freeze_holders & who)) { - super_unlock_excl(sb); - return -EINVAL; - } + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) + goto out_unlock; - /* - * Freeze is shared with someone else. Release our hold and - * drop the active ref that freeze_super assigned to the - * freezer. - */ - if (sb->s_writers.freeze_holders & ~who) { - sb->s_writers.freeze_holders &= ~who; - deactivate_locked_super(sb); - return 0; - } - } else { - super_unlock_excl(sb); - return -EINVAL; - } + /* + * All freezers share a single active reference. + * So just unlock in case there are any left. + */ + if (freeze_dec(sb, who)) + goto out_unlock; if (sb_rdonly(sb)) { - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); - goto out; + goto out_deactivate; } lockdep_sb_freeze_acquire(sb); @@ -2101,20 +2143,23 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + pr_err("VFS: Filesystem thaw failed\n"); + freeze_inc(sb, who); lockdep_sb_freeze_release(sb); - super_unlock_excl(sb); - return error; + goto out_unlock; } } - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); -out: +out_deactivate: deactivate_locked_super(sb); return 0; + +out_unlock: + super_unlock_excl(sb); + return error; } /** @@ -2128,11 +2173,18 @@ out: * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed + * + * A filesystem may hold multiple devices and thus a filesystems may + * have been frozen through the block layer via multiple block devices. + * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while thawing!"); + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while thawing!"); + return -EINVAL; + } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..57076a25f17d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -482,9 +482,9 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - if (!freeze_bdev(mp->m_super->s_bdev)) { + if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(mp->m_super->s_bdev); + bdev_thaw(mp->m_super->s_bdev); } break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 764304595e8b..07857d967ee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -366,8 +366,9 @@ xfs_blkdev_get( { int error = 0; - *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, - mp->m_super, &fs_holder_ops); + *handlep = bdev_open_by_path(name, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + mp->m_super, &fs_holder_ops); if (IS_ERR(*handlep)) { error = PTR_ERR(*handlep); *handlep = NULL; @@ -439,18 +440,12 @@ xfs_open_devices( int error; /* - * blkdev_put() can't be called under s_umount, see the comment - * in get_tree_bdev() for more details - */ - up_write(&sb->s_umount); - - /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); if (error) - goto out_relock; + return error; } if (mp->m_rtname) { @@ -493,10 +488,7 @@ xfs_open_devices( bdev_release(logdev_handle); } - error = 0; -out_relock: - down_write(&sb->s_umount); - return error; + return 0; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -509,7 +501,7 @@ out_relock: out_close_logdev: if (logdev_handle) bdev_release(logdev_handle); - goto out_relock; + return error; } /* @@ -759,10 +751,6 @@ static void xfs_mount_free( struct xfs_mount *mp) { - /* - * Free the buftargs here because blkdev_put needs to be called outside - * of sb->s_umount, which is held around the call to ->put_super. - */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); if (mp->m_rtdev_targp) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b29ebd53417d..7c2316c91cbd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -57,20 +57,18 @@ struct block_device { void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; - /* The counter of freeze processes */ - int bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir; - /* Mutex for freeze */ - struct mutex bd_fsfreeze_mutex; - struct super_block *bd_fsfreeze_sb; + atomic_t bd_fsfreeze_count; /* number of freeze requests */ + struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; #ifdef CONFIG_FAIL_MAKE_REQUEST bool bd_make_it_fail; #endif bool bd_ro_warned; + int bd_writers; /* * keep this out-of-line as it's both big and not needed in the fast * path diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88e9dd4b71fb..c30a98e08423 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (specialy hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ +#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) struct gendisk { /* @@ -1468,8 +1470,23 @@ struct blk_holder_ops { * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); + + /* + * Freeze the file system mounted on the block device. + */ + int (*freeze)(struct block_device *bdev); + + /* + * Thaw the file system mounted on the block device. + */ + int (*thaw)(struct block_device *bdev); }; +/* + * For filesystems using @fs_holder_ops, the @holder argument passed to + * helpers used to open and claim block devices via + * bd_prepare_to_claim() must point to a superblock. + */ extern const struct blk_holder_ops fs_holder_ops; /* @@ -1477,7 +1494,8 @@ extern const struct blk_holder_ops fs_holder_ops; * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ - (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) + (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ + (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) struct bdev_handle { struct block_device *bdev; @@ -1485,10 +1503,6 @@ struct bdev_handle { blk_mode_t mode; }; -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, @@ -1496,7 +1510,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void blkdev_put(struct block_device *bdev, void *holder); void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ @@ -1541,8 +1554,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) } #endif /* CONFIG_BLOCK */ -int freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev); +int bdev_freeze(struct block_device *bdev); +int bdev_thaw(struct block_device *bdev); struct io_comp_batch { struct request *req_list; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d58376ed39e..7e7399bd8bc5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1187,7 +1187,8 @@ enum { struct sb_writers { unsigned short frozen; /* Is sb frozen? */ - unsigned short freeze_holders; /* Who froze fs? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -2053,9 +2054,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { @@ -3131,7 +3147,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); |