diff options
Diffstat (limited to 'drivers/block/loop.c')
-rw-r--r-- | drivers/block/loop.c | 393 |
1 files changed, 179 insertions, 214 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a58595f5ee2c..e2cb51810e89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1,54 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/block/loop.c - * - * Written by Theodore Ts'o, 3/29/93 - * - * Copyright 1993 by Theodore Ts'o. Redistribution of this file is - * permitted under the GNU General Public License. - * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * - * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 - * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 - * - * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 - * - * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 - * - * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 - * - * Loadable modules and other fixes by AK, 1998 - * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * - * Maximum number of loop devices now dynamic via max_loop module parameter. - * Russell Kroll <rkroll@exploits.org> 19990701 - * - * Maximum number of loop devices when compiled-in now selectable by passing - * max_loop=<1-255> to the kernel on boot. - * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 - * - * Completely rewrite request handling to be make_request_fn style and - * non blocking, pushing work to a helper thread. Lots of fixes from - * Al Viro too. - * Jens Axboe <axboe@suse.de>, Nov 2000 - * - * Support up to 256 loop devices - * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 - * - * Support for falling back on the write file operation when the address space - * operations write_begin is not available on the backing filesystem. - * Anton Altaparmakov, 16 Feb 2005 - * - * Still To Fix: - * - Advisory locking is ignored here. - * - Should use an own CAP_* category instead of CAP_SYS_ADMIN - * + * Copyright 1993 by Theodore Ts'o. */ - #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> @@ -59,7 +12,6 @@ #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> -#include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> @@ -80,10 +32,62 @@ #include <linux/blk-cgroup.h> #include <linux/sched/mm.h> #include <linux/statfs.h> +#include <linux/uaccess.h> +#include <linux/blk-mq.h> +#include <linux/spinlock.h> +#include <uapi/linux/loop.h> + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, + Lo_deleting, +}; -#include "loop.h" +struct loop_func_table; + +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file * lo_backing_file; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; -#include <linux/uaccess.h> +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH (128) @@ -314,15 +318,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(lo->lo_queue)) { - ret = -EOPNOTSUPP; - goto out; - } + if (!bdev_max_discard_sectors(lo->lo_device)) + return -EOPNOTSUPP; ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) - ret = -EIO; - out: + return -EIO; return ret; } @@ -572,6 +573,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!file) return -EBADF; + + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + is_loop = is_loop_device(file); error = loop_global_lock_killable(lo, is_loop); if (error) @@ -626,13 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (partscan) loop_reread_partitions(lo); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_err: loop_global_unlock(lo, is_loop); out_putf: fput(file); - return error; + goto done; } /* loop sysfs attributes */ @@ -762,7 +772,7 @@ static void loop_config_discard(struct loop_device *lo) struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: + granularity = bdev_discard_granularity(I_BDEV(inode)) ?: queue_physical_block_size(backingq); /* @@ -787,14 +797,11 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = granularity; blk_queue_max_discard_sectors(q, max_discard_sectors); blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } else { q->limits.discard_granularity = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); } - q->limits.discard_alignment = 0; } struct loop_worker { @@ -808,8 +815,6 @@ struct loop_worker { }; static void loop_workfn(struct work_struct *work); -static void loop_rootcg_workfn(struct work_struct *work); -static void loop_free_idle_workers(struct timer_list *timer); #ifdef CONFIG_BLK_CGROUP static inline int queue_on_root_worker(struct cgroup_subsys_state *css) @@ -893,6 +898,39 @@ queue_work: spin_unlock_irq(&lo->lo_work_lock); } +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_free_idle_workers(struct loop_device *lo, bool delete_all) +{ + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (!delete_all && + time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); +} + +static void loop_free_idle_workers_timer(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + + return loop_free_idle_workers(lo, false); +} + static void loop_update_rotational(struct loop_device *lo) { struct file *file = lo->lo_backing_file; @@ -903,7 +941,7 @@ static void loop_update_rotational(struct loop_device *lo) /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */ if (file_bdev) - nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); + nonrot = bdev_nonrot(file_bdev); if (nonrot) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); @@ -967,6 +1005,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1011,24 +1052,19 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - lo->workqueue = alloc_workqueue("loop%d", - WQ_UNBOUND | WQ_FREEZABLE, - 0, - lo->lo_number); if (!lo->workqueue) { - error = -ENOMEM; - goto out_unlock; + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; + goto out_unlock; + } } disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); - INIT_LIST_HEAD(&lo->rootcg_cmd_list); - INIT_LIST_HEAD(&lo->idle_worker_list); - lo->worker_tree = RB_ROOT; - timer_setup(&lo->timer, loop_free_idle_workers, - TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1073,7 +1109,12 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - return 0; + + error = 0; +done: + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + return error; out_unlock: loop_global_unlock(lo, is_loop); @@ -1084,53 +1125,24 @@ out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - return error; + goto done; } static void __loop_clr_fd(struct loop_device *lo, bool release) { struct file *filp; gfp_t gfp = lo->old_gfp_mask; - struct loop_worker *pos, *worker; - - /* - * Flush loop_configure() and loop_change_fd(). It is acceptable for - * loop_validate_file() to succeed, for actual clear operation has not - * started yet. - */ - mutex_lock(&loop_validate_mutex); - mutex_unlock(&loop_validate_mutex); - /* - * loop_validate_file() now fails because l->lo_state != Lo_bound - * became visible. - */ - - /* - * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close() - * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if - * lo->lo_state has changed while waiting for lo->lo_mutex. - */ - mutex_lock(&lo->lo_mutex); - BUG_ON(lo->lo_state != Lo_rundown); - mutex_unlock(&lo->lo_mutex); if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) blk_queue_write_cache(lo->lo_queue, false, false); - /* freeze request queue during the transition */ - blk_mq_freeze_queue(lo->lo_queue); - - destroy_workqueue(lo->workqueue); - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - spin_unlock_irq(&lo->lo_work_lock); - del_timer_sync(&lo->timer); + /* + * Freeze the request queue when unbinding on a live file descriptor and + * thus an open device. When called from ->release we are guaranteed + * that there is no I/O in progress already. + */ + if (!release) + blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; @@ -1151,7 +1163,8 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); + if (!release) + blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); @@ -1202,11 +1215,20 @@ static int loop_clr_fd(struct loop_device *lo) { int err; - err = mutex_lock_killable(&lo->lo_mutex); + /* + * Since lo_ioctl() is called without locks held, it is possible that + * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel. + * + * Therefore, use global lock when setting Lo_rundown state in order to + * make sure that loop_validate_file() will fail if the "struct file" + * which loop_configure()/loop_change_fd() found via fget() was this + * loop device. + */ + err = loop_global_lock_killable(lo, true); if (err) return err; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return -ENXIO; } /* @@ -1219,13 +1241,13 @@ static int loop_clr_fd(struct loop_device *lo) * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ - if (atomic_read(&lo->lo_refcnt) > 1) { + if (disk_openers(lo->lo_disk) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); return 0; } lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); + loop_global_unlock(lo, true); __loop_clr_fd(lo, false); return 0; @@ -1257,15 +1279,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { - /* If any pages were dirtied after invalidate_bdev(), try again */ - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - prev_lo_flags = lo->lo_flags; err = loop_set_status_from_info(lo, info); @@ -1476,21 +1489,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - - /* invalidate_bdev should have truncated all the pages */ - if (lo->lo_device->bd_inode->i_mapping->nrpages) { - err = -EAGAIN; - pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", - __func__, lo->lo_number, lo->lo_file_name, - lo->lo_device->bd_inode->i_mapping->nrpages); - goto out_unfreeze; - } - blk_queue_logical_block_size(lo->lo_queue, arg); blk_queue_physical_block_size(lo->lo_queue, arg); blk_queue_io_min(lo->lo_queue, arg); loop_update_dio(lo); -out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); return err; @@ -1720,33 +1722,15 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static int lo_open(struct block_device *bdev, fmode_t mode) -{ - struct loop_device *lo = bdev->bd_disk->private_data; - int err; - - err = mutex_lock_killable(&lo->lo_mutex); - if (err) - return err; - if (lo->lo_state == Lo_deleting) - err = -ENXIO; - else - atomic_inc(&lo->lo_refcnt); - mutex_unlock(&lo->lo_mutex); - return err; -} - static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - mutex_lock(&lo->lo_mutex); - if (atomic_dec_return(&lo->lo_refcnt)) - goto out_unlock; + if (disk_openers(disk) > 0) + return; - if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { - if (lo->lo_state != Lo_bound) - goto out_unlock; + mutex_lock(&lo->lo_mutex); + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { lo->lo_state = Lo_rundown; mutex_unlock(&lo->lo_mutex); /* @@ -1755,27 +1739,30 @@ static void lo_release(struct gendisk *disk, fmode_t mode) */ __loop_clr_fd(lo, true); return; - } else if (lo->lo_state == Lo_bound) { - /* - * Otherwise keep thread (if running) and config, - * but flush possible ongoing bios in thread. - */ - blk_mq_freeze_queue(lo->lo_queue); - blk_mq_unfreeze_queue(lo->lo_queue); } - -out_unlock: mutex_unlock(&lo->lo_mutex); } +static void lo_free_disk(struct gendisk *disk) +{ + struct loop_device *lo = disk->private_data; + + if (lo->workqueue) + destroy_workqueue(lo->workqueue); + loop_free_idle_workers(lo, true); + del_timer_sync(&lo->timer); + mutex_destroy(&lo->lo_mutex); + kfree(lo); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, - .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = lo_compat_ioctl, #endif + .free_disk = lo_free_disk, }; /* @@ -1834,12 +1821,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->blkcg_css = NULL; cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (rq->bio && rq->bio->bi_blkg) { - cmd->blkcg_css = &bio_blkcg(rq->bio)->css; + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); #ifdef CONFIG_MEMCG - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } #endif } #endif @@ -1888,11 +1877,6 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_set_timer(struct loop_device *lo) -{ - timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); -} - static void loop_process_work(struct loop_worker *worker, struct list_head *cmd_list, struct loop_device *lo) { @@ -1941,27 +1925,6 @@ static void loop_rootcg_workfn(struct work_struct *work) loop_process_work(NULL, &lo->rootcg_cmd_list, lo); } -static void loop_free_idle_workers(struct timer_list *timer) -{ - struct loop_device *lo = container_of(timer, struct loop_device, timer); - struct loop_worker *pos, *worker; - - spin_lock_irq(&lo->lo_work_lock); - list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, - idle_list) { - if (time_is_after_jiffies(worker->last_ran_at + - LOOP_IDLE_WORKER_TIMEOUT)) - break; - list_del(&worker->idle_list); - rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->blkcg_css); - kfree(worker); - } - if (!list_empty(&lo->idle_worker_list)) - loop_set_timer(lo); - spin_unlock_irq(&lo->lo_work_lock); -} - static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .complete = lo_complete_rq, @@ -1977,6 +1940,9 @@ static int loop_add(int i) lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; + lo->worker_tree = RB_ROOT; + INIT_LIST_HEAD(&lo->idle_worker_list); + timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); lo->lo_state = Lo_unbound; err = mutex_lock_killable(&loop_ctl_mutex); @@ -2046,11 +2012,12 @@ static int loop_add(int i) */ if (!part_shift) disk->flags |= GENHD_FL_NO_PART; - atomic_set(&lo->lo_refcnt, 0); mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); spin_lock_init(&lo->lo_work_lock); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->minors = 1 << part_shift; @@ -2090,15 +2057,14 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_disk(lo->lo_disk); + blk_cleanup_queue(lo->lo_disk->queue); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); - /* There is no route which can find this loop device. */ - mutex_destroy(&lo->lo_mutex); - kfree(lo); + + put_disk(lo->lo_disk); } static void loop_probe(dev_t dev) @@ -2137,13 +2103,12 @@ static int loop_control_remove(int idx) ret = mutex_lock_killable(&lo->lo_mutex); if (ret) goto mark_visible; - if (lo->lo_state != Lo_unbound || - atomic_read(&lo->lo_refcnt) > 0) { + if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) { mutex_unlock(&lo->lo_mutex); ret = -EBUSY; goto mark_visible; } - /* Mark this loop device no longer open()-able. */ + /* Mark this loop device as no more bound, but not quite unbound yet */ lo->lo_state = Lo_deleting; mutex_unlock(&lo->lo_mutex); |