From 34dd82afd27da2537199d7f71f1542501c6f96e7 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 31 Jul 2011 22:08:04 +0200 Subject: loop: replace linked list of allocated devices with an idr index Replace the linked list, that keeps track of allocated devices, with an idr index to allow a more efficient lookup of devices. Cc: Tejun Heo Signed-off-by: Kay Sievers Signed-off-by: Jens Axboe --- include/linux/loop.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/loop.h') diff --git a/include/linux/loop.h b/include/linux/loop.h index 66c194e2d9b9..5f08d18fa148 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -64,7 +64,6 @@ struct loop_device { struct request_queue *lo_queue; struct gendisk *lo_disk; - struct list_head lo_list; }; #endif /* __KERNEL__ */ -- cgit v1.2.3 From 770fe30a46a12b6fb6b63fbe1737654d28e84844 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 31 Jul 2011 22:08:04 +0200 Subject: loop: add management interface for on-demand device allocation Loop devices today have a fixed pre-allocated number of usually 8. The number can only be changed at module init time. To find a free device to use, /dev/loop%i needs to be scanned, and all devices need to be opened until a free one is possibly found. This adds a new /dev/loop-control device node, which allows userspace to dynamically find or allocate a free device, and to add and remove loop devices from the running system: LOOP_CTL_ADD adds a specific device. Arg is the number of the device. It returns the device i or a negative error code. LOOP_CTL_REMOVE removes a specific device. Arg is the number of the device. It returns the device i or a negative error code. LOOP_CTL_GET_FREE finds the next unbound device or allocates a new one. No arg is given. It returns the device i or a negative error code. The loop kernel module gets automatically loaded when /dev/loop-control is accessed the first time. 
The alias specified in the module, instructs udev to create this 'dead' device node, even when the module is not loaded. Example: cfd = open("/dev/loop-control", O_RDWR); # add a new specific loop device err = ioctl(cfd, LOOP_CTL_ADD, devnr); # remove a specific loop device err = ioctl(cfd, LOOP_CTL_REMOVE, devnr); # find or allocate a free loop device to use devnr = ioctl(cfd, LOOP_CTL_GET_FREE); sprintf(loopname, "/dev/loop%i", devnr); ffd = open("backing-file", O_RDWR); lfd = open(loopname, O_RDWR); err = ioctl(lfd, LOOP_SET_FD, ffd); Cc: Tejun Heo Cc: Karel Zak Signed-off-by: Kay Sievers Signed-off-by: Jens Axboe --- drivers/block/loop.c | 120 +++++++++++++++++++++++++++++++++++++++++++-- include/linux/loop.h | 4 ++ include/linux/miscdevice.h | 1 + 3 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include/linux/loop.h') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f58532e77777..5c9edf944879 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,7 +75,7 @@ #include #include #include - +#include #include static DEFINE_IDR(loop_index_idr); @@ -1478,13 +1478,22 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, static int lo_open(struct block_device *bdev, fmode_t mode) { - struct loop_device *lo = bdev->bd_disk->private_data; + struct loop_device *lo; + int err = 0; + + mutex_lock(&loop_index_mutex); + lo = bdev->bd_disk->private_data; + if (!lo) { + err = -ENXIO; + goto out; + } mutex_lock(&lo->lo_ctl_mutex); lo->lo_refcnt++; mutex_unlock(&lo->lo_ctl_mutex); - - return 0; +out: + mutex_unlock(&loop_index_mutex); + return err; } static int lo_release(struct gendisk *disk, fmode_t mode) @@ -1603,6 +1612,13 @@ static int loop_add(struct loop_device **l, int i) idr_remove(&loop_index_idr, m); err = -EEXIST; } + } else if (i == -1) { + int m; + + /* get next free nr */ + err = idr_get_new(&loop_index_idr, lo, &m); + if (err >= 0) + i = m; } else { err = -EINVAL; } @@ -1648,16 +1664,41 @@ static void 
loop_remove(struct loop_device *lo) kfree(lo); } +static int find_free_cb(int id, void *ptr, void *data) +{ + struct loop_device *lo = ptr; + struct loop_device **l = data; + + if (lo->lo_state == Lo_unbound) { + *l = lo; + return 1; + } + return 0; +} + static int loop_lookup(struct loop_device **l, int i) { struct loop_device *lo; int ret = -ENODEV; + if (i < 0) { + int err; + + err = idr_for_each(&loop_index_idr, &find_free_cb, &lo); + if (err == 1) { + *l = lo; + ret = lo->lo_number; + } + goto out; + } + + /* lookup and return a specific i */ lo = idr_find(&loop_index_idr, i); if (lo) { *l = lo; ret = lo->lo_number; } +out: return ret; } @@ -1681,11 +1722,76 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) return kobj; } +static long loop_control_ioctl(struct file *file, unsigned int cmd, + unsigned long parm) +{ + struct loop_device *lo; + int ret = -ENOSYS; + + mutex_lock(&loop_index_mutex); + switch (cmd) { + case LOOP_CTL_ADD: + ret = loop_lookup(&lo, parm); + if (ret >= 0) { + ret = -EEXIST; + break; + } + ret = loop_add(&lo, parm); + break; + case LOOP_CTL_REMOVE: + ret = loop_lookup(&lo, parm); + if (ret < 0) + break; + mutex_lock(&lo->lo_ctl_mutex); + if (lo->lo_state != Lo_unbound) { + ret = -EBUSY; + mutex_unlock(&lo->lo_ctl_mutex); + break; + } + if (lo->lo_refcnt > 0) { + ret = -EBUSY; + mutex_unlock(&lo->lo_ctl_mutex); + break; + } + lo->lo_disk->private_data = NULL; + mutex_unlock(&lo->lo_ctl_mutex); + idr_remove(&loop_index_idr, lo->lo_number); + loop_remove(lo); + break; + case LOOP_CTL_GET_FREE: + ret = loop_lookup(&lo, -1); + if (ret >= 0) + break; + ret = loop_add(&lo, -1); + } + mutex_unlock(&loop_index_mutex); + + return ret; +} + +static const struct file_operations loop_ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = loop_control_ioctl, + .compat_ioctl = loop_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice loop_misc = { + .minor = LOOP_CTRL_MINOR, + .name 
= "loop-control", + .fops = &loop_ctl_fops, +}; + +MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR); +MODULE_ALIAS("devname:loop-control"); + static int __init loop_init(void) { int i, nr; unsigned long range; struct loop_device *lo; + int err; /* * loop module now has a feature to instantiate underlying device @@ -1702,6 +1808,10 @@ static int __init loop_init(void) * device on-demand. */ + err = misc_register(&loop_misc); + if (err < 0) + return err; + part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); @@ -1767,6 +1877,8 @@ static void __exit loop_exit(void) blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); + + misc_deregister(&loop_misc); } module_init(loop_init); diff --git a/include/linux/loop.h b/include/linux/loop.h index 5f08d18fa148..683d69890119 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -160,4 +160,8 @@ int loop_unregister_transfer(int number); #define LOOP_CHANGE_FD 0x4C06 #define LOOP_SET_CAPACITY 0x4C07 +/* /dev/loop-control interface */ +#define LOOP_CTL_ADD 0x4C80 +#define LOOP_CTL_REMOVE 0x4C81 +#define LOOP_CTL_GET_FREE 0x4C82 #endif diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 18fd13028ba1..c309b1ecdc1c 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -40,6 +40,7 @@ #define BTRFS_MINOR 234 #define AUTOFS_MINOR 235 #define MAPPER_CTRL_MINOR 236 +#define LOOP_CTRL_MINOR 237 #define MISC_DYNAMIC_MINOR 255 struct device; -- cgit v1.2.3 From e03c8dd14915fabc101aa495828d58598dc5af98 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 23 Aug 2011 20:12:04 +0200 Subject: loop: always allow userspace partitions and optionally support automatic scanning Automatic partition scanning can be requested individually per loop device during its setup by setting LO_FLAGS_PARTSCAN. By default, no partition tables are scanned. 
Userspace can now always add and remove partitions from all loop devices, regardless if the in-kernel partition scanner is enabled or not. The needed partition minor numbers are allocated from the extended minors space, the main loop device numbers will continue to match the loop minors, regardless of the number of partitions used. # grep . /sys/class/block/loop1/loop/* /sys/block/loop1/loop/autoclear:0 /sys/block/loop1/loop/backing_file:/home/kay/data/stuff/part.img /sys/block/loop1/loop/offset:0 /sys/block/loop1/loop/partscan:1 /sys/block/loop1/loop/sizelimit:0 # ls -l /dev/loop* brw-rw---- 1 root disk 7, 0 Aug 14 20:22 /dev/loop0 brw-rw---- 1 root disk 7, 1 Aug 14 20:23 /dev/loop1 brw-rw---- 1 root disk 259, 0 Aug 14 20:23 /dev/loop1p1 brw-rw---- 1 root disk 259, 1 Aug 14 20:23 /dev/loop1p2 brw-rw---- 1 root disk 7, 99 Aug 14 20:23 /dev/loop99 brw-rw---- 1 root disk 259, 2 Aug 14 20:23 /dev/loop99p1 brw-rw---- 1 root disk 259, 3 Aug 14 20:23 /dev/loop99p2 crw------T 1 root root 10, 237 Aug 14 20:22 /dev/loop-control Cc: Karel Zak Cc: Davidlohr Bueso Acked-By: Tejun Heo Signed-off-by: Kay Sievers Signed-off-by: Jens Axboe --- drivers/block/loop.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- include/linux/loop.h | 1 + 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux/loop.h') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 936cac3c3126..b336433f8157 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -724,7 +724,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_putf; fput(old_file); - if (max_part > 0) + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -808,16 +808,25 @@ static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) return sprintf(buf, "%s\n", autoclear ? 
"1" : "0"); } +static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) +{ + int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); + + return sprintf(buf, "%s\n", partscan ? "1" : "0"); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); +LOOP_ATTR_RO(partscan); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, + &loop_attr_partscan.attr, NULL, }; @@ -979,7 +988,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, } lo->lo_state = Lo_bound; wake_up_process(lo->lo_thread); - if (max_part > 0) + if (part_shift) + lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -1070,7 +1081,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_flags = 0; lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); @@ -1088,8 +1098,11 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (max_part > 0 && bdev) + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) ioctl_by_bdev(bdev, BLKRRPART, 0); + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. 
@@ -1159,6 +1172,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; + if ((info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + } + lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; @@ -1654,6 +1674,27 @@ static struct loop_device *loop_alloc(int i) if (!disk) goto out_free_queue; + /* + * Disable partition scanning by default. The in-kernel partition + * scanning can be requested individually per-device during its + * setup. Userspace can always add and remove partitions from all + * devices. The needed partition minors are allocated from the + * extended minor space, the main loop device numbers will continue + * to match the loop minors, regardless of the number of partitions + * used. + * + * If max_part is given, partition scanning is globally enabled for + * all loop devices. The minors for the main loop devices will be + * multiples of max_part. + * + * Note: Global-for-all-devices, set-only-at-init, read-only module + * parameters like 'max_loop' and 'max_part' make things needlessly + * complicated, are too static, inflexible and may surprise + * userspace tools. Parameters like this in general should be avoided. 
+ */ + if (!part_shift) + disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; lo->lo_thread = NULL; diff --git a/include/linux/loop.h b/include/linux/loop.h index 66c194e2d9b9..4367fc507fe9 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -76,6 +76,7 @@ enum { LO_FLAGS_READ_ONLY = 1, LO_FLAGS_USE_AOPS = 2, LO_FLAGS_AUTOCLEAR = 4, + LO_FLAGS_PARTSCAN = 8, }; #include /* for __kernel_old_dev_t */ -- cgit v1.2.3 From 456be1484ffc72a24bdb4200b5847c4fa90139d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Oct 2011 12:57:20 +0200 Subject: loop: remove the incorrect write_begin/write_end shortcut Currently the loop device tries to call directly into write_begin/write_end instead of going through ->write if it can. This is a fairly nasty shortcut as write_begin and write_end are only callbacks for the generic write code and expect to be called with filesystem specific locks held. This code currently causes various issues for clustered filesystems as it doesn't take the required cluster locks, and it also causes issues for XFS as it doesn't properly lock against the swapext ioctl as called by the defragmentation tools. This in turn causes data corruption if defragmentation hits a busy loop device in the wrong time window, as reported by RH QA. The reason why we have this shortcut is that it saves a data copy when doing a transformation on the loop device, which is the technical term for using cryptoloop (or an XOR transformation). Given that cryptoloop has been deprecated in favour of dm-crypt my opinion is that we should simply drop this shortcut instead of finding complicated ways to introduce a formal interface for this shortcut. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 135 +++++++++------------------------------------------ include/linux/loop.h | 1 - 2 files changed, 23 insertions(+), 113 deletions(-) (limited to 'include/linux/loop.h') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4720c7ade0ae..46cdd6945557 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -202,74 +202,6 @@ lo_do_transfer(struct loop_device *lo, int cmd, return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); } -/** - * do_lo_send_aops - helper for writing data to a loop device - * - * This is the fast version for backing filesystems which implement the address - * space operations write_begin and write_end. - */ -static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, - loff_t pos, struct page *unused) -{ - struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ - struct address_space *mapping = file->f_mapping; - pgoff_t index; - unsigned offset, bv_offs; - int len, ret; - - mutex_lock(&mapping->host->i_mutex); - index = pos >> PAGE_CACHE_SHIFT; - offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); - bv_offs = bvec->bv_offset; - len = bvec->bv_len; - while (len > 0) { - sector_t IV; - unsigned size, copied; - int transfer_result; - struct page *page; - void *fsdata; - - IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); - size = PAGE_CACHE_SIZE - offset; - if (size > len) - size = len; - - ret = pagecache_write_begin(file, mapping, pos, size, 0, - &page, &fsdata); - if (ret) - goto fail; - - file_update_time(file); - - transfer_result = lo_do_transfer(lo, WRITE, page, offset, - bvec->bv_page, bv_offs, size, IV); - copied = size; - if (unlikely(transfer_result)) - copied = 0; - - ret = pagecache_write_end(file, mapping, pos, size, copied, - page, fsdata); - if (ret < 0 || ret != copied) - goto fail; - - if (unlikely(transfer_result)) - goto fail; - - bv_offs += copied; - len -= copied; - offset = 
0; - index++; - pos += copied; - } - ret = 0; -out: - mutex_unlock(&mapping->host->i_mutex); - return ret; -fail: - ret = -1; - goto out; -} - /** * __do_lo_send_write - helper for writing data to a loop device * @@ -297,10 +229,8 @@ static int __do_lo_send_write(struct file *file, /** * do_lo_send_direct_write - helper for writing data to a loop device * - * This is the fast, non-transforming version for backing filesystems which do - * not implement the address space operations write_begin and write_end. - * It uses the write file operation which should be present on all writeable - * filesystems. + * This is the fast, non-transforming version that does not need double + * buffering. */ static int do_lo_send_direct_write(struct loop_device *lo, struct bio_vec *bvec, loff_t pos, struct page *page) @@ -316,15 +246,9 @@ static int do_lo_send_direct_write(struct loop_device *lo, /** * do_lo_send_write - helper for writing data to a loop device * - * This is the slow, transforming version for filesystems which do not - * implement the address space operations write_begin and write_end. It - * uses the write file operation which should be present on all writeable - * filesystems. - * - * Using fops->write is slower than using aops->{prepare,commit}_write in the - * transforming case because we need to double buffer the data as we cannot do - * the transformations in place as we do not have direct access to the - * destination pages of the backing file. + * This is the slow, transforming version that needs to double buffer the + * data as it cannot do the transformations in place without having direct + * access to the destination pages of the backing file. 
*/ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, loff_t pos, struct page *page) @@ -350,17 +274,16 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) struct page *page = NULL; int i, ret = 0; - do_lo_send = do_lo_send_aops; - if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { + if (lo->transfer != transfer_none) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (unlikely(!page)) + goto fail; + kmap(page); + do_lo_send = do_lo_send_write; + } else { do_lo_send = do_lo_send_direct_write; - if (lo->transfer != transfer_none) { - page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); - if (unlikely(!page)) - goto fail; - kmap(page); - do_lo_send = do_lo_send_write; - } } + bio_for_each_segment(bvec, bio, i) { ret = do_lo_send(lo, bvec, pos, page); if (ret < 0) @@ -849,35 +772,23 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, mapping = file->f_mapping; inode = mapping->host; - if (!(file->f_mode & FMODE_WRITE)) - lo_flags |= LO_FLAGS_READ_ONLY; - error = -EINVAL; - if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { - const struct address_space_operations *aops = mapping->a_ops; - - if (aops->write_begin) - lo_flags |= LO_FLAGS_USE_AOPS; - if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) - lo_flags |= LO_FLAGS_READ_ONLY; + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + goto out_putf; - lo_blocksize = S_ISBLK(inode->i_mode) ? - inode->i_bdev->bd_block_size : PAGE_SIZE; + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + !file->f_op->write) + lo_flags |= LO_FLAGS_READ_ONLY; - error = 0; - } else { - goto out_putf; - } + lo_blocksize = S_ISBLK(inode->i_mode) ? 
+ inode->i_bdev->bd_block_size : PAGE_SIZE; + error = -EFBIG; size = get_loop_size(lo, file); - - if ((loff_t)(sector_t)size != size) { - error = -EFBIG; + if ((loff_t)(sector_t)size != size) goto out_putf; - } - if (!(mode & FMODE_WRITE)) - lo_flags |= LO_FLAGS_READ_ONLY; + error = 0; set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); diff --git a/include/linux/loop.h b/include/linux/loop.h index 683d69890119..a06880689115 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -73,7 +73,6 @@ struct loop_device { */ enum { LO_FLAGS_READ_ONLY = 1, - LO_FLAGS_USE_AOPS = 2, LO_FLAGS_AUTOCLEAR = 4, }; -- cgit v1.2.3