summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@osg.samsung.com>2015-02-23 22:02:19 +0300
committerMauro Carvalho Chehab <mchehab@osg.samsung.com>2015-02-23 22:02:19 +0300
commit99a85b901eb54f62ff0c3fd6eb56e60b7b9f15c8 (patch)
tree0c6637b7d2172e079c30e966847326767cbaf45c /drivers/block
parent135f9be9194cf7778eb73594aa55791b229cf27c (diff)
parentc517d838eb7d07bbe9507871fab3931deccff539 (diff)
downloadlinux-99a85b901eb54f62ff0c3fd6eb56e60b7b9f15c8.tar.xz
Merge tag 'v4.0-rc1' into patchwork
Linux 34.0-rc1 * tag 'v4.0-rc1': (8947 commits) Linux 4.0-rc1 autofs4 copy_dev_ioctl(): keep the value of ->size we'd used for allocation procfs: fix race between symlink removals and traversals debugfs: leave freeing a symlink body until inode eviction Documentation/filesystems/Locking: ->get_sb() is long gone trylock_super(): replacement for grab_super_passive() fanotify: Fix up scripted S_ISDIR/S_ISREG/S_ISLNK conversions Cachefiles: Fix up scripted S_ISDIR/S_ISREG/S_ISLNK conversions VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry) SELinux: Use d_is_positive() rather than testing dentry->d_inode Smack: Use d_is_positive() rather than testing dentry->d_inode TOMOYO: Use d_is_dir() rather than d_inode and S_ISDIR() Apparmor: Use d_is_positive/negative() rather than testing dentry->d_inode Apparmor: mediated_filesystem() should use dentry->d_sb not inode->i_sb VFS: Split DCACHE_FILE_TYPE into regular and special types VFS: Add a fallthrough flag for marking virtual dentries VFS: Add a whiteout dentry type VFS: Introduce inode-getting helpers for layered/unioned fs environments kernel: make READ_ONCE() valid on const arguments blk-throttle: check stats_cpu before reading it from sysfs ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/Kconfig13
-rw-r--r--drivers/block/brd.c137
-rw-r--r--drivers/block/drbd/drbd_receiver.c2
-rw-r--r--drivers/block/floppy.c17
-rw-r--r--drivers/block/loop.c416
-rw-r--r--drivers/block/loop.h18
-rw-r--r--drivers/block/null_blk.c2
-rw-r--r--drivers/block/nvme-core.c624
-rw-r--r--drivers/block/nvme-scsi.c96
-rw-r--r--drivers/block/osdblk.c2
-rw-r--r--drivers/block/rbd.c218
-rw-r--r--drivers/block/virtio_blk.c12
-rw-r--r--drivers/block/xen-blkback/blkback.c177
-rw-r--r--drivers/block/xen-blkback/common.h12
-rw-r--r--drivers/block/xen-blkback/xenbus.c4
-rw-r--r--drivers/block/xen-blkfront.c6
-rw-r--r--drivers/block/zram/zram_drv.c227
-rw-r--r--drivers/block/zram/zram_drv.h21
18 files changed, 1139 insertions, 865 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 014a1cfc41c5..1b8094d4d7af 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -393,14 +393,15 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config BLK_DEV_XIP
- bool "Support XIP filesystems on RAM block device"
- depends on BLK_DEV_RAM
+config BLK_DEV_RAM_DAX
+ bool "Support Direct Access (DAX) to RAM block devices"
+ depends on BLK_DEV_RAM && FS_DAX
default n
help
- Support XIP filesystems (such as ext2 with XIP support on) on
- top of block ram device. This will slightly enlarge the kernel, and
- will prevent RAM block device backing store memory from being
+ Support filesystems using DAX to access RAM block devices. This
+ avoids double-buffering data in the page cache before copying it
+ to the block device. Answering Y will slightly enlarge the kernel,
+ and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
config CDROM_PKTCDVD
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3598110d2cef..64ab4951e9d6 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -97,13 +97,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
* Must use NOIO because we don't want to recurse back into the
* block or filesystem layers from page reclaim.
*
- * Cannot support XIP and highmem, because our ->direct_access
- * routine for XIP must return memory that is always addressable.
- * If XIP was reworked to use pfns and kmap throughout, this
+ * Cannot support DAX and highmem, because our ->direct_access
+ * routine for DAX must return memory that is always addressable.
+ * If DAX was reworked to use pfns and kmap throughout, this
* restriction might be able to be lifted.
*/
gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_XIP
+#ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
#endif
page = alloc_page(gfp_flags);
@@ -369,27 +369,29 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
return err;
}
-#ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access(struct block_device *bdev, sector_t sector,
- void **kaddr, unsigned long *pfn)
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+static long brd_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
if (!brd)
return -ENODEV;
- if (sector & (PAGE_SECTORS-1))
- return -EINVAL;
- if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
- return -ERANGE;
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn(page);
- return 0;
+ /*
+ * TODO: If size > PAGE_SIZE, we could look to see if the next page in
+ * the file happens to be mapped to the next page of physical RAM.
+ */
+ return PAGE_SIZE;
}
+#else
+#define brd_direct_access NULL
#endif
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
@@ -430,27 +432,24 @@ static const struct block_device_operations brd_fops = {
.owner = THIS_MODULE,
.rw_page = brd_rw_page,
.ioctl = brd_ioctl,
-#ifdef CONFIG_BLK_DEV_XIP
.direct_access = brd_direct_access,
-#endif
};
/*
* And now the modules code and kernel interface.
*/
-static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-static int max_part;
-static int part_shift;
-static int part_show = 0;
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
module_param(max_part, int, S_IRUGO);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
-module_param(part_show, int, S_IRUGO);
-MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
+
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+ /* This is so fdisk will align partitions on 4k, because of
+ * direct_access API needing 4k alignment, returning a PFN
+ * (This is only a problem on very small devices <= 4M,
+ * otherwise fdisk will align on 1M. Regardless this call
+ * is harmless)
+ */
+ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
- disk = brd->brd_disk = alloc_disk(1 << part_shift);
+ disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
- disk->first_minor = i << part_shift;
+ disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
- if (!part_show)
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
kfree(brd);
}
-static struct brd_device *brd_init_one(int i)
+static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
+ *new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
+ *new = true;
out:
return brd;
}
@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
+ bool new;
mutex_lock(&brd_devices_mutex);
- brd = brd_init_one(MINOR(dev) >> part_shift);
+ brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
- *part = 0;
+ if (new)
+ *part = 0;
+
return kobj;
}
static int __init brd_init(void)
{
- int i, nr;
- unsigned long range;
struct brd_device *brd, *next;
+ int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
- * However, this will not work well with user space tool that doesn't
- * know about such "feature". In order to not break any existing
- * tool, we do the following:
*
- * (1) if rd_nr is specified, create that many upfront, and this
- * also becomes a hard limit.
- * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
- * (default 16) rd device on module load, user can further
- * extend brd device by create dev node themselves and have
- * kernel automatically instantiate actual device on-demand.
+ * (1) if rd_nr is specified, create that many upfront. else
+ * it defaults to CONFIG_BLK_DEV_RAM_COUNT
+ * (2) User can further extend brd devices by create dev node themselves
+ * and have kernel automatically instantiate actual device
+ * on-demand. Example:
+ * mknod /path/devnod_name b 1 X # 1 is the rd major
+ * fdisk -l /path/devnod_name
+ * If (X / max_part) was not already created it will be created
+ * dynamically.
*/
- part_shift = 0;
- if (max_part > 0) {
- part_shift = fls(max_part);
-
- /*
- * Adjust max_part according to part_shift as it is exported
- * to user space so that user can decide correct minor number
- * if [s]he want to create more devices.
- *
- * Note that -1 is required because partition 0 is reserved
- * for the whole disk.
- */
- max_part = (1UL << part_shift) - 1;
- }
-
- if ((1UL << part_shift) > DISK_MAX_PARTS)
- return -EINVAL;
-
- if (rd_nr > 1UL << (MINORBITS - part_shift))
- return -EINVAL;
-
- if (rd_nr) {
- nr = rd_nr;
- range = rd_nr << part_shift;
- } else {
- nr = CONFIG_BLK_DEV_RAM_COUNT;
- range = 1UL << MINORBITS;
- }
-
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
- for (i = 0; i < nr; i++) {
+ if (unlikely(!max_part))
+ max_part = 1;
+
+ for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
@@ -631,10 +616,10 @@ static int __init brd_init(void)
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
- blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+ blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
- printk(KERN_INFO "brd: module loaded\n");
+ pr_info("brd: module loaded\n");
return 0;
out_free:
@@ -644,21 +629,21 @@ out_free:
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+ pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
- unsigned long range;
struct brd_device *brd, *next;
- range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
-
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
- blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+ blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ pr_info("brd: module unloaded\n");
}
module_init(brd_init);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d169b4a79267..cee20354ac37 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
if (blkdev_issue_zeroout(device->ldev->backing_bdev,
- sector, data_size >> 9, GFP_NOIO))
+ sector, data_size >> 9, GFP_NOIO, false))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
return 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ffb08e1..a08cda955285 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static struct attribute *floppy_dev_attrs[] = {
+ &dev_attr_cmos.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
static void floppy_device_release(struct device *dev)
{
}
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
floppy_device[drive].name = floppy_device_name;
floppy_device[drive].id = drive;
floppy_device[drive].dev.release = floppy_device_release;
+ floppy_device[drive].dev.groups = floppy_dev_groups;
err = platform_device_register(&floppy_device[drive]);
if (err)
goto out_remove_drives;
- err = device_create_file(&floppy_device[drive].dev,
- &dev_attr_cmos);
- if (err)
- goto out_unreg_platform_dev;
-
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
return 0;
-out_unreg_platform_dev:
- platform_device_unregister(&floppy_device[drive]);
out_remove_drives:
while (drive--) {
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
}
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
- device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb47c25..d1f168b73634 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex);
static int max_part;
static int part_shift;
+static struct workqueue_struct *loop_wq;
+
/*
* Transfer functions
*/
@@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
return ret;
}
-static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
+static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
{
int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
struct page *page);
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
struct page *page = NULL;
int ret = 0;
@@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
do_lo_send = do_lo_send_direct_write;
}
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
ret = do_lo_send(lo, &bvec, pos, page);
if (ret < 0)
break;
@@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo,
}
static int
-lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
+lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
{
struct bio_vec bvec;
- struct bvec_iter iter;
+ struct req_iterator iter;
ssize_t s;
- bio_for_each_segment(bvec, bio, iter) {
+ rq_for_each_segment(bvec, rq, iter) {
s = do_lo_receive(lo, &bvec, bsize, pos);
if (s < 0)
return s;
if (s != bvec.bv_len) {
- zero_fill_bio(bio);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
break;
}
pos += bvec.bv_len;
@@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
return 0;
}
-static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
- loff_t pos;
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
- pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset;
-
- if (bio_rw(bio) == WRITE) {
- struct file *file = lo->lo_backing_file;
-
- if (bio->bi_rw & REQ_FLUSH) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL)) {
- ret = -EIO;
- goto out;
- }
- }
-
- /*
- * We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do not support discard if
- * encryption is enabled, because it may give an attacker
- * useful information.
- */
- if (bio->bi_rw & REQ_DISCARD) {
- struct file *file = lo->lo_backing_file;
- int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
-
- if ((!file->f_op->fallocate) ||
- lo->lo_encrypt_key_size) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- ret = file->f_op->fallocate(file, mode, pos,
- bio->bi_iter.bi_size);
- if (unlikely(ret && ret != -EINVAL &&
- ret != -EOPNOTSUPP))
- ret = -EIO;
- goto out;
- }
-
- ret = lo_send(lo, bio, pos);
-
- if ((bio->bi_rw & REQ_FUA) && !ret) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL))
- ret = -EIO;
- }
- } else
- ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
+ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
-out:
+ ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
return ret;
}
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
- lo->lo_bio_count++;
- bio_list_add(&lo->lo_bio_list, bio);
-}
+ struct file *file = lo->lo_backing_file;
+ int ret = vfs_fsync(file, 0);
+ if (unlikely(ret && ret != -EINVAL))
+ ret = -EIO;
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
-{
- lo->lo_bio_count--;
- return bio_list_pop(&lo->lo_bio_list);
+ return ret;
}
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
- struct loop_device *lo = q->queuedata;
- int rw = bio_rw(old_bio);
-
- if (rw == READA)
- rw = READ;
+ loff_t pos;
+ int ret;
- BUG_ON(!lo || (rw != READ && rw != WRITE));
+ pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
- spin_lock_irq(&lo->lo_lock);
- if (lo->lo_state != Lo_bound)
- goto out;
- if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
- goto out;
- if (lo->lo_bio_count >= q->nr_congestion_on)
- wait_event_lock_irq(lo->lo_req_wait,
- lo->lo_bio_count < q->nr_congestion_off,
- lo->lo_lock);
- loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
- spin_unlock_irq(&lo->lo_lock);
- return;
+ if (rq->cmd_flags & REQ_WRITE) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ ret = lo_req_flush(lo, rq);
+ else if (rq->cmd_flags & REQ_DISCARD)
+ ret = lo_discard(lo, rq, pos);
+ else
+ ret = lo_send(lo, rq, pos);
+ } else
+ ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
-out:
- spin_unlock_irq(&lo->lo_lock);
- bio_io_error(old_bio);
+ return ret;
}
struct switch_request {
@@ -518,57 +475,26 @@ struct switch_request {
struct completion wait;
};
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
-{
- if (unlikely(!bio->bi_bdev)) {
- do_loop_switch(lo, bio->bi_private);
- bio_put(bio);
- } else {
- int ret = do_bio_filebacked(lo, bio);
- bio_endio(bio, ret);
- }
-}
-
/*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop(). Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests. Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from the BIO completion routine
*/
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
- struct loop_device *lo = data;
- struct bio *bio;
-
- set_user_nice(current, MIN_NICE);
-
- while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
- wait_event_interruptible(lo->lo_event,
- !bio_list_empty(&lo->lo_bio_list) ||
- kthread_should_stop());
-
- if (bio_list_empty(&lo->lo_bio_list))
- continue;
- spin_lock_irq(&lo->lo_lock);
- bio = loop_get_bio(lo);
- if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
- wake_up(&lo->lo_req_wait);
- spin_unlock_irq(&lo->lo_lock);
+ struct file *file = p->file;
+ struct file *old_file = lo->lo_backing_file;
+ struct address_space *mapping;
- BUG_ON(!bio);
- loop_handle_bio(lo, bio);
- }
+ /* if no new file, only flush of queued bios requested */
+ if (!file)
+ return;
- return 0;
+ mapping = file->f_mapping;
+ mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+ lo->lo_backing_file = file;
+ lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
}
/*
@@ -579,15 +505,18 @@ static int loop_thread(void *data)
static int loop_switch(struct loop_device *lo, struct file *file)
{
struct switch_request w;
- struct bio *bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
- init_completion(&w.wait);
+
w.file = file;
- bio->bi_private = &w;
- bio->bi_bdev = NULL;
- loop_make_request(lo->lo_queue, bio);
- wait_for_completion(&w.wait);
+
+ /* freeze queue and wait for completion of scheduled requests */
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ /* do the switch action */
+ do_loop_switch(lo, &w);
+
+ /* unfreeze */
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
return 0;
}
@@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
*/
static int loop_flush(struct loop_device *lo)
{
- /* loop not yet configured, no running thread, nothing to flush */
- if (!lo->lo_thread)
- return 0;
-
return loop_switch(lo, NULL);
}
/*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
-{
- struct file *file = p->file;
- struct file *old_file = lo->lo_backing_file;
- struct address_space *mapping;
-
- /* if no new file, only flush of queued bios requested */
- if (!file)
- goto out;
-
- mapping = file->f_mapping;
- mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
- lo->lo_backing_file = file;
- lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
- mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
- lo->old_gfp_mask = mapping_gfp_mask(mapping);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
- complete(&p->wait);
-}
-
-
-/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
* the original file and in High Availability environments to switch to
@@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->transfer = transfer_none;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
- lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- bio_list_init(&lo->lo_bio_list);
-
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
@@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_blocksize(bdev, lo_blocksize);
- lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
- lo->lo_number);
- if (IS_ERR(lo->lo_thread)) {
- error = PTR_ERR(lo->lo_thread);
- goto out_clr;
- }
lo->lo_state = Lo_bound;
- wake_up_process(lo->lo_thread);
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
bdgrab(bdev);
return 0;
-out_clr:
- loop_sysfs_exit(lo);
- lo->lo_thread = NULL;
- lo->lo_device = NULL;
- lo->lo_backing_file = NULL;
- lo->lo_flags = 0;
- set_capacity(lo->lo_disk, 0);
- invalidate_bdev(bdev);
- bd_set_size(bdev, 0);
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
- lo->lo_state = Lo_unbound;
out_putf:
fput(file);
out:
@@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo)
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_rundown;
- spin_unlock_irq(&lo->lo_lock);
-
- kthread_stop(lo->lo_thread);
-
- spin_lock_irq(&lo->lo_lock);
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
- lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+
+ blk_mq_start_request(bd->rq);
+
+ if (cmd->rq->cmd_flags & REQ_WRITE) {
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ bool need_sched = true;
+
+ spin_lock_irq(&lo->lo_lock);
+ if (lo->write_started)
+ need_sched = false;
+ else
+ lo->write_started = true;
+ list_add_tail(&cmd->list, &lo->write_cmd_head);
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (need_sched)
+ queue_work(loop_wq, &lo->write_work);
+ } else {
+ queue_work(loop_wq, &cmd->read_work);
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+ const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ int ret = -EIO;
+
+ if (lo->lo_state != Lo_bound)
+ goto failed;
+
+ if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+ goto failed;
+
+ ret = do_req_filebacked(lo, cmd->rq);
+
+ failed:
+ if (ret)
+ cmd->rq->errors = -EIO;
+ blk_mq_complete_request(cmd->rq);
+}
+
+static void loop_queue_write_work(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, write_work);
+ LIST_HEAD(cmd_list);
+
+ spin_lock_irq(&lo->lo_lock);
+ repeat:
+ list_splice_init(&lo->write_cmd_head, &cmd_list);
+ spin_unlock_irq(&lo->lo_lock);
+
+ while (!list_empty(&cmd_list)) {
+ struct loop_cmd *cmd = list_first_entry(&cmd_list,
+ struct loop_cmd, list);
+ list_del_init(&cmd->list);
+ loop_handle_cmd(cmd);
+ }
+
+ spin_lock_irq(&lo->lo_lock);
+ if (!list_empty(&lo->write_cmd_head))
+ goto repeat;
+ lo->write_started = false;
+ spin_unlock_irq(&lo->lo_lock);
+}
+
+static void loop_queue_read_work(struct work_struct *work)
+{
+ struct loop_cmd *cmd =
+ container_of(work, struct loop_cmd, read_work);
+
+ loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->rq = rq;
+ INIT_WORK(&cmd->read_work, loop_queue_read_work);
+
+ return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+ .queue_rq = loop_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = loop_init_request,
+};
+
static int loop_add(struct loop_device **l, int i)
{
struct loop_device *lo;
@@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i)
i = err;
err = -ENOMEM;
- lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
- if (!lo->lo_queue)
+ lo->tag_set.ops = &loop_mq_ops;
+ lo->tag_set.nr_hw_queues = 1;
+ lo->tag_set.queue_depth = 128;
+ lo->tag_set.numa_node = NUMA_NO_NODE;
+ lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ lo->tag_set.driver_data = lo;
+
+ err = blk_mq_alloc_tag_set(&lo->tag_set);
+ if (err)
goto out_free_idr;
- /*
- * set queue make_request_fn
- */
- blk_queue_make_request(lo->lo_queue, loop_make_request);
+ lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+ if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ err = PTR_ERR(lo->lo_queue);
+ goto out_cleanup_tags;
+ }
lo->lo_queue->queuedata = lo;
+ INIT_LIST_HEAD(&lo->write_cmd_head);
+ INIT_WORK(&lo->write_work, loop_queue_write_work);
+
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
@@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i)
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
lo->lo_number = i;
- lo->lo_thread = NULL;
- init_waitqueue_head(&lo->lo_event);
- init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
@@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i)
out_free_queue:
blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
idr_remove(&loop_index_idr, i);
out_free_dev:
@@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo)
{
del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
+ blk_mq_free_tag_set(&lo->tag_set);
put_disk(lo->lo_disk);
kfree(lo);
}
@@ -1875,6 +1858,13 @@ static int __init loop_init(void)
goto misc_out;
}
+ loop_wq = alloc_workqueue("kloopd",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
+ if (!loop_wq) {
+ err = -ENOMEM;
+ goto misc_out;
+ }
+
blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
THIS_MODULE, loop_probe, NULL, NULL);
@@ -1912,6 +1902,8 @@ static void __exit loop_exit(void)
blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
unregister_blkdev(LOOP_MAJOR, "loop");
+ destroy_workqueue(loop_wq);
+
misc_deregister(&loop_misc);
}
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6485b6..301c27f8323f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -11,8 +11,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
+#include <linux/workqueue.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
@@ -52,19 +54,23 @@ struct loop_device {
gfp_t old_gfp_mask;
spinlock_t lo_lock;
- struct bio_list lo_bio_list;
- unsigned int lo_bio_count;
+ struct list_head write_cmd_head;
+ struct work_struct write_work;
+ bool write_started;
int lo_state;
struct mutex lo_ctl_mutex;
- struct task_struct *lo_thread;
- wait_queue_head_t lo_event;
- /* wait queue for incoming requests */
- wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
+ struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
};
+struct loop_cmd {
+ struct work_struct read_work;
+ struct request *rq;
+ struct list_head list;
+};
+
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index aa2224aa7caa..65cd61a4145e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -579,7 +579,7 @@ static int null_add_dev(void)
sector_div(size, bs);
set_capacity(disk, size);
- disk->flags |= GENHD_FL_EXT_DEVT;
+ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
disk->first_minor = nullb->index;
disk->fops = &null_fops;
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3e62c8..b64bccbb78c9 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -37,17 +37,18 @@
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/t10-pi.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>
+#define NVME_MINORS (1U << MINORBITS)
#define NVME_Q_DEPTH 1024
#define NVME_AQ_DEPTH 64
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT (admin_timeout * HZ)
#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
-#define IOD_TIMEOUT (retry_time * HZ)
static unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
@@ -57,10 +58,6 @@ unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
-static unsigned char retry_time = 30;
-module_param(retry_time, byte, 0644);
-MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
-
static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
@@ -68,6 +65,9 @@ MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown")
static int nvme_major;
module_param(nvme_major, int, 0);
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
@@ -76,7 +76,8 @@ static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;
-static struct notifier_block nvme_nb;
+
+static struct class *nvme_class;
static void nvme_reset_failed_dev(struct work_struct *ws);
static int nvme_process_cq(struct nvme_queue *nvmeq);
@@ -95,7 +96,6 @@ struct async_cmd_info {
* commands and one for I/O commands).
*/
struct nvme_queue {
- struct llist_node node;
struct device *q_dmadev;
struct nvme_dev *dev;
char irqname[24]; /* nvme4294967295-65535\0 */
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
void *ctx;
int aborted;
struct nvme_queue *nvmeq;
+ struct nvme_iod iod[0];
};
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES 2
+#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed. This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+ unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+ return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+ unsigned int ret = sizeof(struct nvme_cmd_info);
+
+ ret += sizeof(struct nvme_iod);
+ ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+ ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+ return ret;
+}
+
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}
+static void *iod_get_private(struct nvme_iod *iod)
+{
+ return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+ return (iod->private & 0x01) == 0;
+}
+
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
return ((void *)iod) + iod->offset;
}
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+ unsigned nseg, unsigned long private)
{
- unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
- return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+ iod->private = private;
+ iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+ iod->npages = -1;
+ iod->length = nbytes;
+ iod->nents = 0;
}
static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+ unsigned long priv, gfp_t gfp)
{
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
- sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+ sizeof(__le64 *) * nvme_npages(bytes, dev) +
sizeof(struct scatterlist) * nseg, gfp);
- if (iod) {
- iod->offset = offsetof(struct nvme_iod, sg[nseg]);
- iod->npages = -1;
- iod->length = nbytes;
- iod->nents = 0;
- iod->first_dma = 0ULL;
- }
+ if (iod)
+ iod_init(iod, bytes, nseg, priv);
return iod;
}
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+ gfp_t gfp)
+{
+ unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+ sizeof(struct nvme_dsm_range);
+ unsigned long mask = 0;
+ struct nvme_iod *iod;
+
+ if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+ size <= NVME_INT_BYTES(dev)) {
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+ iod = cmd->iod;
+ mask = 0x01;
+ iod_init(iod, size, rq->nr_phys_segments,
+ (unsigned long) rq | 0x01);
+ return iod;
+ }
+
+ return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+ (unsigned long) rq, gfp);
+}
+
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
}
- kfree(iod);
+
+ if (iod_should_kfree(iod))
+ kfree(iod);
}
static int nvme_error_status(u16 status)
@@ -420,11 +482,67 @@ static int nvme_error_status(u16 status)
}
}
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+ if (be32_to_cpu(pi->ref_tag) == v)
+ pi->ref_tag = cpu_to_be32(p);
+}
+
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+ if (be32_to_cpu(pi->ref_tag) == p)
+ pi->ref_tag = cpu_to_be32(v);
+}
+
+/**
+ * nvme_dif_remap - remaps ref tags to bip seed and physical lba
+ *
+ * The virtual start sector is the one that was originally submitted by the
+ * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
+ * start sector may be different. Remap protection information to match the
+ * physical LBA on writes, and back to the original seed on reads.
+ *
+ * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */
+static void nvme_dif_remap(struct request *req,
+ void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+ struct nvme_ns *ns = req->rq_disk->private_data;
+ struct bio_integrity_payload *bip;
+ struct t10_pi_tuple *pi;
+ void *p, *pmap;
+ u32 i, nlb, ts, phys, virt;
+
+ if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
+ return;
+
+ bip = bio_integrity(req->bio);
+ if (!bip)
+ return;
+
+ pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
+ if (!pmap)
+ return;
+
+ p = pmap;
+ virt = bip_get_seed(bip);
+ phys = nvme_block_nr(ns, blk_rq_pos(req));
+ nlb = (blk_rq_bytes(req) >> ns->lba_shift);
+ ts = ns->disk->integrity->tuple_size;
+
+ for (i = 0; i < nlb; i++, virt++, phys++) {
+ pi = (struct t10_pi_tuple *)p;
+ dif_swap(phys, virt, pi);
+ p += ts;
+ }
+ kunmap_atomic(pmap);
+}
+
static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -450,9 +568,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
"completing aborted command with status:%04x\n",
status);
- if (iod->nents)
+ if (iod->nents) {
dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (blk_integrity_rq(req)) {
+ if (!rq_data_dir(req))
+ nvme_dif_remap(req, nvme_dif_complete);
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+ rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ }
+ }
nvme_free_iod(nvmeq->dev, iod);
blk_mq_complete_request(req);
@@ -585,7 +710,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
- struct request *req = iod->private;
+ struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
u16 control = 0;
u32 dsmgmt = 0;
@@ -608,6 +733,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+ if (blk_integrity_rq(req)) {
+ cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE3:
+ control |= NVME_RW_PRINFO_PRCHK_GUARD;
+ break;
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ control |= NVME_RW_PRINFO_PRCHK_GUARD |
+ NVME_RW_PRINFO_PRCHK_REF;
+ cmnd->rw.reftag = cpu_to_le32(
+ nvme_block_nr(ns, blk_rq_pos(req)));
+ break;
+ }
+ } else if (ns->ms)
+ control |= NVME_RW_PRINFO_PRACT;
+
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
@@ -626,17 +769,25 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
- int psegs = req->nr_phys_segments;
enum dma_data_direction dma_dir;
- unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
- sizeof(struct nvme_dsm_range);
- iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+ /*
+ * If formated with metadata, require the block layer provide a buffer
+ * unless this namespace is formated such that the metadata can be
+ * stripped/generated by the controller with PRACT=1.
+ */
+ if (ns->ms && !blk_integrity_rq(req)) {
+ if (!(ns->pi_type && ns->ms == 8)) {
+ req->errors = -EFAULT;
+ blk_mq_complete_request(req);
+ return BLK_MQ_RQ_QUEUE_OK;
+ }
+ }
+
+ iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
- iod->private = req;
-
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
@@ -651,10 +802,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
- } else if (psegs) {
+ } else if (req->nr_phys_segments) {
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
- sg_init_table(iod->sg, psegs);
+ sg_init_table(iod->sg, req->nr_phys_segments);
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto error_cmd;
@@ -668,6 +819,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
iod->nents, dma_dir);
goto retry_cmd;
}
+ if (blk_integrity_rq(req)) {
+ if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+ goto error_cmd;
+
+ sg_init_table(iod->meta_sg, 1);
+ if (blk_rq_map_integrity_sg(
+ req->q, req->bio, iod->meta_sg) != 1)
+ goto error_cmd;
+
+ if (rq_data_dir(req))
+ nvme_dif_remap(req, nvme_dif_prep);
+
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+ goto error_cmd;
+ }
}
nvme_set_info(cmd, iod, req_completion);
@@ -760,14 +926,6 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_WAKE_THREAD;
}
-static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info *
- cmd_info)
-{
- spin_lock_irq(&nvmeq->q_lock);
- cancel_cmd_info(cmd_info, NULL);
- spin_unlock_irq(&nvmeq->q_lock);
-}
-
struct sync_cmd_info {
struct task_struct *task;
u32 result;
@@ -790,7 +948,6 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
u32 *result, unsigned timeout)
{
- int ret;
struct sync_cmd_info cmdinfo;
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = cmd_rq->nvmeq;
@@ -802,29 +959,12 @@ static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
- set_current_state(TASK_KILLABLE);
- ret = nvme_submit_cmd(nvmeq, cmd);
- if (ret) {
- nvme_finish_cmd(nvmeq, req->tag, NULL);
- set_current_state(TASK_RUNNING);
- }
- ret = schedule_timeout(timeout);
-
- /*
- * Ensure that sync_completion has either run, or that it will
- * never run.
- */
- nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req));
-
- /*
- * We never got the completion
- */
- if (cmdinfo.status == -EINTR)
- return -EINTR;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ nvme_submit_cmd(nvmeq, cmd);
+ schedule();
if (result)
*result = cmdinfo.result;
-
return cmdinfo.status;
}
@@ -1101,29 +1241,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = cmd->nvmeq;
- /*
- * The aborted req will be completed on receiving the abort req.
- * We enable the timer again. If hit twice, it'll cause a device reset,
- * as the device then is in a faulty state.
- */
- int ret = BLK_EH_RESET_TIMER;
-
dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
nvmeq->qid);
-
spin_lock_irq(&nvmeq->q_lock);
- if (!nvmeq->dev->initialized) {
- /*
- * Force cancelled command frees the request, which requires we
- * return BLK_EH_NOT_HANDLED.
- */
- nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved);
- ret = BLK_EH_NOT_HANDLED;
- } else
- nvme_abort_req(req);
+ nvme_abort_req(req);
spin_unlock_irq(&nvmeq->q_lock);
- return ret;
+ /*
+ * The aborted req will be completed on receiving the abort req.
+ * We enable the timer again. If hit twice, it'll cause a device reset,
+ * as the device then is in a faulty state.
+ */
+ return BLK_EH_RESET_TIMER;
}
static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1137,21 +1266,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
- LLIST_HEAD(q_list);
- struct nvme_queue *nvmeq, *next;
- struct llist_node *entry;
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
- llist_add(&nvmeq->node, &q_list);
dev->queue_count--;
dev->queues[i] = NULL;
- }
- synchronize_rcu();
- entry = llist_del_all(&q_list);
- llist_for_each_entry_safe(nvmeq, next, entry, node)
nvme_free_queue(nvmeq);
+ }
}
/**
@@ -1183,7 +1305,6 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
spin_lock_irq(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
if (hctx && hctx->tags)
blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
@@ -1206,7 +1327,10 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
}
if (!qid && dev->admin_q)
blk_mq_freeze_queue_start(dev->admin_q);
- nvme_clear_queue(nvmeq);
+
+ spin_lock_irq(&nvmeq->q_lock);
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
@@ -1408,7 +1532,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
- dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1646,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
}
err = -ENOMEM;
- iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+ iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
if (!iod)
goto put_pages;
@@ -1825,13 +1949,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
return 0;
}
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+ u32 logical_block_size = queue_logical_block_size(ns->queue);
+ ns->queue->limits.discard_zeroes_data = 0;
+ ns->queue->limits.discard_alignment = logical_block_size;
+ ns->queue->limits.discard_granularity = logical_block_size;
+ ns->queue->limits.max_discard_sectors = 0xffffffff;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
+static int nvme_noop_verify(struct blk_integrity_iter *iter)
+{
+ return 0;
+}
+
+static int nvme_noop_generate(struct blk_integrity_iter *iter)
+{
+ return 0;
+}
+
+struct blk_integrity nvme_meta_noop = {
+ .name = "NVME_META_NOOP",
+ .generate_fn = nvme_noop_generate,
+ .verify_fn = nvme_noop_verify,
+};
+
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+ struct blk_integrity integrity;
+
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE3:
+ integrity = t10_pi_type3_crc;
+ break;
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ integrity = t10_pi_type1_crc;
+ break;
+ default:
+ integrity = nvme_meta_noop;
+ break;
+ }
+ integrity.tuple_size = ns->ms;
+ blk_integrity_register(ns->disk, &integrity);
+ blk_queue_max_integrity_segments(ns->queue, 1);
+}
+
static int nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_dev *dev = ns->dev;
struct nvme_id_ns *id;
dma_addr_t dma_addr;
- int lbaf;
+ int lbaf, pi_type, old_ms;
+ unsigned short bs;
id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
GFP_KERNEL);
@@ -1840,16 +2012,50 @@ static int nvme_revalidate_disk(struct gendisk *disk)
__func__);
return 0;
}
+ if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
+ dev_warn(&dev->pci_dev->dev,
+ "identify failed ns:%d, setting capacity to 0\n",
+ ns->ns_id);
+ memset(id, 0, sizeof(*id));
+ }
- if (nvme_identify(dev, ns->ns_id, 0, dma_addr))
- goto free;
-
- lbaf = id->flbas & 0xf;
+ old_ms = ns->ms;
+ lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+
+ /*
+ * If identify namespace failed, use default 512 byte block size so
+ * block layer can use before failing read/write for 0 capacity.
+ */
+ if (ns->lba_shift == 0)
+ ns->lba_shift = 9;
+ bs = 1 << ns->lba_shift;
+
+ /* XXX: PI implementation requires metadata equal t10 pi tuple size */
+ pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+ id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+ if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms ||
+ bs != queue_logical_block_size(disk->queue) ||
+ (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT)))
+ blk_integrity_unregister(disk);
+
+ ns->pi_type = pi_type;
+ blk_queue_logical_block_size(ns->queue, bs);
+
+ if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) &&
+ !(id->flbas & NVME_NS_FLBAS_META_EXT))
+ nvme_init_integrity(ns);
+
+ if (id->ncap == 0 || (ns->ms && !disk->integrity))
+ set_capacity(disk, 0);
+ else
+ set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+ if (dev->oncs & NVME_CTRL_ONCS_DSM)
+ nvme_config_discard(ns);
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- free:
dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
return 0;
}
@@ -1873,8 +2079,7 @@ static int nvme_kthread(void *data)
spin_lock(&dev_list_lock);
list_for_each_entry_safe(dev, next, &dev_list, node) {
int i;
- if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
- dev->initialized) {
+ if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
if (work_busy(&dev->reset_work))
continue;
list_del_init(&dev->node);
@@ -1906,30 +2111,16 @@ static int nvme_kthread(void *data)
return 0;
}
-static void nvme_config_discard(struct nvme_ns *ns)
-{
- u32 logical_block_size = queue_logical_block_size(ns->queue);
- ns->queue->limits.discard_zeroes_data = 0;
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
- ns->queue->limits.max_discard_sectors = 0xffffffff;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
-}
-
-static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
- struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
+static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
int node = dev_to_node(&dev->pci_dev->dev);
- int lbaf;
-
- if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
- return NULL;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
- return NULL;
+ return;
+
ns->queue = blk_mq_init_queue(&dev->tagset);
if (IS_ERR(ns->queue))
goto out_free_ns;
@@ -1945,9 +2136,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
ns->ns_id = nsid;
ns->disk = disk;
- lbaf = id->flbas & 0xf;
- ns->lba_shift = id->lbaf[lbaf].ds;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+ list_add_tail(&ns->list, &dev->namespaces);
+
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
if (dev->max_hw_sectors)
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
@@ -1961,21 +2152,26 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
disk->fops = &nvme_fops;
disk->private_data = ns;
disk->queue = ns->queue;
- disk->driverfs_dev = &dev->pci_dev->dev;
+ disk->driverfs_dev = dev->device;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
- set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
- if (dev->oncs & NVME_CTRL_ONCS_DSM)
- nvme_config_discard(ns);
-
- return ns;
+ /*
+ * Initialize capacity to 0 until we establish the namespace format and
+ * setup integrity extentions if necessary. The revalidate_disk after
+ * add_disk allows the driver to register with integrity if the format
+ * requires it.
+ */
+ set_capacity(disk, 0);
+ nvme_revalidate_disk(ns->disk);
+ add_disk(ns->disk);
+ if (ns->ms)
+ revalidate_disk(ns->disk);
+ return;
out_free_queue:
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
- return NULL;
}
static void nvme_create_io_queues(struct nvme_dev *dev)
@@ -2100,22 +2296,20 @@ static int nvme_dev_add(struct nvme_dev *dev)
struct pci_dev *pdev = dev->pci_dev;
int res;
unsigned nn, i;
- struct nvme_ns *ns;
struct nvme_id_ctrl *ctrl;
- struct nvme_id_ns *id_ns;
void *mem;
dma_addr_t dma_addr;
int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
- mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
+ mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
if (!mem)
return -ENOMEM;
res = nvme_identify(dev, 0, 1, dma_addr);
if (res) {
dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
- res = -EIO;
- goto out;
+ dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+ return -EIO;
}
ctrl = mem;
@@ -2141,6 +2335,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
} else
dev->max_hw_sectors = max_hw_sectors;
}
+ dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2148,38 +2343,17 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->tagset))
- goto out;
-
- id_ns = mem;
- for (i = 1; i <= nn; i++) {
- res = nvme_identify(dev, i, 0, dma_addr);
- if (res)
- continue;
-
- if (id_ns->ncap == 0)
- continue;
-
- res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
- dma_addr + 4096, NULL);
- if (res)
- memset(mem + 4096, 0, 4096);
+ return 0;
- ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
- if (ns)
- list_add_tail(&ns->list, &dev->namespaces);
- }
- list_for_each_entry(ns, &dev->namespaces, list)
- add_disk(ns->disk);
- res = 0;
+ for (i = 1; i <= nn; i++)
+ nvme_alloc_ns(dev, i);
- out:
- dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
- return res;
+ return 0;
}
static int nvme_dev_map(struct nvme_dev *dev)
@@ -2308,8 +2482,6 @@ static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
-
- nvme_clear_queue(nvmeq);
nvme_put_dq(dq);
}
@@ -2452,7 +2624,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
int i;
u32 csts = -1;
- dev->initialized = 0;
nvme_dev_list_remove(dev);
if (dev->bar) {
@@ -2463,7 +2634,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
for (i = dev->queue_count - 1; i >= 0; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
nvme_suspend_queue(nvmeq);
- nvme_clear_queue(nvmeq);
}
} else {
nvme_disable_io_queues(dev);
@@ -2471,6 +2641,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
nvme_disable_queue(dev, 0);
}
nvme_dev_unmap(dev);
+
+ for (i = dev->queue_count - 1; i >= 0; i--)
+ nvme_clear_queue(dev->queues[i]);
}
static void nvme_dev_remove(struct nvme_dev *dev)
@@ -2478,8 +2651,11 @@ static void nvme_dev_remove(struct nvme_dev *dev)
struct nvme_ns *ns;
list_for_each_entry(ns, &dev->namespaces, list) {
- if (ns->disk->flags & GENHD_FL_UP)
+ if (ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk->integrity)
+ blk_integrity_unregister(ns->disk);
del_gendisk(ns->disk);
+ }
if (!blk_queue_dying(ns->queue)) {
blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
@@ -2561,6 +2737,7 @@ static void nvme_free_dev(struct kref *kref)
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
pci_dev_put(dev->pci_dev);
+ put_device(dev->device);
nvme_free_namespaces(dev);
nvme_release_instance(dev);
blk_mq_free_tag_set(&dev->tagset);
@@ -2572,11 +2749,27 @@ static void nvme_free_dev(struct kref *kref)
static int nvme_dev_open(struct inode *inode, struct file *f)
{
- struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
- miscdev);
- kref_get(&dev->kref);
- f->private_data = dev;
- return 0;
+ struct nvme_dev *dev;
+ int instance = iminor(inode);
+ int ret = -ENODEV;
+
+ spin_lock(&dev_list_lock);
+ list_for_each_entry(dev, &dev_list, node) {
+ if (dev->instance == instance) {
+ if (!dev->admin_q) {
+ ret = -EWOULDBLOCK;
+ break;
+ }
+ if (!kref_get_unless_zero(&dev->kref))
+ break;
+ f->private_data = dev;
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock(&dev_list_lock);
+
+ return ret;
}
static int nvme_dev_release(struct inode *inode, struct file *f)
@@ -2718,7 +2911,6 @@ static int nvme_dev_resume(struct nvme_dev *dev)
nvme_unfreeze_queues(dev);
nvme_set_irq_hints(dev);
}
- dev->initialized = 1;
return 0;
}
@@ -2749,6 +2941,7 @@ static void nvme_reset_workfn(struct work_struct *work)
dev->reset_workfn(work);
}
+static void nvme_async_probe(struct work_struct *work);
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -2784,37 +2977,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto release;
kref_init(&dev->kref);
- result = nvme_dev_start(dev);
- if (result)
+ dev->device = device_create(nvme_class, &pdev->dev,
+ MKDEV(nvme_char_major, dev->instance),
+ dev, "nvme%d", dev->instance);
+ if (IS_ERR(dev->device)) {
+ result = PTR_ERR(dev->device);
goto release_pools;
+ }
+ get_device(dev->device);
- if (dev->online_queues > 1)
- result = nvme_dev_add(dev);
- if (result)
- goto shutdown;
-
- scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
- dev->miscdev.minor = MISC_DYNAMIC_MINOR;
- dev->miscdev.parent = &pdev->dev;
- dev->miscdev.name = dev->name;
- dev->miscdev.fops = &nvme_dev_fops;
- result = misc_register(&dev->miscdev);
- if (result)
- goto remove;
-
- nvme_set_irq_hints(dev);
-
- dev->initialized = 1;
+ INIT_WORK(&dev->probe_work, nvme_async_probe);
+ schedule_work(&dev->probe_work);
return 0;
- remove:
- nvme_dev_remove(dev);
- nvme_dev_remove_admin(dev);
- nvme_free_namespaces(dev);
- shutdown:
- nvme_dev_shutdown(dev);
release_pools:
- nvme_free_queues(dev, 0);
nvme_release_prp_pools(dev);
release:
nvme_release_instance(dev);
@@ -2827,6 +3003,29 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return result;
}
+static void nvme_async_probe(struct work_struct *work)
+{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
+ int result;
+
+ result = nvme_dev_start(dev);
+ if (result)
+ goto reset;
+
+ if (dev->online_queues > 1)
+ result = nvme_dev_add(dev);
+ if (result)
+ goto reset;
+
+ nvme_set_irq_hints(dev);
+ return;
+ reset:
+ if (!work_busy(&dev->reset_work)) {
+ dev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &dev->reset_work);
+ }
+}
+
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
@@ -2852,11 +3051,12 @@ static void nvme_remove(struct pci_dev *pdev)
spin_unlock(&dev_list_lock);
pci_set_drvdata(pdev, NULL);
+ flush_work(&dev->probe_work);
flush_work(&dev->reset_work);
- misc_deregister(&dev->miscdev);
nvme_dev_shutdown(dev);
nvme_dev_remove(dev);
nvme_dev_remove_admin(dev);
+ device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
nvme_free_queues(dev, 0);
nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);
@@ -2940,11 +3140,26 @@ static int __init nvme_init(void)
else if (result > 0)
nvme_major = result;
+ result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
+ &nvme_dev_fops);
+ if (result < 0)
+ goto unregister_blkdev;
+ else if (result > 0)
+ nvme_char_major = result;
+
+ nvme_class = class_create(THIS_MODULE, "nvme");
+ if (!nvme_class)
+ goto unregister_chrdev;
+
result = pci_register_driver(&nvme_driver);
if (result)
- goto unregister_blkdev;
+ goto destroy_class;
return 0;
+ destroy_class:
+ class_destroy(nvme_class);
+ unregister_chrdev:
+ __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
unregister_blkdev:
unregister_blkdev(nvme_major, "nvme");
kill_workq:
@@ -2955,9 +3170,10 @@ static int __init nvme_init(void)
static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
- unregister_hotcpu_notifier(&nvme_nb);
unregister_blkdev(nvme_major, "nvme");
destroy_workqueue(nvme_workq);
+ class_destroy(nvme_class);
+ __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
_nvme_check_size();
}
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 5e78568026c3..e10196e0182d 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -779,10 +779,8 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
struct nvme_dev *dev = ns->dev;
dma_addr_t dma_addr;
void *mem;
- struct nvme_id_ctrl *id_ctrl;
int res = SNTI_TRANSLATION_SUCCESS;
int nvme_sc;
- u8 ieee[4];
int xfer_len;
__be32 tmp_id = cpu_to_be32(ns->ns_id);
@@ -793,46 +791,60 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
goto out_dma;
}
- /* nvme controller identify */
- nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
- res = nvme_trans_status_code(hdr, nvme_sc);
- if (res)
- goto out_free;
- if (nvme_sc) {
- res = nvme_sc;
- goto out_free;
- }
- id_ctrl = mem;
-
- /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */
- ieee[0] = id_ctrl->ieee[0] << 4;
- ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4;
- ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4;
- ieee[3] = id_ctrl->ieee[2] >> 4;
-
- memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+ memset(inq_response, 0, alloc_len);
inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */
- inq_response[3] = 20; /* Page Length */
- /* Designation Descriptor start */
- inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */
- inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */
- inq_response[6] = 0x00; /* Rsvd */
- inq_response[7] = 16; /* Designator Length */
- /* Designator start */
- inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/
- inq_response[9] = ieee[2]; /* IEEE ID */
- inq_response[10] = ieee[1]; /* IEEE ID */
- inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */
- inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8;
- inq_response[13] = (dev->pci_dev->vendor & 0x00FF);
- inq_response[14] = dev->serial[0];
- inq_response[15] = dev->serial[1];
- inq_response[16] = dev->model[0];
- inq_response[17] = dev->model[1];
- memcpy(&inq_response[18], &tmp_id, sizeof(u32));
- /* Last 2 bytes are zero */
+ if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
+ struct nvme_id_ns *id_ns = mem;
+ void *eui = id_ns->eui64;
+ int len = sizeof(id_ns->eui64);
- xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_free;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_free;
+ }
+
+ if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
+ if (bitmap_empty(eui, len * 8)) {
+ eui = id_ns->nguid;
+ len = sizeof(id_ns->nguid);
+ }
+ }
+ if (bitmap_empty(eui, len * 8))
+ goto scsi_string;
+
+ inq_response[3] = 4 + len; /* Page Length */
+ /* Designation Descriptor start */
+ inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */
+ inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */
+ inq_response[6] = 0x00; /* Rsvd */
+ inq_response[7] = len; /* Designator Length */
+ memcpy(&inq_response[8], eui, len);
+ } else {
+ scsi_string:
+ if (alloc_len < 72) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_free;
+ }
+ inq_response[3] = 0x48; /* Page Length */
+ /* Designation Descriptor start */
+ inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */
+ inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */
+ inq_response[6] = 0x00; /* Rsvd */
+ inq_response[7] = 0x44; /* Designator Length */
+
+ sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor);
+ memcpy(&inq_response[12], dev->model, sizeof(dev->model));
+ sprintf(&inq_response[52], "%04x", tmp_id);
+ memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
+ }
+ xfer_len = alloc_len;
res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
out_free:
@@ -1600,7 +1612,7 @@ static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10,
/* 10 Byte CDB */
*bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) +
parm_list[MODE_SELECT_10_BD_OFFSET + 1];
- *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &&
+ *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &
MODE_SELECT_10_LLBAA_MASK;
} else {
/* 6 Byte CDB */
@@ -2222,7 +2234,7 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr,
page_code = GET_INQ_PAGE_CODE(cmd);
alloc_len = GET_INQ_ALLOC_LENGTH(cmd);
- inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL);
+ inq_response = kmalloc(alloc_len, GFP_KERNEL);
if (inq_response == NULL) {
res = -ENOMEM;
goto out_mem;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 79aa179305b5..e22942596207 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -423,7 +423,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
}
/* switch queue to TCQ mode; allocate tag map */
- rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
+ rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
if (rc) {
blk_cleanup_queue(q);
put_disk(disk);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 3ec85dfce124..b40af3203089 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
+#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
- struct list_head rq_queue; /* incoming rq queue */
spinlock_t lock; /* queue, flags, open_count */
- struct work_struct rq_work;
struct rbd_image_header header;
unsigned long flags; /* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
atomic_t parent_ref;
struct rbd_device *parent;
+ /* Block layer tags. */
+ struct blk_mq_tag_set tag_set;
+
/* protects updating the header */
struct rw_semaphore header_rwsem;
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
/*
* We support a 64-bit length, but ultimately it has to be
- * passed to blk_end_request(), which takes an unsigned int.
+ * passed to the block layer, which just supports a 32-bit
+ * length field.
*/
obj_request->xferred = osd_req->r_reply_op_len[0];
rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2098,32 +2101,26 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
* If an image has a non-zero parent overlap, get a reference to its
* parent.
*
- * We must get the reference before checking for the overlap to
- * coordinate properly with zeroing the parent overlap in
- * rbd_dev_v2_parent_info() when an image gets flattened. We
- * drop it again if there is no overlap.
- *
* Returns true if the rbd device has a parent with a non-zero
* overlap and a reference for it was successfully taken, or
* false otherwise.
*/
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
- int counter;
+ int counter = 0;
if (!rbd_dev->parent_spec)
return false;
- counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
- if (counter > 0 && rbd_dev->parent_overlap)
- return true;
-
- /* Image was flattened, but parent is not yet torn down */
+ down_read(&rbd_dev->header_rwsem);
+ if (rbd_dev->parent_overlap)
+ counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+ up_read(&rbd_dev->header_rwsem);
if (counter < 0)
rbd_warn(rbd_dev, "parent reference overflow");
- return false;
+ return counter > 0;
}
/*
@@ -2281,7 +2278,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
more = obj_request->which < img_request->obj_request_count - 1;
} else {
rbd_assert(img_request->rq != NULL);
- more = blk_end_request(img_request->rq, result, xferred);
+
+ more = blk_update_request(img_request->rq, result, xferred);
+ if (!more)
+ __blk_mq_end_request(img_request->rq, result);
}
return more;
@@ -3310,8 +3310,10 @@ out:
return ret;
}
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
{
+ struct request *rq = blk_mq_rq_from_pdu(work);
+ struct rbd_device *rbd_dev = rq->q->queuedata;
struct rbd_img_request *img_request;
struct ceph_snap_context *snapc = NULL;
u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3320,6 +3322,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
u64 mapping_size;
int result;
+ if (rq->cmd_type != REQ_TYPE_FS) {
+ dout("%s: non-fs request type %d\n", __func__,
+ (int) rq->cmd_type);
+ result = -EIO;
+ goto err;
+ }
+
if (rq->cmd_flags & REQ_DISCARD)
op_type = OBJ_OP_DISCARD;
else if (rq->cmd_flags & REQ_WRITE)
@@ -3365,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
goto err_rq; /* Shouldn't happen */
}
+ blk_mq_start_request(rq);
+
down_read(&rbd_dev->header_rwsem);
mapping_size = rbd_dev->mapping.size;
if (op_type != OBJ_OP_READ) {
@@ -3410,53 +3421,18 @@ err_rq:
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
obj_op_name(op_type), length, offset, result);
ceph_put_snap_context(snapc);
- blk_end_request_all(rq, result);
+err:
+ blk_mq_end_request(rq, result);
}
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct rbd_device *rbd_dev =
- container_of(work, struct rbd_device, rq_work);
- struct request *rq, *next;
- LIST_HEAD(requests);
+ struct request *rq = bd->rq;
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
- spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
- list_splice_init(&rbd_dev->rq_queue, &requests);
- spin_unlock_irq(&rbd_dev->lock);
-
- list_for_each_entry_safe(rq, next, &requests, queuelist) {
- list_del_init(&rq->queuelist);
- rbd_handle_request(rbd_dev, rq);
- }
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule(). Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
- struct rbd_device *rbd_dev = q->queuedata;
- struct request *rq;
- int queued = 0;
-
- rbd_assert(rbd_dev);
-
- while ((rq = blk_fetch_request(q))) {
- /* Ignore any non-FS requests that filter through. */
- if (rq->cmd_type != REQ_TYPE_FS) {
- dout("%s: non-fs request type %d\n", __func__,
- (int) rq->cmd_type);
- __blk_end_request_all(rq, 0);
- continue;
- }
-
- list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
- queued++;
- }
-
- if (queued)
- queue_work(rbd_wq, &rbd_dev->rq_work);
+ queue_work(rbd_wq, work);
+ return BLK_MQ_RQ_QUEUE_OK;
}
/*
@@ -3517,6 +3493,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
del_gendisk(disk);
if (disk->queue)
blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
}
put_disk(disk);
}
@@ -3700,7 +3677,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
ret = rbd_dev_header_info(rbd_dev);
if (ret)
- return ret;
+ goto out;
/*
* If there is a parent, see if it has disappeared due to the
@@ -3709,30 +3686,46 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (rbd_dev->parent) {
ret = rbd_dev_v2_parent_info(rbd_dev);
if (ret)
- return ret;
+ goto out;
}
if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
- if (rbd_dev->mapping.size != rbd_dev->header.image_size)
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ rbd_dev->mapping.size = rbd_dev->header.image_size;
} else {
/* validate mapped snapshot's EXISTS flag */
rbd_exists_validate(rbd_dev);
}
+out:
up_write(&rbd_dev->header_rwsem);
-
- if (mapping_size != rbd_dev->mapping.size)
+ if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ return ret;
+}
+
+static int rbd_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+ INIT_WORK(work, rbd_queue_workfn);
return 0;
}
+static struct blk_mq_ops rbd_mq_ops = {
+ .queue_rq = rbd_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = rbd_init_request,
+};
+
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
struct gendisk *disk;
struct request_queue *q;
u64 segment_size;
+ int err;
/* create gendisk info */
disk = alloc_disk(single_major ?
@@ -3750,10 +3743,25 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
disk->fops = &rbd_bd_ops;
disk->private_data = rbd_dev;
- q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
- if (!q)
+ memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+ rbd_dev->tag_set.ops = &rbd_mq_ops;
+ rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+ rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+ rbd_dev->tag_set.flags =
+ BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ rbd_dev->tag_set.nr_hw_queues = 1;
+ rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+ err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+ if (err)
goto out_disk;
+ q = blk_mq_init_queue(&rbd_dev->tag_set);
+ if (IS_ERR(q)) {
+ err = PTR_ERR(q);
+ goto out_tag_set;
+ }
+
/* We use the default size, but let's be explicit about it. */
blk_queue_physical_block_size(q, SECTOR_SIZE);
@@ -3779,10 +3787,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->disk = disk;
return 0;
+out_tag_set:
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
put_disk(disk);
-
- return -ENOMEM;
+ return err;
}
/*
@@ -4039,8 +4048,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
return NULL;
spin_lock_init(&rbd_dev->lock);
- INIT_LIST_HEAD(&rbd_dev->rq_queue);
- INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
rbd_dev->flags = 0;
atomic_set(&rbd_dev->parent_ref, 0);
INIT_LIST_HEAD(&rbd_dev->node);
@@ -4239,7 +4246,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
*/
if (rbd_dev->parent_overlap) {
rbd_dev->parent_overlap = 0;
- smp_mb();
rbd_dev_parent_put(rbd_dev);
pr_info("%s: clone image has been flattened\n",
rbd_dev->disk->disk_name);
@@ -4281,33 +4287,22 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * We always update the parent overlap. If it's zero we
- * treat it specially.
+ * We always update the parent overlap. If it's zero we issue
+ * a warning, as we will proceed as if there was no parent.
*/
- rbd_dev->parent_overlap = overlap;
- smp_mb();
if (!overlap) {
-
- /* A null parent_spec indicates it's the initial probe */
-
if (parent_spec) {
- /*
- * The overlap has become zero, so the clone
- * must have been resized down to 0 at some
- * point. Treat this the same as a flatten.
- */
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image now standalone\n",
- rbd_dev->disk->disk_name);
+ /* refresh, careful to warn just once */
+ if (rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone now standalone (overlap became 0)");
} else {
- /*
- * For the initial probe, if we find the
- * overlap is zero we just pretend there was
- * no parent image.
- */
- rbd_warn(rbd_dev, "ignoring parent with overlap 0");
+ /* initial probe */
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
}
}
+ rbd_dev->parent_overlap = overlap;
+
out:
ret = 0;
out_err:
@@ -4779,36 +4774,6 @@ static inline size_t next_token(const char **buf)
}
/*
- * Finds the next token in *buf, and if the provided token buffer is
- * big enough, copies the found token into it. The result, if
- * copied, is guaranteed to be terminated with '\0'. Note that *buf
- * must be terminated with '\0' on entry.
- *
- * Returns the length of the token found (not including the '\0').
- * Return value will be 0 if no token is found, and it will be >=
- * token_size if the token would not fit.
- *
- * The *buf pointer will be updated to point beyond the end of the
- * found token. Note that this occurs even if the token buffer is
- * too small to hold it.
- */
-static inline size_t copy_token(const char **buf,
- char *token,
- size_t token_size)
-{
- size_t len;
-
- len = next_token(buf);
- if (len < token_size) {
- memcpy(token, *buf, len);
- *(token + len) = '\0';
- }
- *buf += len;
-
- return len;
-}
-
-/*
* Finds the next token in *buf, dynamically allocates a buffer big
* enough to hold a copy of it, and copies the token into the new
* buffer. The copy is guaranteed to be terminated with '\0'. Note
@@ -5114,10 +5079,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
struct rbd_image_header *header;
- /* Drop parent reference unless it's already been done (or none) */
-
- if (rbd_dev->parent_overlap)
- rbd_dev_parent_put(rbd_dev);
+ rbd_dev_parent_put(rbd_dev);
/* Free dynamic fields from the header, then zero it out */
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cdfbd21e3597..655e570b9b31 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -28,8 +28,7 @@ struct virtio_blk_vq {
char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
-struct virtio_blk
-{
+struct virtio_blk {
struct virtio_device *vdev;
/* The disk structure for the kernel. */
@@ -52,8 +51,7 @@ struct virtio_blk
struct virtio_blk_vq *vqs;
};
-struct virtblk_req
-{
+struct virtblk_req {
struct request *req;
struct virtio_blk_outhdr out_hdr;
struct virtio_scsi_inhdr in_hdr;
@@ -575,6 +573,12 @@ static int virtblk_probe(struct virtio_device *vdev)
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
GFP_KERNEL);
if (err < 0)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 63fc7f06a014..2a04d341e598 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -47,6 +47,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
+#include <xen/grant_table.h>
#include "common.h"
/*
@@ -100,7 +101,7 @@ module_param(log_stats, int, 0644);
#define BLKBACK_INVALID_HANDLE (~0)
-/* Number of free pages to remove on each call to free_xenballooned_pages */
+/* Number of free pages to remove on each call to gnttab_free_pages */
#define NUM_BATCH_FREE_PAGES 10
static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
@@ -111,7 +112,7 @@ static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
if (list_empty(&blkif->free_pages)) {
BUG_ON(blkif->free_pages_num != 0);
spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
- return alloc_xenballooned_pages(1, page, false);
+ return gnttab_alloc_pages(1, page);
}
BUG_ON(blkif->free_pages_num == 0);
page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
@@ -151,14 +152,14 @@ static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
blkif->free_pages_num--;
if (++num_pages == NUM_BATCH_FREE_PAGES) {
spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
- free_xenballooned_pages(num_pages, page);
+ gnttab_free_pages(num_pages, page);
spin_lock_irqsave(&blkif->free_pages_lock, flags);
num_pages = 0;
}
}
spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
if (num_pages != 0)
- free_xenballooned_pages(num_pages, page);
+ gnttab_free_pages(num_pages, page);
}
#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
@@ -262,6 +263,17 @@ static void put_persistent_gnt(struct xen_blkif *blkif,
atomic_dec(&blkif->persistent_gnt_in_use);
}
+static void free_persistent_gnts_unmap_callback(int result,
+ struct gntab_unmap_queue_data *data)
+{
+ struct completion *c = data->data;
+
+ /* BUG_ON used to reproduce existing behaviour,
+ but is this the best way to deal with this? */
+ BUG_ON(result);
+ complete(c);
+}
+
static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
unsigned int num)
{
@@ -269,8 +281,17 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt;
struct rb_node *n;
- int ret = 0;
int segs_to_unmap = 0;
+ struct gntab_unmap_queue_data unmap_data;
+ struct completion unmap_completion;
+
+ init_completion(&unmap_completion);
+
+ unmap_data.data = &unmap_completion;
+ unmap_data.done = &free_persistent_gnts_unmap_callback;
+ unmap_data.pages = pages;
+ unmap_data.unmap_ops = unmap;
+ unmap_data.kunmap_ops = NULL;
foreach_grant_safe(persistent_gnt, n, root, node) {
BUG_ON(persistent_gnt->handle ==
@@ -285,9 +306,11 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
!rb_next(&persistent_gnt->node)) {
- ret = gnttab_unmap_refs(unmap, NULL, pages,
- segs_to_unmap);
- BUG_ON(ret);
+
+ unmap_data.count = segs_to_unmap;
+ gnttab_unmap_refs_async(&unmap_data);
+ wait_for_completion(&unmap_completion);
+
put_free_pages(blkif, pages, segs_to_unmap);
segs_to_unmap = 0;
}
@@ -653,18 +676,14 @@ void xen_blkbk_free_caches(struct xen_blkif *blkif)
shrink_free_pagepool(blkif, 0 /* All */);
}
-/*
- * Unmap the grant references, and also remove the M2P over-rides
- * used in the 'pending_req'.
- */
-static void xen_blkbk_unmap(struct xen_blkif *blkif,
- struct grant_page *pages[],
- int num)
+static unsigned int xen_blkbk_unmap_prepare(
+ struct xen_blkif *blkif,
+ struct grant_page **pages,
+ unsigned int num,
+ struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **unmap_pages)
{
- struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
- int ret;
for (i = 0; i < num; i++) {
if (pages[i]->persistent_gnt != NULL) {
@@ -674,21 +693,95 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
continue;
unmap_pages[invcount] = pages[i]->page;
- gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
+ gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
GNTMAP_host_map, pages[i]->handle);
pages[i]->handle = BLKBACK_INVALID_HANDLE;
- if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
- ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
- invcount);
+ invcount++;
+ }
+
+ return invcount;
+}
+
+static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
+{
+ struct pending_req* pending_req = (struct pending_req*) (data->data);
+ struct xen_blkif *blkif = pending_req->blkif;
+
+ /* BUG_ON used to reproduce existing behaviour,
+ but is this the best way to deal with this? */
+ BUG_ON(result);
+
+ put_free_pages(blkif, data->pages, data->count);
+ make_response(blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ free_req(blkif, pending_req);
+ /*
+ * Make sure the request is freed before releasing blkif,
+ * or there could be a race between free_req and the
+ * cleanup done in xen_blkif_free during shutdown.
+ *
+ * NB: The fact that we might try to wake up pending_free_wq
+ * before drain_complete (in case there's a drain going on)
+ * it's not a problem with our current implementation
+ * because we can assure there's no thread waiting on
+ * pending_free_wq if there's a drain going on, but it has
+ * to be taken into account if the current model is changed.
+ */
+ if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+ complete(&blkif->drain_complete);
+ }
+ xen_blkif_put(blkif);
+}
+
+static void xen_blkbk_unmap_and_respond(struct pending_req *req)
+{
+ struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
+ struct xen_blkif *blkif = req->blkif;
+ struct grant_page **pages = req->segments;
+ unsigned int invcount;
+
+ invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages,
+ req->unmap, req->unmap_pages);
+
+ work->data = req;
+ work->done = xen_blkbk_unmap_and_respond_callback;
+ work->unmap_ops = req->unmap;
+ work->kunmap_ops = NULL;
+ work->pages = req->unmap_pages;
+ work->count = invcount;
+
+ gnttab_unmap_refs_async(&req->gnttab_unmap_data);
+}
+
+
+/*
+ * Unmap the grant references.
+ *
+ * This could accumulate ops up to the batch size to reduce the number
+ * of hypercalls, but since this is only used in error paths there's
+ * no real need.
+ */
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int invcount = 0;
+ int ret;
+
+ while (num) {
+ unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
+ unmap, unmap_pages);
+ if (invcount) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
BUG_ON(ret);
put_free_pages(blkif, unmap_pages, invcount);
- invcount = 0;
}
- }
- if (invcount) {
- ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
- BUG_ON(ret);
- put_free_pages(blkif, unmap_pages, invcount);
+ pages += batch;
+ num -= batch;
}
}
@@ -982,32 +1075,8 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
* the grant references associated with 'request' and provide
* the proper response on the ring.
*/
- if (atomic_dec_and_test(&pending_req->pendcnt)) {
- struct xen_blkif *blkif = pending_req->blkif;
-
- xen_blkbk_unmap(blkif,
- pending_req->segments,
- pending_req->nr_pages);
- make_response(blkif, pending_req->id,
- pending_req->operation, pending_req->status);
- free_req(blkif, pending_req);
- /*
- * Make sure the request is freed before releasing blkif,
- * or there could be a race between free_req and the
- * cleanup done in xen_blkif_free during shutdown.
- *
- * NB: The fact that we might try to wake up pending_free_wq
- * before drain_complete (in case there's a drain going on)
- * it's not a problem with our current implementation
- * because we can assure there's no thread waiting on
- * pending_free_wq if there's a drain going on, but it has
- * to be taken into account if the current model is changed.
- */
- if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
- complete(&blkif->drain_complete);
- }
- xen_blkif_put(blkif);
- }
+ if (atomic_dec_and_test(&pending_req->pendcnt))
+ xen_blkbk_unmap_and_respond(pending_req);
}
/*
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index f65b807e3236..375d28851860 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
@@ -350,6 +359,9 @@ struct pending_req {
struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
struct bio *biolist[MAX_INDIRECT_SEGMENTS];
+ struct gnttab_unmap_grant_ref unmap[MAX_INDIRECT_SEGMENTS];
+ struct page *unmap_pages[MAX_INDIRECT_SEGMENTS];
+ struct gntab_unmap_queue_data gnttab_unmap_data;
};
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489e757d..e3afe97280b1 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
return err;
}
- be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
- strcpy(protocol, "unspecified, assuming native");
+ strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f31608..37779e4c4585 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev,
if (major != XENVBD_MAJOR) {
printk(KERN_INFO
"%s: HVM does not support vbd %d as xen block device\n",
- __FUNCTION__, vdevice);
+ __func__, vdevice);
return -ENODEV;
}
}
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
- blk_put_request(copy[i].request);
+ blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
- __blk_put_request(info->rq, req);
+ __blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index bd8bda386e02..8e233edd7a09 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -53,9 +53,9 @@ static ssize_t name##_show(struct device *d, \
} \
static DEVICE_ATTR_RO(name);
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
{
- return zram->meta != NULL;
+ return zram->disksize;
}
static inline struct zram *dev_to_zram(struct device *dev)
@@ -307,42 +307,67 @@ static inline int valid_io_request(struct zram *zram,
return 1;
}
-static void zram_meta_free(struct zram_meta *meta)
+static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
+ size_t num_pages = disksize >> PAGE_SHIFT;
+ size_t index;
+
+ /* Free all pages that are still in this zram device */
+ for (index = 0; index < num_pages; index++) {
+ unsigned long handle = meta->table[index].handle;
+
+ if (!handle)
+ continue;
+
+ zs_free(meta->mem_pool, handle);
+ }
+
zs_destroy_pool(meta->mem_pool);
vfree(meta->table);
kfree(meta);
}
-static struct zram_meta *zram_meta_alloc(u64 disksize)
+static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
{
size_t num_pages;
+ char pool_name[8];
struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+
if (!meta)
- goto out;
+ return NULL;
num_pages = disksize >> PAGE_SHIFT;
meta->table = vzalloc(num_pages * sizeof(*meta->table));
if (!meta->table) {
pr_err("Error allocating zram address table\n");
- goto free_meta;
+ goto out_error;
}
- meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+ snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
+ meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
- goto free_table;
+ goto out_error;
}
return meta;
-free_table:
+out_error:
vfree(meta->table);
-free_meta:
kfree(meta);
- meta = NULL;
-out:
- return meta;
+ return NULL;
+}
+
+static inline bool zram_meta_get(struct zram *zram)
+{
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
+}
+
+static inline void zram_meta_put(struct zram *zram)
+{
+ atomic_dec(&zram->refcount);
}
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
@@ -704,10 +729,11 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
}
-static void zram_reset_device(struct zram *zram, bool reset_capacity)
+static void zram_reset_device(struct zram *zram)
{
- size_t index;
struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
down_write(&zram->init_lock);
@@ -719,36 +745,30 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
}
meta = zram->meta;
- /* Free all pages that are still in this zram device */
- for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
- unsigned long handle = meta->table[index].handle;
- if (!handle)
- continue;
-
- zs_free(meta->mem_pool, handle);
- }
-
- zcomp_destroy(zram->comp);
- zram->max_comp_streams = 1;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
- zram_meta_free(zram->meta);
- zram->meta = NULL;
/* Reset stats */
memset(&zram->stats, 0, sizeof(zram->stats));
-
zram->disksize = 0;
- if (reset_capacity)
- set_capacity(zram->disk, 0);
+ zram->max_comp_streams = 1;
+ set_capacity(zram->disk, 0);
up_write(&zram->init_lock);
-
- /*
- * Revalidate disk out of the init_lock to avoid lockdep splat.
- * It's okay because disk's capacity is protected by init_lock
- * so that revalidate_disk always sees up-to-date capacity.
- */
- if (reset_capacity)
- revalidate_disk(zram->disk);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
}
static ssize_t disksize_store(struct device *dev,
@@ -765,7 +785,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
if (!meta)
return -ENOMEM;
@@ -784,6 +804,8 @@ static ssize_t disksize_store(struct device *dev,
goto out_destroy_comp;
}
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
@@ -803,7 +825,7 @@ out_destroy_comp:
up_write(&zram->init_lock);
zcomp_destroy(comp);
out_free_meta:
- zram_meta_free(meta);
+ zram_meta_free(meta, disksize);
return err;
}
@@ -821,8 +843,9 @@ static ssize_t reset_store(struct device *dev,
if (!bdev)
return -ENOMEM;
+ mutex_lock(&bdev->bd_mutex);
/* Do not reset an active device! */
- if (bdev->bd_holders) {
+ if (bdev->bd_openers) {
ret = -EBUSY;
goto out;
}
@@ -838,12 +861,16 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all pending I/O is finished */
fsync_bdev(bdev);
+ zram_reset_device(zram);
+
+ mutex_unlock(&bdev->bd_mutex);
+ revalidate_disk(zram->disk);
bdput(bdev);
- zram_reset_device(zram, true);
return len;
out:
+ mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
}
@@ -909,23 +936,21 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram)))
+ if (unlikely(!zram_meta_get(zram)))
goto error;
if (!valid_io_request(zram, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto error;
+ goto put_zram;
}
__zram_make_request(zram, bio);
- up_read(&zram->init_lock);
-
+ zram_meta_put(zram);
return;
-
+put_zram:
+ zram_meta_put(zram);
error:
- up_read(&zram->init_lock);
bio_io_error(bio);
}
@@ -947,21 +972,19 @@ static void zram_slot_free_notify(struct block_device *bdev,
static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, int rw)
{
- int offset, err;
+ int offset, err = -EIO;
u32 index;
struct zram *zram;
struct bio_vec bv;
zram = bdev->bd_disk->private_data;
+ if (unlikely(!zram_meta_get(zram)))
+ goto out;
+
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
atomic64_inc(&zram->stats.invalid_io);
- return -EINVAL;
- }
-
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram))) {
- err = -EIO;
- goto out_unlock;
+ err = -EINVAL;
+ goto put_zram;
}
index = sector >> SECTORS_PER_PAGE_SHIFT;
@@ -972,8 +995,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_offset = 0;
err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
- up_read(&zram->init_lock);
+put_zram:
+ zram_meta_put(zram);
+out:
/*
* If I/O fails, just return error(ie, non-zero) without
* calling page_endio.
@@ -1039,19 +1063,19 @@ static struct attribute_group zram_disk_attr_group = {
static int create_device(struct zram *zram, int device_id)
{
+ struct request_queue *queue;
int ret = -ENOMEM;
init_rwsem(&zram->init_lock);
- zram->queue = blk_alloc_queue(GFP_KERNEL);
- if (!zram->queue) {
+ queue = blk_alloc_queue(GFP_KERNEL);
+ if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
goto out;
}
- blk_queue_make_request(zram->queue, zram_make_request);
- zram->queue->queuedata = zram;
+ blk_queue_make_request(queue, zram_make_request);
/* gendisk structure */
zram->disk = alloc_disk(1);
@@ -1064,7 +1088,8 @@ static int create_device(struct zram *zram, int device_id)
zram->disk->major = zram_major;
zram->disk->first_minor = device_id;
zram->disk->fops = &zram_devops;
- zram->disk->queue = zram->queue;
+ zram->disk->queue = queue;
+ zram->disk->queue->queuedata = zram;
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
@@ -1115,20 +1140,35 @@ out_free_disk:
del_gendisk(zram->disk);
put_disk(zram->disk);
out_free_queue:
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(queue);
out:
return ret;
}
-static void destroy_device(struct zram *zram)
+static void destroy_devices(unsigned int nr)
{
- sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
- &zram_disk_attr_group);
+ struct zram *zram;
+ unsigned int i;
- del_gendisk(zram->disk);
- put_disk(zram->disk);
+ for (i = 0; i < nr; i++) {
+ zram = &zram_devices[i];
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ zram_reset_device(zram);
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(zram->disk->queue);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ }
+
+ kfree(zram_devices);
+ unregister_blkdev(zram_major, "zram");
+ pr_info("Destroyed %u device(s)\n", nr);
}
static int __init zram_init(void)
@@ -1138,64 +1178,39 @@ static int __init zram_init(void)
if (num_devices > max_num_devices) {
pr_warn("Invalid value for num_devices: %u\n",
num_devices);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_warn("Unable to get major number\n");
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* Allocate the device array and initialize each one */
zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
if (!zram_devices) {
- ret = -ENOMEM;
- goto unregister;
+ unregister_blkdev(zram_major, "zram");
+ return -ENOMEM;
}
for (dev_id = 0; dev_id < num_devices; dev_id++) {
ret = create_device(&zram_devices[dev_id], dev_id);
if (ret)
- goto free_devices;
+ goto out_error;
}
- pr_info("Created %u device(s) ...\n", num_devices);
-
+ pr_info("Created %u device(s)\n", num_devices);
return 0;
-free_devices:
- while (dev_id)
- destroy_device(&zram_devices[--dev_id]);
- kfree(zram_devices);
-unregister:
- unregister_blkdev(zram_major, "zram");
-out:
+out_error:
+ destroy_devices(dev_id);
return ret;
}
static void __exit zram_exit(void)
{
- int i;
- struct zram *zram;
-
- for (i = 0; i < num_devices; i++) {
- zram = &zram_devices[i];
-
- destroy_device(zram);
- /*
- * Shouldn't access zram->disk after destroy_device
- * because destroy_device already released zram->disk.
- */
- zram_reset_device(zram, false);
- }
-
- unregister_blkdev(zram_major, "zram");
-
- kfree(zram_devices);
- pr_debug("Cleanup done!\n");
+ destroy_devices(num_devices);
}
module_init(zram_init);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816b09ac..17056e589146 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -100,24 +100,25 @@ struct zram_meta {
struct zram {
struct zram_meta *meta;
- struct request_queue *queue;
- struct gendisk *disk;
struct zcomp *comp;
-
- /* Prevent concurrent execution of device init, reset and R/W request */
+ struct gendisk *disk;
+ /* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
/*
- * This is the limit on amount of *uncompressed* worth of data
- * we can store in a disk.
+ * the number of pages zram can consume for storing compressed data
*/
- u64 disksize; /* bytes */
+ unsigned long limit_pages;
int max_comp_streams;
+
struct zram_stats stats;
+ atomic_t refcount; /* refcount for zram_meta */
+ /* wait all IO under all of cpu are done */
+ wait_queue_head_t io_done;
/*
- * the number of pages zram can consume for storing compressed data
+ * This is the limit on amount of *uncompressed* worth of data
+ * we can store in a disk.
*/
- unsigned long limit_pages;
-
+ u64 disksize; /* bytes */
char compressor[10];
};
#endif