Diffstat (limited to 'block')
53 files changed, 9521 insertions, 2185 deletions
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..a2a92e57a87d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,8 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP + select SRCU help Provide block layer support for the kernel. @@ -47,9 +49,13 @@ config LBDAF If unsure, say Y. +config BLK_SCSI_REQUEST + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y + select BLK_SCSI_REQUEST help Saying Y here will enable generic SG (SCSI generic) v4 support for any block device. @@ -69,6 +75,7 @@ config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" default n select BLK_DEV_BSG + select BLK_SCSI_REQUEST help Subsystems will normally enable this if needed. Users will not normally need to manually enable this. @@ -88,6 +95,14 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_ZONED + bool "Zoned block device support" + ---help--- + Block layer zoned block device support. This option enables + support for ZAC/ZBC host-managed and host-aware zoned block devices. + + Say yes here if you have a ZAC or ZBC storage device. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y @@ -111,6 +126,51 @@ config BLK_CMDLINE_PARSER See Documentation/block/cmdline-partition.txt for more information. +config BLK_WBT + bool "Enable support for block device writeback throttling" + default n + ---help--- + Enabling this option enables the block layer to throttle buffered + background writeback from the VM, making it more smooth and having + less impact on foreground operations. The throttling is done + dynamically on an algorithm loosely based on CoDel, factoring in + the realtime performance of the disk. + +config BLK_WBT_SQ + bool "Single queue writeback throttling" + default n + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on legacy single queue devices + +config BLK_WBT_MQ + bool "Multiqueue writeback throttling" + default y + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on multiqueue devices. + Multiqueue currently doesn't have support for IO scheduling, + enabling this option is recommended. + +config BLK_DEBUG_FS + bool "Block layer debugging information in debugfs" + default y + depends on DEBUG_FS + ---help--- + Include block layer debugging information in debugfs. This information + is mostly useful for kernel developers, but it doesn't incur any cost + at runtime. + + Unless you are building a kernel for a tiny system, you should + say Y here. + +config BLK_SED_OPAL + bool "Logic for interfacing with Opal enabled SEDs" + ---help--- + Builds Logic for interfacing with Opal enabled controllers. + Enabling this option enables users to setup/unlock/lock + Locking ranges for SED devices using the Opal protocol. + menu "Partition Types" source "block/partitions/Kconfig" @@ -124,4 +184,9 @@ config BLOCK_COMPAT depends on BLOCK && COMPAT default y +config BLK_MQ_PCI + bool + depends on BLOCK && PCI + default y + source block/Kconfig.iosched diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 421bef9c4c48..0715ce93daef 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -63,6 +63,56 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + ---help--- + MQ version of the deadline IO scheduler. 
+ +config MQ_IOSCHED_NONE + bool + default y + +choice + prompt "Default single-queue blk-mq I/O scheduler" + default DEFAULT_SQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with a single queue. + + config DEFAULT_SQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_SQ_NONE + bool "None" + +endchoice + +config DEFAULT_SQ_IOSCHED + string + default "mq-deadline" if DEFAULT_SQ_DEADLINE + default "none" if DEFAULT_SQ_NONE + +choice + prompt "Default multi-queue blk-mq I/O scheduler" + default DEFAULT_MQ_NONE + help + Select the I/O scheduler which will be used by default for blk-mq + managed block devices with multiple queues. + + config DEFAULT_MQ_DEADLINE + bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y + + config DEFAULT_MQ_NONE + bool "None" + +endchoice + +config DEFAULT_MQ_IOSCHED + string + default "mq-deadline" if DEFAULT_MQ_DEADLINE + default "none" if DEFAULT_MQ_NONE + endmenu endif diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..2ad7c304e3f5 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,12 +5,13 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ - blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ + genhd.o partition-generic.o ioprio.o \ badblocks.o partitions/ -obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -18,8 +19,13 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o - +obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o +obj-$(CONFIG_BLK_WBT) += blk-wbt.o +obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/badblocks.c b/block/badblocks.c index 7be53cb1cc3c..6ebcef282314 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -133,6 +133,26 @@ retry: } EXPORT_SYMBOL_GPL(badblocks_check); +static void badblocks_update_acked(struct badblocks *bb) +{ + u64 *p = bb->page; + int i; + bool unacked = false; + + if (!bb->unacked_exist) + return; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + unacked = true; + break; + } + } + + if (!unacked) + bb->unacked_exist = 0; +} + /** * badblocks_set() - Add a range of bad blocks to the table. * @bb: the badblocks structure that holds all badblock information @@ -294,6 +314,8 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, bb->changed = 1; if (!acknowledged) bb->unacked_exist = 1; + else + badblocks_update_acked(bb); write_sequnlock_irqrestore(&bb->lock, flags); return rv; @@ -354,7 +376,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * current range. 
Earlier ranges could also overlap, * but only this one can overlap the end of the range. */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && + (BB_OFFSET(p[lo]) < target)) { /* Partial overlap, leave the tail of this range */ int ack = BB_ACK(p[lo]); sector_t a = BB_OFFSET(p[lo]); @@ -377,7 +400,8 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) lo--; } while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && + (BB_OFFSET(p[lo]) < target)) { /* This range does overlap */ if (BB_OFFSET(p[lo]) < s) { /* Keep the early parts of this range. */ @@ -399,6 +423,7 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) } } + badblocks_update_acked(bb); bb->changed = 1; out: write_sequnlock_irq(&bb->lock); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 63f72f00c72e..5384713d48bc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,7 +172,7 @@ bool bio_integrity_enabled(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - if (!bio_is_rw(bio)) + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return false; /* Already protected? */ diff --git a/block/bio.c b/block/bio.c index aa7354088008..4b564d0c3e29 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,11 +270,15 @@ static void bio_free(struct bio *bio) } } -void bio_init(struct bio *bio) +void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs) { memset(bio, 0, sizeof(*bio)); atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + + bio->bi_io_vec = table; + bio->bi_max_vecs = max_vecs; } EXPORT_SYMBOL(bio_init); @@ -480,7 +484,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) return NULL; bio = p + front_pad; - bio_init(bio); + bio_init(bio, NULL, 0); if (nr_iovecs > inline_vecs) { unsigned long idx = 0; @@ -670,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; @@ -847,6 +852,55 @@ done: } EXPORT_SYMBOL(bio_add_page); +/** + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins as many pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + size_t offset, diff; + ssize_t size; + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; + + /* + * Deep magic below: We need to walk the pinned pages backwards + * because we are abusing the space allocated for the bio_vecs + * for the page array. Because the bio_vecs are larger than the + * page pointers by definition this will always work. But it also + * means we can't use bio_add_page, so any changes to it's semantics + * need to be reflected here as well. 
+ */ + bio->bi_iter.bi_size += size; + bio->bi_vcnt += nr_pages; + + diff = (nr_pages * PAGE_SIZE - offset) - size; + while (nr_pages--) { + bv[nr_pages].bv_page = pages[nr_pages]; + bv[nr_pages].bv_len = PAGE_SIZE; + bv[nr_pages].bv_offset = 0; + } + + bv[0].bv_offset += offset; + bv[0].bv_len -= offset; + if (diff) + bv[bio->bi_vcnt - 1].bv_len -= diff; + + iov_iter_advance(iter, size); + return 0; +} +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + struct submit_bio_ret { struct completion event; int error; @@ -1068,7 +1122,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1130,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio @@ -1172,9 +1227,6 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (!bio) goto out_bmd; - if (iter->type & WRITE) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = 0; if (map_data) { @@ -1274,7 +1326,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); @@ -1339,16 +1391,10 @@ struct bio *bio_map_user_iov(struct request_queue *q, kfree(pages); - /* - * set data direction, and check if mapped pages need bouncing - */ - if (iter->type & WRITE) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio_set_flag(bio, BIO_USER_MAPPED); /* - * subtle -- if __bio_map_user() ended up bouncing a bio, + * subtle -- if bio_map_user_iov() ended up bouncing a bio, * it would normally disappear when its bi_end_io is run. * however, we need it for the unmap, so grab an extra * reference to it @@ -1390,8 +1436,8 @@ static void __bio_unmap_user(struct bio *bio) * bio_unmap_user - unmap a bio * @bio: the bio being unmapped * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. * * bio_unmap_user() may sleep. */ @@ -1535,7 +1581,6 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, bio->bi_private = data; } else { bio->bi_end_io = bio_copy_kern_endio; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } return bio; @@ -1785,15 +1830,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands. 
- */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dd38e5ced4a3..295e98c2c8cc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -184,8 +184,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_NOWAIT); + wb_congested = wb_congested_get_create(q->backing_dev_info, + blkcg->css.id, + GFP_NOWAIT | __GFP_NOWARN); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -468,8 +469,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info.dev) - return dev_name(blkg->q->backing_dev_info.dev); + if (blkg->q->backing_dev_info->dev) + return dev_name(blkg->q->backing_dev_info->dev); return NULL; } EXPORT_SYMBOL_GPL(blkg_dev_name); @@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1078,10 +1079,8 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - if (IS_ERR(blkg)) { - blkg_free(new_blkg); + if (IS_ERR(blkg)) return PTR_ERR(blkg); - } q->root_blkg = blkg; q->root_rl.blkg = blkg; @@ -1222,7 +1221,10 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1240,7 +1242,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1260,7 +1262,10 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: - blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1283,7 +1288,11 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - blk_queue_bypass_start(q); + if (q->mq_ops) + blk_mq_freeze_queue(q); + else + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1303,7 +1312,11 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(q->queue_lock); - blk_queue_bypass_end(q); + + if (q->mq_ops) + blk_mq_unfreeze_queue(q); + else + blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); @@ -1340,10 +1353,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - mutex_unlock(&blkcg_pol_mutex); + 
if (!cpd) goto err_free_cpds; - } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c..b9e857f4afe8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,12 +33,19 @@ #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> +#include <linux/debugfs.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-wbt.h" + +#ifdef CONFIG_DEBUG_FS +struct dentry *blk_debugfs_root; +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -73,7 +80,7 @@ static void blk_clear_congested(struct request_list *rl, int sync) * flip its congestion state for events on other blkcgs. */ if (rl == &rl->q->root_rl) - clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -84,7 +91,7 @@ static void blk_set_congested(struct request_list *rl, int sync) #else /* see blk_clear_congested() */ if (rl == &rl->q->root_rl) - set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -103,22 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; } -/** - * blk_get_backing_dev_info - get the address of a queue's backing_dev_info - * @bdev: device - * - * Locates the passed device's request queue and returns the address of its - * backing_dev_info. This function can only be called if @bdev is opened - * and the return value is never NULL. - */ -struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - return &q->backing_dev_info; -} -EXPORT_SYMBOL(blk_get_backing_dev_info); - void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); @@ -130,9 +121,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->cmd = rq->__cmd; - rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; + rq->internal_tag = -1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -145,22 +135,20 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (error) bio->bi_error = error; - if (unlikely(rq->cmd_flags & REQ_QUIET)) + if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { - int bit; - - printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->rq_disk ? 
rq->rq_disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", @@ -168,13 +156,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - printk(KERN_INFO " cdb: "); - for (bit = 0; bit < BLK_MAX_CDB; bit++) - printk("%02x ", rq->cmd[bit]); - printk("\n"); - } } EXPORT_SYMBOL(blk_dump_rq_flags); @@ -288,7 +269,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { @@ -524,12 +505,14 @@ void blk_set_queue_dying(struct request_queue *q) else { struct request_list *rl; + spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up(&rl->wait[BLK_RW_SYNC]); wake_up(&rl->wait[BLK_RW_ASYNC]); } } + spin_unlock_irq(q->queue_lock); } } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -583,7 +566,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_flush_integrity(); /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) @@ -595,7 +578,8 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(&q->backing_dev_info); + bdi_unregister(q->backing_dev_info); + put_disk_devt(q->disk_devt); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -603,17 +587,41 @@ void blk_cleanup_queue(struct request_queue *q) EXPORT_SYMBOL(blk_cleanup_queue); /* Allocate memory local to the request queue */ -static void *alloc_request_struct(gfp_t gfp_mask, void *data) +static void *alloc_request_simple(gfp_t gfp_mask, void *data) { - int nid = (int)(long)data; - return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); + struct request_queue *q = data; + + return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); } -static void free_request_struct(void *element, void *unused) +static void free_request_simple(void *element, void *data) { kmem_cache_free(request_cachep, element); } +static void *alloc_request_size(gfp_t gfp_mask, void *data) +{ + struct request_queue *q = data; + struct request *rq; + + rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, + q->node); + if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { + kfree(rq); + rq = NULL; + } + return rq; +} + +static void free_request_size(void *element, void *data) +{ + struct request_queue *q = data; + + if (q->exit_rq_fn) + q->exit_rq_fn(q, element); + kfree(element); +} + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { @@ -626,10 +634,15 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct, - free_request_struct, - (void *)(long)q->node, gfp_mask, - q->node); + if (q->cmd_size) { + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + alloc_request_size, free_request_size, + q, gfp_mask, q->node); + } else { + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + alloc_request_simple, free_request_simple, + q, gfp_mask, q->node); + } if 
(!rl->rq_pool) return -ENOMEM; @@ -692,7 +705,6 @@ static void blk_rq_timed_out_timer(unsigned long data) struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; - int err; q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); @@ -707,17 +719,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->bio_split) goto fail_id; - q->backing_dev_info.ra_pages = + q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + if (!q->backing_dev_info) + goto fail_split; + + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; - q->backing_dev_info.name = "block"; + q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; + q->backing_dev_info->name = "block"; q->node = node_id; - err = bdi_init(&q->backing_dev_info); - if (err) - goto fail_split; - - setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->queue_head); @@ -767,7 +779,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: - bdi_destroy(&q->backing_dev_info); + bdi_put(q->backing_dev_info); fail_split: bioset_free(q->bio_split); fail_id: @@ -820,15 +832,19 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *uninit_q, *q; + struct request_queue *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); - if (!uninit_q) + q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!q) return NULL; - q = blk_init_allocated_queue(uninit_q, rfn, lock); - if (!q) - blk_cleanup_queue(uninit_q); + q->request_fn = rfn; + if (lock) + q->queue_lock = lock; + if (blk_init_allocated_queue(q) < 0) { + blk_cleanup_queue(q); + return NULL; + } return q; } @@ -836,30 +852,22 @@ EXPORT_SYMBOL(blk_init_queue_node); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); -struct request_queue * -blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, - spinlock_t *lock) -{ - if (!q) - return NULL; - q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); +int blk_init_allocated_queue(struct request_queue *q) +{ + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size); if (!q->fq) - return NULL; + return -ENOMEM; + + if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) + goto out_free_flush_queue; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) - goto fail; + goto out_exit_flush_rq; INIT_WORK(&q->timeout_work, blk_timeout_work); - q->request_fn = rfn; - q->prep_rq_fn = NULL; - q->unprep_rq_fn = NULL; q->queue_flags |= QUEUE_FLAG_DEFAULT; - /* Override internal queue lock with supplied lock pointer */ - if (lock) - q->queue_lock = lock; - /* * This also sets hw/phys segments, boundary and size */ @@ -873,16 +881,19 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, /* init elevator */ if (elevator_init(q, NULL)) { mutex_unlock(&q->sysfs_lock); - goto fail; + goto out_exit_flush_rq; } mutex_unlock(&q->sysfs_lock); + return 0; - return q; - -fail: +out_exit_flush_rq: + if (q->exit_rq_fn) + q->exit_rq_fn(q, q->fq->flush_rq); +out_free_flush_queue: blk_free_flush_queue(q->fq); - return NULL; + wbt_exit(q); + return -ENOMEM; } 
EXPORT_SYMBOL(blk_init_allocated_queue); @@ -899,7 +910,7 @@ EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) { - if (rq->cmd_flags & REQ_ELVPRIV) { + if (rq->rq_flags & RQF_ELVPRIV) { elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); @@ -961,14 +972,14 @@ static void __freed_request(struct request_list *rl, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_list *rl, int op, unsigned int flags) +static void freed_request(struct request_list *rl, bool sync, + req_flags_t rq_flags) { struct request_queue *q = rl->q; - int sync = rw_is_sync(op, flags); q->nr_rqs[sync]--; rl->count[sync]--; - if (flags & REQ_ELVPRIV) + if (rq_flags & RQF_ELVPRIV) q->nr_rqs_elvpriv--; __freed_request(rl, sync); @@ -1018,46 +1029,10 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) return 0; } -/* - * Determine if elevator data should be initialized when allocating the - * request associated with @bio. - */ -static bool blk_rq_should_init_elevator(struct bio *bio) -{ - if (!bio) - return true; - - /* - * Flush requests do not use the elevator so skip initialization. - * This allows a request to share the flush and elevator data. - */ - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) - return false; - - return true; -} - -/** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,22 +1043,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; + req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1127,23 +1102,26 @@ static struct request *__get_request(struct request_list *rl, int op, /* * Decide whether the new request will be managed by elevator. If - * so, mark @op_flags and increment elvpriv. Non-zero elvpriv will + * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. 
This guarantees icq's won't be destroyed and * makes creating new ones safe. * + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + * * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. */ - if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { - op_flags |= REQ_ELVPRIV; + if (!op_is_flush(op) && !blk_queue_bypass(q)) { + rq_flags |= RQF_ELVPRIV; q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; + rq_flags |= RQF_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ @@ -1153,10 +1131,12 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED); + blk_rq_set_prio(rq, ioc); + rq->cmd_flags = op; + rq->rq_flags = rq_flags; /* init elvpriv */ - if (op_flags & REQ_ELVPRIV) { + if (rq_flags & RQF_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); @@ -1193,9 +1173,9 @@ fail_elvpriv: * disturb iosched and blkcg but weird is bettern than dead. */ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", - __func__, dev_name(q->backing_dev_info.dev)); + __func__, dev_name(q->backing_dev_info->dev)); - rq->cmd_flags &= ~REQ_ELVPRIV; + rq->rq_flags &= ~RQF_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); @@ -1212,7 +1192,7 @@ fail_alloc: * queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(rl, op, op_flags); + freed_request(rl, is_sync, rq_flags); /* * in the very unlikely event that allocation failed and no @@ -1230,8 +1210,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1242,18 +1221,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. 
*/ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1289,13 +1267,11 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, { struct request *rq; - BUG_ON(rw != READ && rw != WRITE); - /* create ioc upfront */ create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1320,18 +1296,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) EXPORT_SYMBOL(blk_get_request); /** - * blk_rq_set_block_pc - initialize a request to type BLOCK_PC - * @rq: request to be initialized - * - */ -void blk_rq_set_block_pc(struct request *rq) -{ - rq->cmd_type = REQ_TYPE_BLOCK_PC; - memset(rq->__cmd, 0, sizeof(rq->__cmd)); -} -EXPORT_SYMBOL(blk_rq_set_block_pc); - -/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -1346,8 +1310,9 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); - if (rq->cmd_flags & REQ_QUEUED) + if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1409,7 +1374,7 @@ EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) + if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else @@ -1421,6 +1386,8 @@ static inline void blk_pm_put_request(struct request *rq) {} */ void __blk_put_request(struct request_queue *q, struct request *req) { + req_flags_t rq_flags = req->rq_flags; + if (unlikely(!q)) return; @@ -1436,20 +1403,21 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->issue_stat); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ - if (req->cmd_flags & REQ_ALLOCED) { - unsigned int flags = req->cmd_flags; - int op = req_op(req); + if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); blk_free_request(rl, req); - freed_request(rl, op, flags); + freed_request(rl, sync, rq_flags); blk_put_rl(rl); } } @@ -1471,38 +1439,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -/** - * blk_add_request_payload - add a payload to a request - * @rq: request to update - * @page: page backing the payload - * @offset: offset in page - * @len: length of the payload. - * - * This allows to later add a payload to an already submitted request by - * a block driver. 
The driver needs to take care of freeing the payload - * itself. - * - * Note that this is a quite horrible hack and nothing but handling of - * discard requests should ever use it. - */ -void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len) -{ - struct bio *bio = rq->bio; - - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = offset; - bio->bi_io_vec->bv_len = len; - - bio->bi_iter.bi_size = len; - bio->bi_vcnt = 1; - bio->bi_phys_segments = 1; - - rq->__data_len = rq->resid_len = len; - rq->nr_phys_segments = 1; -} -EXPORT_SYMBOL_GPL(blk_add_request_payload); - bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { @@ -1549,6 +1485,30 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, return true; } +bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + req->nr_phys_segments = segments + 1; + + blk_account_io_start(req, false); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + /** * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at @@ -1577,12 +1537,11 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, { struct blk_plug *plug; struct request *rq; - bool ret = false; struct list_head *plug_list; plug = current->plug; if (!plug) - goto out; + return false; *request_count = 0; if (q->mq_ops) @@ -1591,7 +1550,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, plug_list = &plug->list; list_for_each_entry_reverse(rq, plug_list, queuelist) { - int el_ret; + bool merged = false; if (rq->q == q) { (*request_count)++; @@ -1607,19 +1566,25 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; - el_ret = blk_try_merge(rq, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - ret = bio_attempt_back_merge(q, rq, bio); - if (ret) - break; - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - ret = bio_attempt_front_merge(q, rq, bio); - if (ret) - break; + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + merged = bio_attempt_back_merge(q, rq, bio); + break; + case ELEVATOR_FRONT_MERGE: + merged = bio_attempt_front_merge(q, rq, bio); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + break; } + + if (merged) + return true; } -out: - return ret; + + return false; } unsigned int blk_plug_queued_count(struct request_queue *q) @@ -1648,25 +1613,23 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { - req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; req->__sector = bio->bi_iter.bi_sector; - req->ioprio = bio_prio(bio); + if (ioprio_valid(bio_prio(bio))) + req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct 
blk_plug *plug; - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; - struct request *req; + int where = ELEVATOR_INSERT_SORT; + struct request *req, *free; unsigned int request_count = 0; + unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1683,7 +1646,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) { + if (op_is_flush(bio->bi_opf)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; @@ -1701,48 +1664,48 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - el_ret = elv_merge(q, &req, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - if (bio_attempt_back_merge(q, req, bio)) { - elv_bio_merged(q, req, bio); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out_unlock; - } - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - if (bio_attempt_front_merge(q, req, bio)) { - elv_bio_merged(q, req, bio); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out_unlock; - } + switch (elv_merge(q, &req, bio)) { + case ELEVATOR_BACK_MERGE: + if (!bio_attempt_back_merge(q, req, bio)) + break; + elv_bio_merged(q, req, bio); + free = attempt_back_merge(q, req); + if (free) + __blk_put_request(q, free); + else + elv_merged_request(q, req, ELEVATOR_BACK_MERGE); + goto out_unlock; + case ELEVATOR_FRONT_MERGE: + if (!bio_attempt_front_merge(q, req, bio)) + break; + elv_bio_merged(q, req, bio); + free = attempt_front_merge(q, req); + if (free) + __blk_put_request(q, free); + else + elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); + goto out_unlock; + default: + break; } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); + wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + __wbt_done(q->rq_wb, wb_acct); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + wbt_track(&req->issue_stat, wb_acct); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -1759,11 +1722,16 @@ get_rq: /* * If this is the first request added after a plug, fire * of a plug trace. + * + * @request_count may become stale because of schedule + * out, so check plug list again. */ - if (!request_count) + if (!request_count || list_empty(&plug->list)) trace_block_plug(q); else { - if (request_count >= BLK_MAX_REQUEST_COUNT) { + struct request *last = list_entry_rq(plug->list.prev); + if (request_count >= BLK_MAX_REQUEST_COUNT || + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { blk_flush_plug_list(plug, false); trace_block_plug(q); } @@ -1788,7 +1756,12 @@ static inline void blk_partition_remap(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - if (bio_sectors(bio) && bdev != bdev->bd_contains) { + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. 
+ * Include a test for the reset op code and perform the remap if needed. + */ + if (bdev != bdev->bd_contains && + (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { struct hd_struct *p = bdev->bd_part; bio->bi_iter.bi_sector += p->start_sect; @@ -1920,7 +1893,7 @@ generic_make_request_checks(struct bio *bio) * drivers without flush support don't have to worry * about them. */ - if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { @@ -1942,6 +1915,15 @@ generic_make_request_checks(struct bio *bio) if (!bdev_write_same(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_ZONE_REPORT: + case REQ_OP_ZONE_RESET: + if (!bdev_is_zoned(bio->bi_bdev)) + goto not_supported; + break; + case REQ_OP_WRITE_ZEROES: + if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + goto not_supported; + break; default: break; } @@ -2160,7 +2142,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, false); + blk_mq_sched_insert_request(rq, false, true, false, false); return 0; } @@ -2176,7 +2158,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); @@ -2210,7 +2192,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) unsigned int bytes = 0; struct bio *bio; - if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* @@ -2253,7 +2235,7 @@ void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { + if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2281,7 +2263,7 @@ static struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) { if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) + (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) return NULL; else return rq; @@ -2357,13 +2339,13 @@ struct request *blk_peek_request(struct request_queue *q) if (!rq) break; - if (!(rq->cmd_flags & REQ_STARTED)) { + if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_activate_rq(q, rq); /* @@ -2371,7 +2353,7 @@ struct request *blk_peek_request(struct request_queue *q) * it, a request that has been delayed should * not be passed by new incoming requests */ - rq->cmd_flags |= REQ_STARTED; + rq->rq_flags |= RQF_STARTED; trace_block_rq_issue(q, rq); } @@ -2380,7 +2362,7 @@ struct request *blk_peek_request(struct request_queue *q) q->boundary_rq = NULL; } - if (rq->cmd_flags & REQ_DONTPREP) + if (rq->rq_flags & RQF_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { @@ -2403,11 +2385,11 @@ struct request *blk_peek_request(struct request_queue *q) /* * the request may have been (partially) prepped. 
* we need to keep this request in the front to - * avoid resource deadlock. REQ_STARTED will + * avoid resource deadlock. RQF_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->cmd_flags & REQ_DONTPREP)) { + !(rq->rq_flags & RQF_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again @@ -2420,7 +2402,7 @@ struct request *blk_peek_request(struct request_queue *q) } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. @@ -2475,13 +2457,11 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); - /* - * We are now handing the request to the hardware, initialize - * resid_len to full count and add the timeout handler. - */ - req->resid_len = blk_rq_bytes(req); - if (unlikely(blk_bidi_rq(req))) - req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { + blk_stat_set_issue_time(&req->issue_stat); + req->rq_flags |= RQF_STATS; + wbt_issue(req->q->rq_wb, &req->issue_stat); + } BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); @@ -2553,11 +2533,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. */ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->errors = 0; - if (error && req->cmd_type == REQ_TYPE_FS && - !(req->cmd_flags & REQ_QUIET)) { + if (error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET)) { char *error_type; switch (error) { @@ -2623,14 +2603,16 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } + WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD); + req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ - if (req->cmd_type == REQ_TYPE_FS) + if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ - if (req->cmd_flags & REQ_MIXED_MERGE) { + if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } @@ -2683,7 +2665,7 @@ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; - req->cmd_flags &= ~REQ_DONTPREP; + req->rq_flags &= ~RQF_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } @@ -2694,28 +2676,34 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { - if (req->cmd_flags & REQ_QUEUED) - blk_queue_end_tag(req->q, req); + struct request_queue *q = req->q; + + if (req->rq_flags & RQF_STATS) + blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + + if (req->rq_flags & RQF_QUEUED) + blk_queue_end_tag(q, req); BUG_ON(blk_queued_rq(req)); - if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) - laptop_io_completion(&req->q->backing_dev_info); + if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) + laptop_io_completion(req->q->backing_dev_info); blk_delete_timer(req); - if (req->cmd_flags & REQ_DONTPREP) + if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->issue_stat); req->end_io(req, 
error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); - __blk_put_request(req->q, req); + __blk_put_request(q, req); } } EXPORT_SYMBOL(blk_finish_request); @@ -2939,8 +2927,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3024,9 +3010,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); - dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; @@ -3097,6 +3080,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { @@ -3270,7 +3259,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) /* * rq is already accounted, so use raw insert */ - if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) + if (op_is_flush(rq->cmd_flags)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); @@ -3297,48 +3286,6 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_plug *plug; - long state; - - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - - state = current->state; - while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_poll); - #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -3520,8 +3467,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -3535,5 +3485,9 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); +#ifdef CONFIG_DEBUG_FS + blk_debugfs_root = debugfs_create_dir("block", NULL); +#endif + return 0; } diff --git a/block/blk-exec.c b/block/blk-exec.c index 7ea04325d02f..8cd0e9bc8dc8 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -9,11 +9,7 @@ #include 
<linux/sched/sysctl.h> #include "blk.h" - -/* - * for max sense size - */ -#include <scsi/scsi_cmnd.h> +#include "blk-mq-sched.h" /** * blk_end_sync_rq - executes a completion event on a request @@ -55,7 +51,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; WARN_ON(irqs_disabled()); - WARN_ON(rq->cmd_type == REQ_TYPE_FS); + WARN_ON(!blk_rq_is_passthrough(rq)); rq->rq_disk = bd_disk; rq->end_io = done; @@ -65,14 +61,14 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(rq, at_head, true, false); + blk_mq_sched_insert_request(rq, at_head, true, false, false); return; } spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; rq->errors = -ENXIO; __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); @@ -100,16 +96,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); - char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; unsigned long hang_check; - if (!rq->sense) { - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - } - rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); @@ -123,11 +112,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, if (rq->errors) err = -EIO; - if (rq->sense == sense) { - rq->sense = NULL; - rq->sense_len = 0; - } - return err; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index d308def812db..0d5a9c1da1fc 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,7 +56,7 @@ * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole - * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). 
* * The above peculiarity requires that each FLUSH/FUA request has only one @@ -74,6 +74,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-mq-sched.h" /* FLUSH/FUA sequences */ enum { @@ -127,17 +128,14 @@ static void blk_flush_restore_request(struct request *rq) rq->bio = rq->biotail; /* make @rq a normal request */ - rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - struct request_queue *q = rq->q; - - blk_mq_add_to_requeue_list(rq, add_front); - blk_mq_kick_requeue_list(q); + blk_mq_add_to_requeue_list(rq, add_front, true); return false; } else { if (add_front) @@ -232,7 +230,7 @@ static void flush_end_io(struct request *flush_rq, int error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } @@ -299,8 +297,14 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return false; - /* C2 and C3 */ + /* C2 and C3 + * + * For blk-mq + scheduling, we can risk having all driver tags + * assigned to empty flushes, and we deadlock if we are expecting + * other requests to make progress. Don't defer for that case. + */ if (!list_empty(&fq->flush_data_in_flight) && + !(q->mq_ops && q->elevator) && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return false; @@ -325,12 +329,12 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) flush_rq->tag = first_rq->tag; fq->orig_rq = first_rq; - hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } - flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ); + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; + flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -343,6 +347,34 @@ static void flush_data_end_io(struct request *rq, int error) struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); /* + * Updating q->in_flight[] here for making this tag usable + * early. Because in blk_queue_start_tag(), + * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and + * reserve tags for sync I/O. + * + * More importantly this way can avoid the following I/O + * deadlock: + * + * - suppose there are 40 fua requests comming to flush queue + * and queue depth is 31 + * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc + * tag for async I/O any more + * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT + * and flush_data_end_io() is called + * - the other rqs still can't go ahead if not updating + * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs + * are held in flush data queue and make no progress of + * handling post flush rq + * - only after the post flush rq is handled, all these rqs + * can be completed + */ + + elv_completed_request(q, rq); + + /* for avoiding double accounting */ + rq->rq_flags &= ~RQF_STARTED; + + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). 
*/ @@ -358,16 +390,17 @@ static void mq_flush_data_end_io(struct request *rq, int error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_mq_run_hw_queue(hctx, true); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + + blk_mq_run_hw_queue(hctx, true); } /** @@ -398,6 +431,13 @@ void blk_insert_flush(struct request *rq) rq->cmd_flags &= ~REQ_FUA; /* + * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any + * of those flags, we have to set REQ_SYNC to avoid skewing + * the request accounting. + */ + rq->cmd_flags |= REQ_SYNC; + + /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply @@ -420,9 +460,9 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) { - blk_mq_insert_request(rq, false, false, true); - } else + if (q->mq_ops) + blk_mq_sched_insert_request(rq, false, true, false, false); + else list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -433,7 +473,7 @@ void blk_insert_flush(struct request *rq) */ memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); - rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { rq->end_io = mq_flush_data_end_io; @@ -485,7 +525,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); @@ -512,11 +552,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, if (!fq) goto fail; - if (q->mq_ops) { + if (q->mq_ops) spin_lock_init(&fq->mq_flush_lock); - rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); - } + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); if (!fq->flush_rq) goto fail_rq; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d69c5c79f98e..9f0ff5ba4f84 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -443,10 +443,10 @@ void blk_integrity_revalidate(struct gendisk *disk) return; if (bi->profile) - disk->queue->backing_dev_info.capabilities |= + disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - disk->queue->backing_dev_info.capabilities &= + disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 381cb50a673c..b12f9c87b4c3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -35,7 +35,10 @@ static void icq_free_icq_rcu(struct rcu_head *head) kmem_cache_free(icq->__rcu_icq_cache, icq); } -/* Exit an icq. Called with both ioc and q locked. */ +/* + * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for + * mq. 
+ */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; @@ -43,8 +46,10 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + if (et->uses_mq && et->ops.mq.exit_icq) + et->ops.mq.exit_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) + et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; } @@ -164,6 +169,7 @@ EXPORT_SYMBOL(put_io_context); */ void put_io_context_active(struct io_context *ioc) { + struct elevator_type *et; unsigned long flags; struct io_cq *icq; @@ -182,13 +188,19 @@ retry: hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; - if (spin_trylock(icq->q->queue_lock)) { + + et = icq->q->elevator->type; + if (et->uses_mq) { ioc_exit_icq(icq); - spin_unlock(icq->q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - goto retry; + if (spin_trylock(icq->q->queue_lock)) { + ioc_exit_icq(icq); + spin_unlock(icq->q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + goto retry; + } } } spin_unlock_irqrestore(&ioc->lock, flags); @@ -383,8 +395,10 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->ops.elevator_init_icq_fn) - et->ops.elevator_init_icq_fn(icq); + if (et->uses_mq && et->ops.mq.init_icq) + et->ops.mq.init_icq(icq); + else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) + et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); diff --git a/block/blk-lib.c b/block/blk-lib.c index 083e56f72308..ed1e78e24db0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,8 +29,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; + sector_t bs_mask; if (!q) return -ENXIO; @@ -50,6 +51,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, op = REQ_OP_DISCARD; } + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; @@ -75,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio = next_bio(bio, 1, gfp_mask); + bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -132,28 +137,36 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL(blkdev_issue_discard); /** - * blkdev_issue_write_same - queue a write same operation + * __blkdev_issue_write_same - generate number of bios with same page * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data to write + * @biop: pointer to anchor bio * * Description: - * Issue a write same request for the sectors in question. + * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. 
*/ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) +static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page, + struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; - struct bio *bio = NULL; - int ret = 0; + struct bio *bio = *biop; + sector_t bs_mask; if (!q) return -ENXIO; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + if (!bdev_write_same(bdev)) + return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -175,34 +188,125 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } + cond_resched(); } - if (bio) { + *biop = bio; + return 0; +} + +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data + * + * Description: + * Issue a write same request for the sectors in question. + */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, + &bio); + if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); /** - * blkdev_issue_zeroout - generate number of zero filed write bios + * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio * * Description: - * Generate and issue number of bios with zerofiled pages. + * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. 
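The write-zeroes path documented above is only taken when the device advertises a limit. As a rough sketch (the queue-limit setter is assumed from the same patch series, and the value is purely illustrative), a driver opts in and a caller can probe support like this:

	/* driver side: advertise REQ_OP_WRITE_ZEROES support */
	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);

	/* caller side: __blkdev_issue_write_zeroes() returns -EOPNOTSUPP otherwise */
	if (!bdev_write_zeroes_sectors(bdev))
		pr_debug("falling back to zero-filled writes\n");
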
*/ +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); -static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + if (!q) + return -ENXIO; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + +/** + * __blkdev_issue_zeroout - generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * @discard: discard flag + * + * Description: + * Generate and issue number of bios with zerofiled pages. + */ +int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard) { int ret; - struct bio *bio = NULL; + int bi_size = 0; + struct bio *bio = *biop; unsigned int sz; + sector_t bs_mask; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + ret = 0; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), gfp_mask); @@ -212,21 +316,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); - nr_sects -= ret >> 9; - sector += ret >> 9; - if (ret < (sz << 9)) + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < (sz << 9)) break; } + cond_resched(); } - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; - } - return 0; + *biop = bio; +out: + return ret; } +EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range @@ -243,26 +346,37 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * the discard request fail, if the discard flag is not set, or if * discard_zeroes_data is not supported, this function will resort to * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. If the block device supports the WRITE SAME command - * blkdev_issue_zeroout() will use it to optimize the process of + * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME + * command(s), blkdev_issue_zeroout() will use it to optimize the process of * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. 
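Because __blkdev_issue_zeroout() is now exported, a stacking driver can build the bio chain itself and submit it once. A minimal sketch of that caller pattern, essentially what blkdev_issue_zeroout() below does, with bdev, sector and nr_sects assumed valid:

	struct bio *bio = NULL;
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOIO,
				     &bio, false);
	if (ret == 0 && bio) {
		ret = submit_bio_wait(bio);	/* waits for the whole chain */
		bio_put(bio);
	}
	blk_finish_plug(&plug);
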
*/ - int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { + int ret; + struct bio *bio = NULL; + struct blk_plug plug; + if (discard) { if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, BLKDEV_DISCARD_ZERO)) return 0; } - if (bdev_write_same(bdev) && - blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0)) == 0) + if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0))) return 0; - return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); + blk_start_plug(&plug); + ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + &bio, discard); + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index b8657fa8dc9a..2f18c2a0be1b 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -60,6 +60,9 @@ static int __blk_rq_map_user_iov(struct request *rq, if (IS_ERR(bio)) return PTR_ERR(bio); + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= req_op(rq); + if (map_data && map_data->null_mapped) bio_set_flag(bio, BIO_NULL_MAPPED); @@ -88,7 +91,7 @@ static int __blk_rq_map_user_iov(struct request *rq, } /** - * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage + * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) @@ -118,6 +121,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct iov_iter i; int ret; + if (!iter_is_iovec(iter)) + goto fail; + if (map_data) copy = true; else if (iov_iter_alignment(iter) & align) @@ -135,11 +141,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: __blk_rq_unmap_user(bio); +fail: rq->bio = NULL; return -EINVAL; } @@ -193,7 +200,7 @@ int blk_rq_unmap_user(struct bio *bio) EXPORT_SYMBOL(blk_rq_unmap_user); /** - * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage + * blk_rq_map_kern - map kernel data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer @@ -228,11 +235,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (!reading) - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= req_op(rq); if (do_copy) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 2642e5fc8b69..2afa262425d1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); break; + case REQ_OP_WRITE_ZEROES: + split = NULL; + nsegs = (*bio)->bi_phys_segments; + break; case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; @@ -237,15 +241,14 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; - /* - * This should probably be returning 0, but 
blk_add_request_payload() - * (Christoph!!!!) - */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; + } fbio = bio; cluster = blk_queue_cluster(q); @@ -402,38 +405,21 @@ new_segment: *bvprv = *bvec; } +static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = sglist; + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; - int nsegs, cluster; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - /* - * This is a hack - drivers should be neither modifying the - * biovec, nor relying on bi_vcnt - but because of - * blk_add_request_payload(), a discard bio may or may not have - * a payload we need to set up here (thank you Christoph) and - * bi_vcnt is really the only way of telling if we need to. - */ - if (!bio->bi_vcnt) - return 0; - /* Fall through */ - case REQ_OP_WRITE_SAME: - *sg = sglist; - bvec = bio_iovec(bio); - sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - return 1; - default: - break; - } + int cluster = blk_queue_cluster(q), nsegs = 0; for_each_bio(bio) bio_for_each_segment(bvec, bio, iter) @@ -453,10 +439,14 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg = NULL; int nsegs = 0; - if (rq->bio) + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); + else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + if (unlikely(rq->rq_flags & RQF_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; @@ -486,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ - WARN_ON(nsegs > rq->nr_phys_segments); + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } @@ -512,9 +502,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; no_merge: - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } @@ -528,9 +516,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(req->biotail, BIO_SEG_VALID)) @@ -552,9 +538,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(bio, BIO_SEG_VALID)) @@ -634,7 
+618,7 @@ void blk_rq_set_mixed_merge(struct request *rq) unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; - if (rq->cmd_flags & REQ_MIXED_MERGE) + if (rq->rq_flags & RQF_MIXED_MERGE) return; /* @@ -647,7 +631,7 @@ void blk_rq_set_mixed_merge(struct request *rq) (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } - rq->cmd_flags |= REQ_MIXED_MERGE; + rq->rq_flags |= RQF_MIXED_MERGE; } static void blk_account_io_merge(struct request *req) @@ -668,31 +652,32 @@ static void blk_account_io_merge(struct request *req) } /* - * Has to be called with the request spinlock acquired + * For non-mq, this has to be called with the request spinlock acquired. + * For mq with scheduling, the appropriate queue wide lock should be held. */ -static int attempt_merge(struct request_queue *q, struct request *req, - struct request *next) +static struct request *attempt_merge(struct request_queue *q, + struct request *req, struct request *next) { if (!rq_mergeable(req) || !rq_mergeable(next)) - return 0; + return NULL; if (req_op(req) != req_op(next)) - return 0; + return NULL; /* * not contiguous */ if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) - return 0; + return NULL; if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk || req_no_special_merge(next)) - return 0; + return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(req->bio, next->bio)) - return 0; + return NULL; /* * If we are allowed to merge, then append bio list @@ -701,7 +686,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * counts here. */ if (!ll_merge_requests_fn(q, req, next)) - return 0; + return NULL; /* * If failfast settings disagree or any of the two is already @@ -709,7 +694,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * makes sure that all involved bios have mixable attributes * set properly. 
*/ - if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); @@ -741,42 +726,51 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (blk_rq_cpu_valid(next)) req->cpu = next->cpu; - /* owner-ship of bio passed from next to req */ + /* + * ownership of bio passed from next to req, return 'next' for + * the caller to free + */ next->bio = NULL; - __blk_put_request(q, next); - return 1; + return next; } -int attempt_back_merge(struct request_queue *q, struct request *rq) +struct request *attempt_back_merge(struct request_queue *q, struct request *rq) { struct request *next = elv_latter_request(q, rq); if (next) return attempt_merge(q, rq, next); - return 0; + return NULL; } -int attempt_front_merge(struct request_queue *q, struct request *rq) +struct request *attempt_front_merge(struct request_queue *q, struct request *rq) { struct request *prev = elv_former_request(q, rq); if (prev) return attempt_merge(q, prev, rq); - return 0; + return NULL; } int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; + struct request *free; - if (e->type->ops.elevator_allow_rq_merge_fn) - if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next)) + if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) + if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) return 0; - return attempt_merge(q, rq, next); + free = attempt_merge(q, rq, next); + if (free) { + __blk_put_request(q, free); + return 1; + } + + return 0; } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) @@ -807,9 +801,12 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return true; } -int blk_try_merge(struct request *rq, struct bio *bio) +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) { - if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) + if (req_op(rq) == REQ_OP_DISCARD && + queue_max_discard_segments(rq->q) > 1) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) return ELEVATOR_FRONT_MERGE; diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c deleted file mode 100644 index bb3ed488f7b5..000000000000 --- a/block/blk-mq-cpu.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CPU notifier helper code for blk-mq - * - * Copyright (C) 2013-2014 Jens Axboe - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/list.h> -#include <linux/llist.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <linux/blk-mq.h> -#include "blk-mq.h" - -static LIST_HEAD(blk_mq_cpu_notify_list); -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); - -static int blk_mq_main_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long) hcpu; - struct blk_mq_cpu_notifier *notify; - int ret = NOTIFY_OK; - - raw_spin_lock(&blk_mq_cpu_notify_lock); - - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { - ret = notify->notify(notify->data, action, cpu); - if (ret != NOTIFY_OK) - break; - } - - raw_spin_unlock(&blk_mq_cpu_notify_lock); - return ret; -} - -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - 
BUG_ON(!notifier->notify); - - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_del(¬ifier->list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data) -{ - notifier->notify = fn; - notifier->data = data; -} - -void __init blk_mq_cpu_init(void) -{ - hotcpu_notifier(blk_mq_main_cpu_notify, 0); -} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index d0634bcf322f..8e61e8640e17 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,14 +31,16 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask) +int blk_mq_map_queues(struct blk_mq_tag_set *set) { + unsigned int *map = set->mq_map; + unsigned int nr_queues = set->nr_hw_queues; + const struct cpumask *online_mask = cpu_online_mask; unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return 1; + return -ENOMEM; cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; @@ -85,23 +87,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, free_cpumask_var(cpus); return 0; } - -unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) -{ - unsigned int *map; - - /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, - set->numa_node); - if (!map) - return NULL; - - if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) - return map; - - kfree(map); - return NULL; -} +EXPORT_SYMBOL_GPL(blk_mq_map_queues); /* * We have no quick way of doing reverse lookups. This is only used at diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c new file mode 100644 index 000000000000..f6d917977b33 --- /dev/null +++ b/block/blk-mq-debugfs.c @@ -0,0 +1,772 @@ +/* + * Copyright (C) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
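As an orientation aid for the new blk-mq-debugfs.c file, the attribute tables further down produce roughly the following layout, assuming debugfs is mounted at /sys/kernel/debug and that blk_debugfs_root is the block/ directory created elsewhere in this series (the per-queue directory name is supplied by the caller of blk_mq_debugfs_register(), outside this hunk):

	/*
	 *  /sys/kernel/debug/block/<name>/mq/<hctx-nr>/
	 *	state, flags, dispatch, ctx_map, tags, tags_bitmap,
	 *	sched_tags, sched_tags_bitmap, io_poll, stats,
	 *	dispatched, queued, run, active
	 *  /sys/kernel/debug/block/<name>/mq/<hctx-nr>/cpu<N>/
	 *	rq_list, dispatched, merged, completed
	 */
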
+ */ + +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + const struct file_operations *fops; +}; + +static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, ops); + if (!ret) { + m = file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static int hctx_state_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->state); + return 0; +} + +static int hctx_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_state_show, inode->i_private); +} + +static const struct file_operations hctx_state_fops = { + .open = hctx_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_flags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "0x%lx\n", hctx->flags); + return 0; +} + +static int hctx_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_flags_show, inode->i_private); +} + +static const struct file_operations hctx_flags_fops = { + .open = hctx_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) +{ + struct request *rq = list_entry_rq(v); + + seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n", + rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags, + rq->tag, rq->internal_tag); + return 0; +} + +static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_lock(&hctx->lock); + return seq_list_start(&hctx->dispatch, *pos); +} + +static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + return seq_list_next(v, &hctx->dispatch, pos); +} + +static void hctx_dispatch_stop(struct seq_file *m, void *v) + __releases(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_unlock(&hctx->lock); +} + +static const struct seq_operations hctx_dispatch_seq_ops = { + .start = hctx_dispatch_start, + .next = hctx_dispatch_next, + .stop = hctx_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int hctx_dispatch_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops); +} + +static const struct file_operations hctx_dispatch_fops = { + .open = hctx_dispatch_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int hctx_ctx_map_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + sbitmap_bitmap_show(&hctx->ctx_map, m); + return 0; +} + +static int hctx_ctx_map_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_ctx_map_show, inode->i_private); +} + +static const struct file_operations hctx_ctx_map_fops = { + .open = hctx_ctx_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void blk_mq_debugfs_tags_show(struct seq_file *m, + struct blk_mq_tags *tags) +{ + seq_printf(m, "nr_tags=%u\n", tags->nr_tags); + seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); + 
seq_printf(m, "active_queues=%d\n", + atomic_read(&tags->active_queues)); + + seq_puts(m, "\nbitmap_tags:\n"); + sbitmap_queue_show(&tags->bitmap_tags, m); + + if (tags->nr_reserved_tags) { + seq_puts(m, "\nbreserved_tags:\n"); + sbitmap_queue_show(&tags->breserved_tags, m); + } +} + +static int hctx_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + blk_mq_debugfs_tags_show(m, hctx->tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_show, inode->i_private); +} + +static const struct file_operations hctx_tags_fops = { + .open = hctx_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_tags_bitmap_fops = { + .open = hctx_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + blk_mq_debugfs_tags_show(m, hctx->sched_tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_fops = { + .open = hctx_sched_tags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private); +} + +static const struct file_operations hctx_sched_tags_bitmap_fops = { + .open = hctx_sched_tags_bitmap_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_io_poll_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "considered=%lu\n", hctx->poll_considered); + seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); + seq_printf(m, "success=%lu\n", hctx->poll_success); + return 0; +} + +static int hctx_io_poll_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_io_poll_show, inode->i_private); +} + +static ssize_t hctx_io_poll_write(struct file *file, const char __user 
*buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + return count; +} + +static const struct file_operations hctx_io_poll_fops = { + .open = hctx_io_poll_open, + .read = seq_read, + .write = hctx_io_poll_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); +} + +static int hctx_stats_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_rq_stat stat[2]; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + seq_puts(m, "read: "); + print_stat(m, &stat[BLK_STAT_READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &stat[BLK_STAT_WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int hctx_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_stats_show, inode->i_private); +} + +static ssize_t hctx_stats_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + struct blk_mq_ctx *ctx; + int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + return count; +} + +static const struct file_operations hctx_stats_fops = { + .open = hctx_stats_open, + .read = seq_read, + .write = hctx_stats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); + + seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); + } + + seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); + return 0; +} + +static int hctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_dispatched_show, inode->i_private); +} + +static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + int i; + + for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) + hctx->dispatched[i] = 0; + return count; +} + +static const struct file_operations hctx_dispatched_fops = { + .open = hctx_dispatched_open, + .read = seq_read, + .write = hctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_queued_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->queued); + return 0; +} + +static int hctx_queued_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_queued_show, inode->i_private); +} + +static ssize_t hctx_queued_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->queued = 0; + return count; +} + +static const struct file_operations hctx_queued_fops = { + .open = hctx_queued_open, + .read = seq_read, + .write = hctx_queued_write, + .llseek = seq_lseek, + 
.release = single_release, +}; + +static int hctx_run_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%lu\n", hctx->run); + return 0; +} + +static int hctx_run_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_run_show, inode->i_private); +} + +static ssize_t hctx_run_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_hw_ctx *hctx = m->private; + + hctx->run = 0; + return count; +} + +static const struct file_operations hctx_run_fops = { + .open = hctx_run_open, + .read = seq_read, + .write = hctx_run_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int hctx_active_show(struct seq_file *m, void *v) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + return 0; +} + +static int hctx_active_open(struct inode *inode, struct file *file) +{ + return single_open(file, hctx_active_show, inode->i_private); +} + +static const struct file_operations hctx_active_fops = { + .open = hctx_active_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) + __acquires(&ctx->lock) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_lock(&ctx->lock); + return seq_list_start(&ctx->rq_list, *pos); +} + +static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_ctx *ctx = m->private; + + return seq_list_next(v, &ctx->rq_list, pos); +} + +static void ctx_rq_list_stop(struct seq_file *m, void *v) + __releases(&ctx->lock) +{ + struct blk_mq_ctx *ctx = m->private; + + spin_unlock(&ctx->lock); +} + +static const struct seq_operations ctx_rq_list_seq_ops = { + .start = ctx_rq_list_start, + .next = ctx_rq_list_next, + .stop = ctx_rq_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int ctx_rq_list_open(struct inode *inode, struct file *file) +{ + return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops); +} + +static const struct file_operations ctx_rq_list_fops = { + .open = ctx_rq_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int ctx_dispatched_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); + return 0; +} + +static int ctx_dispatched_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_dispatched_show, inode->i_private); +} + +static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; + return count; +} + +static const struct file_operations ctx_dispatched_fops = { + .open = ctx_dispatched_open, + .read = seq_read, + .write = ctx_dispatched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_merged_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu\n", ctx->rq_merged); + return 0; +} + +static int ctx_merged_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_merged_show, inode->i_private); +} + +static ssize_t ctx_merged_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx 
= m->private; + + ctx->rq_merged = 0; + return count; +} + +static const struct file_operations ctx_merged_fops = { + .open = ctx_merged_open, + .read = seq_read, + .write = ctx_merged_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ctx_completed_show(struct seq_file *m, void *v) +{ + struct blk_mq_ctx *ctx = m->private; + + seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); + return 0; +} + +static int ctx_completed_open(struct inode *inode, struct file *file) +{ + return single_open(file, ctx_completed_show, inode->i_private); +} + +static ssize_t ctx_completed_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct blk_mq_ctx *ctx = m->private; + + ctx->rq_completed[0] = ctx->rq_completed[1] = 0; + return count; +} + +static const struct file_operations ctx_completed_fops = { + .open = ctx_completed_open, + .read = seq_read, + .write = ctx_completed_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { + {"state", 0400, &hctx_state_fops}, + {"flags", 0400, &hctx_flags_fops}, + {"dispatch", 0400, &hctx_dispatch_fops}, + {"ctx_map", 0400, &hctx_ctx_map_fops}, + {"tags", 0400, &hctx_tags_fops}, + {"tags_bitmap", 0400, &hctx_tags_bitmap_fops}, + {"sched_tags", 0400, &hctx_sched_tags_fops}, + {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, + {"io_poll", 0600, &hctx_io_poll_fops}, + {"stats", 0600, &hctx_stats_fops}, + {"dispatched", 0600, &hctx_dispatched_fops}, + {"queued", 0600, &hctx_queued_fops}, + {"run", 0600, &hctx_run_fops}, + {"active", 0400, &hctx_active_fops}, + {}, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { + {"rq_list", 0400, &ctx_rq_list_fops}, + {"dispatched", 0600, &ctx_dispatched_fops}, + {"merged", 0600, &ctx_merged_fops}, + {"completed", 0600, &ctx_completed_fops}, + {}, +}; + +int blk_mq_debugfs_register(struct request_queue *q, const char *name) +{ + if (!blk_debugfs_root) + return -ENOENT; + + q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root); + if (!q->debugfs_dir) + goto err; + + if (blk_mq_debugfs_register_hctxs(q)) + goto err; + + return 0; + +err: + blk_mq_debugfs_unregister(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ + debugfs_remove_recursive(q->debugfs_dir); + q->mq_debugfs_dir = NULL; + q->debugfs_dir = NULL; +} + +static bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) +{ + for (; attr->name; attr++) { + if (!debugfs_create_file(attr->name, attr->mode, parent, + data, attr->fops)) + return false; + } + return true; +} + +static int blk_mq_debugfs_register_ctx(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct dentry *hctx_dir) +{ + struct dentry *ctx_dir; + char name[20]; + + snprintf(name, sizeof(name), "cpu%u", ctx->cpu); + ctx_dir = debugfs_create_dir(name, hctx_dir); + if (!ctx_dir) + return -ENOMEM; + + if (!debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs)) + return -ENOMEM; + + return 0; +} + +static int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + struct dentry *hctx_dir; + char name[20]; + int i; + + snprintf(name, sizeof(name), "%u", hctx->queue_num); + hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir); + if (!hctx_dir) + return -ENOMEM; + + if (!debugfs_create_files(hctx_dir, hctx, 
blk_mq_debugfs_hctx_attrs)) + return -ENOMEM; + + hctx_for_each_ctx(hctx, ctx, i) { + if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir)) + return -ENOMEM; + } + + return 0; +} + +int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + if (!q->debugfs_dir) + return -ENOENT; + + q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir); + if (!q->mq_debugfs_dir) + goto err; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_debugfs_register_hctx(q, hctx)) + goto err; + } + + return 0; + +err: + blk_mq_debugfs_unregister_hctxs(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ + debugfs_remove_recursive(q->mq_debugfs_dir); + q->mq_debugfs_dir = NULL; +} diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c new file mode 100644 index 000000000000..966c2169762e --- /dev/null +++ b/block/blk-mq-pci.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/kobject.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/blk-mq-pci.h> +#include <linux/pci.h> +#include <linux/module.h> + +/** + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device + * @set: tagset to provide the mapping for + * @pdev: PCI device associated with @set. + * + * This function assumes the PCI device @pdev has at least as many available + * interrupt vectors as @set has queues. It will then query the vector + * corresponding to each queue for its affinity mask and build a queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector.
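A hypothetical consumer of the helper documented above (and defined just below): a PCI driver points its tag set's queue-mapping hook at it. The driver structure and its fields are illustrative, not part of this patch; the .map_queues hook is the one added to blk_mq_ops elsewhere in this series.

	static int sketch_map_queues(struct blk_mq_tag_set *set)
	{
		struct sketch_dev *sdev = set->driver_data;	/* hypothetical driver data */

		return blk_mq_pci_map_queues(set, sdev->pdev);
	}

	/* ... and in the driver's blk_mq_ops: .map_queues = sketch_map_queues, ... */
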
+ */ +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue); + if (!mask) + return -EINVAL; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c new file mode 100644 index 000000000000..9e8d6795a8c1 --- /dev/null +++ b/block/blk-mq-sched.c @@ -0,0 +1,515 @@ +/* + * blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/blk-mq.h> + +#include <trace/events/block.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-wbt.h" + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (exit && hctx->sched_data) + exit(hctx); + kfree(hctx->sched_data); + hctx->sched_data = NULL; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)) +{ + struct blk_mq_hw_ctx *hctx; + int ret; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); + if (!hctx->sched_data) { + ret = -ENOMEM; + goto error; + } + + if (init) { + ret = init(hctx); + if (ret) { + /* + * We don't want to give exit() a partially + * initialized sched_data. init() must clean up + * if it fails. + */ + kfree(hctx->sched_data); + hctx->sched_data = NULL; + goto error; + } + } + } + + return 0; +error: + blk_mq_sched_free_hctx_data(q, exit); + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); + +static void __blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, + struct bio *bio, + struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } + + rq->elv.icq = icq; + if (!blk_mq_sched_get_rq_priv(q, rq, bio)) { + rq->rq_flags |= RQF_ELVPRIV; + get_io_context(icq->ioc); + return; + } + + rq->elv.icq = NULL; +} + +static void blk_mq_sched_assign_ioc(struct request_queue *q, + struct request *rq, struct bio *bio) +{ + struct io_context *ioc; + + ioc = rq_ioc(bio); + if (ioc) + __blk_mq_sched_assign_ioc(q, rq, bio, ioc); +} + +struct request *blk_mq_sched_get_request(struct request_queue *q, + struct bio *bio, + unsigned int op, + struct blk_mq_alloc_data *data) +{ + struct elevator_queue *e = q->elevator; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + + blk_queue_enter_live(q); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); + + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; + + /* + * Flush requests are special and go directly to the + * dispatch list. 
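Going back to the hctx_data helpers near the top of blk-mq-sched.c: they hand a scheduler pre-allocated per-hctx storage and call its init hook afterwards. A minimal hypothetical user (struct and function names are illustrative):

	struct sketch_hctx_data {
		spinlock_t lock;
		struct list_head rqs;
	};

	static int sketch_init_hctx(struct blk_mq_hw_ctx *hctx)
	{
		struct sketch_hctx_data *shd = hctx->sched_data;	/* already allocated */

		spin_lock_init(&shd->lock);
		INIT_LIST_HEAD(&shd->rqs);
		return 0;
	}

	/* blk_mq_sched_init_hctx_data(q, sizeof(struct sketch_hctx_data),
	 *			       sketch_init_hctx, NULL); */
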
+ */ + if (!op_is_flush(op) && e->type->ops.mq.get_request) { + rq = e->type->ops.mq.get_request(q, op, data); + if (rq) + rq->rq_flags |= RQF_QUEUED; + } else + rq = __blk_mq_alloc_request(data, op); + } else { + rq = __blk_mq_alloc_request(data, op); + if (rq) + data->hctx->tags->rqs[rq->tag] = rq; + } + + if (rq) { + if (!op_is_flush(op)) { + rq->elv.icq = NULL; + if (e && e->type->icq_cache) + blk_mq_sched_assign_ioc(q, rq, bio); + } + data->hctx->queued++; + return rq; + } + + blk_queue_exit(q); + return NULL; +} + +void blk_mq_sched_put_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (rq->rq_flags & RQF_ELVPRIV) { + blk_mq_sched_put_rq_priv(rq->q, rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) + e->type->ops.mq.put_request(rq); + else + blk_mq_finish_request(rq); +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; + bool did_work = false; + LIST_HEAD(rq_list); + + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + hctx->run++; + + /* + * If we have previous entries on our dispatch list, grab them first for + * more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Only ask the scheduler for requests, if we didn't have residual + * requests from the dispatch list. This is to avoid the case where + * we only ever dispatch a fraction of the requests available because + * of low device queue depth. Once we pull requests out of the IO + * scheduler, we can no longer merge or sort them. So it's best to + * leave them there for as long as we can. Mark the hw queue as + * needing a restart in that case. + */ + if (!list_empty(&rq_list)) { + blk_mq_sched_mark_restart(hctx); + did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); + } else if (!has_sched_dispatch) { + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list); + } + + /* + * We want to dispatch from the scheduler if we had no work left + * on the dispatch list, OR if we did have work but weren't able + * to make progress. 
+ */ + if (!did_work && has_sched_dispatch) { + do { + struct request *rq; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); + } +} + +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)) +{ + do { + struct request *rq; + + rq = get_rq(hctx); + if (!rq) + break; + + list_add_tail(&rq->queuelist, rq_list); + } while (1); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + struct request **merged_request) +{ + struct request *rq; + + switch (elv_merge(q, &rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (!bio_attempt_back_merge(q, rq, bio)) + return false; + *merged_request = attempt_back_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); + return true; + case ELEVATOR_FRONT_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (!bio_attempt_front_merge(q, rq, bio)) + return false; + *merged_request = attempt_front_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); + +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.mq.bio_merge) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + blk_mq_put_ctx(ctx); + return e->type->ops.mq.bio_merge(hctx, bio); + } + + return false; +} + +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +{ + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); + +void blk_mq_sched_request_inserted(struct request *rq) +{ + trace_block_rq_insert(rq->q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); + +static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1) { + rq->rq_flags |= RQF_SORTED; + return false; + } + + /* + * If we already have a real request tag, send directly to + * the dispatch list. + */ + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; +} + +static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (blk_mq_hctx_has_pending(hctx)) + blk_mq_run_hw_queue(hctx, true); + } +} + +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + blk_mq_sched_restart_hctx(hctx); + else { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + return; + + clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_restart_hctx(hctx); + } +} + +/* + * Add flush/fua to the queue. If we fail getting a driver tag, then + * punt to the requeue list. Requeue will re-invoke us from a context + * that's safe to block from. 
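Putting the hooks exercised above together, a bare-bones sketch of how an mq I/O scheduler plugs into this framework; only the hooks called from this file are shown, init/exit and per-hctx data setup are omitted, and the scheduler name is made up:

	static void sketch_insert_requests(struct blk_mq_hw_ctx *hctx,
					   struct list_head *list, bool at_head)
	{
		/* queue the requests on private per-hctx state, e.g. hctx->sched_data */
	}

	static struct request *sketch_dispatch_request(struct blk_mq_hw_ctx *hctx)
	{
		return NULL;	/* hand back the next request, or NULL when idle */
	}

	static bool sketch_has_work(struct blk_mq_hw_ctx *hctx)
	{
		return false;	/* true when dispatch_request() would return something */
	}

	static struct elevator_type sketch_iosched = {
		.ops.mq = {
			.insert_requests	= sketch_insert_requests,
			.dispatch_request	= sketch_dispatch_request,
			.has_work		= sketch_has_work,
		},
		.uses_mq	= true,
		.elevator_name	= "sketch",
		.elevator_owner	= THIS_MODULE,
	};

A real scheduler would also implement bio_merge, typically by calling blk_mq_sched_try_merge() under its own lock and freeing any request handed back through merged_request, and would register itself with elv_register() like any other elevator.
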
+ */ +static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool can_block) +{ + if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { + blk_insert_flush(rq); + blk_mq_run_hw_queue(hctx, true); + } else + blk_mq_add_to_requeue_list(rq, false, true); +} + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + + if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + blk_mq_sched_insert_flush(hctx, rq, can_block); + return; + } + + if (e && blk_mq_sched_bypass_insert(hctx, rq)) + goto run; + + if (e && e->type->ops.mq.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.mq.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + +run: + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct elevator_queue *e = hctx->queue->elevator; + + if (e) { + struct request *rq, *next; + + /* + * We bypass requests that already have a driver tag assigned, + * which should only be flushes. Flushes are only ever inserted + * as single requests, so we shouldn't ever hit the + * WARN_ON_ONCE() below (but let's handle it just in case). + */ + list_for_each_entry_safe(rq, next, list, queuelist) { + if (WARN_ON_ONCE(rq->tag != -1)) { + list_del_init(&rq->queuelist); + blk_mq_sched_bypass_insert(hctx, rq); + } + } + } + + if (e && e->type->ops.mq.insert_requests) + e->type->ops.mq.insert_requests(hctx, list, false); + else + blk_mq_insert_requests(hctx, ctx, list); + + blk_mq_run_hw_queue(hctx, run_queue_async); +} + +static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + if (hctx->sched_tags) { + blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } +} + +int blk_mq_sched_setup(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Default to 256, since we don't split into sync/async like the + * old code did. Additionally, this is a per-hw queue depth. + */ + q->nr_requests = 2 * BLKDEV_MAX_RQ; + + /* + * We're switching to using an IO scheduler, so setup the hctx + * scheduler tags and switch the request map from the regular + * tags to scheduler tags. First allocate what we need, so we + * can safely fail and fallback, if needed. 
+ */ + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + if (!hctx->sched_tags) { + ret = -ENOMEM; + break; + } + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); + if (ret) + break; + } + + /* + * If we failed, free what we did allocate + */ + if (ret) { + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + blk_mq_sched_free_tags(set, hctx, i); + } + + return ret; + } + + return 0; +} + +void blk_mq_sched_teardown(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_free_tags(set, hctx, i); +} + +int blk_mq_sched_init(struct request_queue *q) +{ + int ret; + +#if defined(CONFIG_DEFAULT_SQ_NONE) + if (q->nr_hw_queues == 1) + return 0; +#endif +#if defined(CONFIG_DEFAULT_MQ_NONE) + if (q->nr_hw_queues > 1) + return 0; +#endif + + mutex_lock(&q->sysfs_lock); + ret = elevator_init(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return ret; +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h new file mode 100644 index 000000000000..7b5f3b95c78e --- /dev/null +++ b/block/blk-mq-sched.h @@ -0,0 +1,143 @@ +#ifndef BLK_MQ_SCHED_H +#define BLK_MQ_SCHED_H + +#include "blk-mq.h" +#include "blk-mq-tag.h" + +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, + int (*init)(struct blk_mq_hw_ctx *), + void (*exit)(struct blk_mq_hw_ctx *)); + +void blk_mq_sched_free_hctx_data(struct request_queue *q, + void (*exit)(struct blk_mq_hw_ctx *)); + +struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data); +void blk_mq_sched_put_request(struct request *rq); + +void blk_mq_sched_request_inserted(struct request *rq); +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + struct request **merged_request); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async, bool can_block); +void blk_mq_sched_insert_requests(struct request_queue *q, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async); + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, + struct request *(*get_rq)(struct blk_mq_hw_ctx *)); + +int blk_mq_sched_setup(struct request_queue *q); +void blk_mq_sched_teardown(struct request_queue *q); + +int blk_mq_sched_init(struct request_queue *q); + +static inline bool +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + return false; + + return __blk_mq_sched_bio_merge(q, bio); +} + +static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, + struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.get_rq_priv) + return e->type->ops.mq.get_rq_priv(q, rq, bio); + + return 0; +} + +static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, + struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.put_rq_priv) + e->type->ops.mq.put_rq_priv(q, rq); +} + +static inline bool 
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + + return true; +} + +static inline void +blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.completed_request) + e->type->ops.mq.completed_request(hctx, rq); + + BUG_ON(rq->internal_tag == -1); + + blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); +} + +static inline void blk_mq_sched_started_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.started_request) + e->type->ops.mq.started_request(rq); +} + +static inline void blk_mq_sched_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.mq.requeue_request) + e->type->ops.mq.requeue_request(rq); +} + +static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.mq.has_work) + return e->type->ops.mq.has_work(hctx); + + return false; +} + +static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + } + } +} + +static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +#endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4..295e69670c39 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -122,111 +122,16 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, return res; } -static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], - ctx->rq_dispatched[0]); -} - -static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu\n", ctx->rq_merged); -} - -static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) -{ - return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], - ctx->rq_completed[0]); -} - -static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) -{ - struct request *rq; - int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg); - - list_for_each_entry(rq, list, queuelist) { - const int rq_len = 2 * sizeof(rq) + 2; - - /* if the output will be truncated */ - if (PAGE_SIZE - 1 < len + rq_len) { - /* backspacing if it can't hold '\t...\n' */ - if (PAGE_SIZE - 1 < len + 5) - len -= rq_len; - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t...\n"); - break; - } - len += snprintf(page + len, PAGE_SIZE - 1 - len, - "\t%p\n", rq); - } - - return len; -} - -static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) -{ - ssize_t ret; - - spin_lock(&ctx->lock); - ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); - spin_unlock(&ctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "invoked=%lu, success=%lu\n", 
hctx->poll_invoked, hctx->poll_success); -} - -static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - return sprintf(page, "%lu\n", hctx->queued); -} - -static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return sprintf(page, "%lu\n", hctx->run); -} - -static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, - char *page) -{ - char *start_page = page; - int i; - - page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); - - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); - } - - return page - start_page; -} - -static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, +static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", hctx->tags->nr_tags); } -static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) { - return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); + return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) @@ -247,73 +152,27 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_sysfs_dispatched_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { - .attr = {.name = "merged", .mode = S_IRUGO }, - .show = blk_mq_sysfs_merged_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { - .attr = {.name = "completed", .mode = S_IRUGO }, - .show = blk_mq_sysfs_completed_show, -}; -static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { - .attr = {.name = "rq_list", .mode = S_IRUGO }, - .show = blk_mq_sysfs_rq_list_show, -}; - static struct attribute *default_ctx_attrs[] = { - &blk_mq_sysfs_dispatched.attr, - &blk_mq_sysfs_merged.attr, - &blk_mq_sysfs_completed.attr, - &blk_mq_sysfs_rq_list.attr, NULL, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { - .attr = {.name = "queued", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_queued_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { - .attr = {.name = "run", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_run_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { - .attr = {.name = "dispatched", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_dispatched_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { + .attr = {.name = "nr_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { - .attr = {.name = "active", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_active_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { - .attr = {.name = "pending", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_rq_list_show, -}; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 
- .attr = {.name = "tags", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_tags_show, +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { + .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, - .show = blk_mq_hw_sysfs_poll_show, -}; static struct attribute *default_hw_ctx_attrs[] = { - &blk_mq_hw_sysfs_queued.attr, - &blk_mq_hw_sysfs_run.attr, - &blk_mq_hw_sysfs_dispatched.attr, - &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_tags.attr, + &blk_mq_hw_sysfs_nr_tags.attr, + &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, - &blk_mq_hw_sysfs_active.attr, - &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -380,9 +239,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -396,19 +254,21 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&hctx->kobj); } + blk_mq_debugfs_unregister_hctxs(q); + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -430,10 +290,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -447,6 +305,8 @@ int blk_mq_register_disk(struct gendisk *disk) kobject_uevent(&q->mq_kobj, KOBJ_ADD); + blk_mq_debugfs_register(q, kobject_name(&dev->kobj)); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) @@ -454,7 +314,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -462,7 +322,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { @@ -472,6 +332,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q) if (!q->mq_sysfs_init_done) return; + blk_mq_debugfs_unregister_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -484,6 +346,8 @@ int blk_mq_sysfs_register(struct request_queue *q) if (!q->mq_sysfs_init_done) return ret; + blk_mq_debugfs_register_hctxs(q); + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b..54c84363c1b2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,58 +1,24 @@ /* - * Fast and scalable bitmap tagging variant. 
Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. - */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. 
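For example, with a shared map of 256 tags and 3 active queues, each hctx is allowed up to max(DIV_ROUND_UP(256, 3), 4) = 86 outstanding requests; with 128 active queues the per-queue allowance bottoms out at the 4-tag floor that hctx_may_queue() enforces below.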
*/ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,337 +86,191 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) +static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) { - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. - */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) -{ - unsigned int last_tag, org_last_tag; - int index, i, tag; - - if (!hctx_may_queue(hctx, bt)) + if (!(data->flags & BLK_MQ_REQ_INTERNAL) && + !hctx_may_queue(data->hctx, bt)) return -1; - - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); - - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. 
- */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->depth - 1) - last_tag = 0; - - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt); } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) -{ - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - -static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - struct bt_wait_state *bs; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); + unsigned int tag_offset; + bool drop_ctx; int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_TAG_FAIL; + } + bt = &tags->breserved_tags; + tag_offset = 0; + } else { + bt = &tags->bitmap_tags; + tag_offset = tags->nr_reserved_tags; + } + + tag = __blk_mq_get_tag(data, bt); if (tag != -1) - return tag; + goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) - return -1; + return BLK_MQ_TAG_FAIL; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, data->hctx); + drop_ctx = data->ctx == NULL; do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for - * some to complete. Note that hctx can be NULL here for - * reserved tag allocation. + * some to complete. */ - if (hctx) - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
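Reserved tags come out of the separate breserved_tags map selected above. A sketch of how a driver might pull one for an out-of-band command, assuming its tag set was registered with a non-zero reserved_tags count; the function name and the use of READ as the op are illustration only:

#include <linux/blk-mq.h>
#include <linux/err.h>

static int example_send_oob_cmd(struct request_queue *q)
{
	struct request *rq;

	/* dip into the reserved pool; fail instead of sleeping if it is empty */
	rq = blk_mq_alloc_request(q, READ, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* ... fill in the driver-specific payload and issue the request ... */

	blk_mq_free_request(rq);
	return 0;
}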
*/ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __blk_mq_get_tag(data, bt); if (tag != -1) break; - blk_mq_put_ctx(data->ctx); + if (data->ctx) + blk_mq_put_ctx(data->ctx); io_schedule(); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = data->q->mq_ops->map_queue(data->q, - data->ctx->cpu); - if (data->flags & BLK_MQ_REQ_RESERVED) { - bt = &data->hctx->tags->breserved_tags; - } else { - last_tag = &data->ctx->last_tag; - hctx = data->hctx; - bt = &hctx->tags->bitmap_tags; - } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); + tags = blk_mq_tags_from_data(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + bt = &tags->breserved_tags; + else + bt = &tags->bitmap_tags; + + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, data->hctx); } while (1); - finish_wait(&bs->wait, &wait); - return tag; -} - -static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - int tag; + if (drop_ctx && data->ctx) + blk_mq_put_ctx(data->ctx); - tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); - if (tag >= 0) - return tag + data->hctx->tags->nr_reserved_tags; + finish_wait(&ws->wait, &wait); - return BLK_MQ_TAG_FAIL; +found_tag: + return tag + tag_offset; } -static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag) { - int tag, zero = 0; + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; - if (unlikely(!data->hctx->tags->nr_reserved_tags)) { - WARN_ON_ONCE(1); - return BLK_MQ_TAG_FAIL; + BUG_ON(real_tag >= tags->nr_tags); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } - - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - - return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) -{ - if (data->flags & BLK_MQ_REQ_RESERVED) - return __blk_mq_get_reserved_tag(data); - return __blk_mq_get_tag(data); -} +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; + struct request *rq; - wake_index = bt_index_inc(wake_index); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - return NULL; + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; } -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) { - const int index = TAG_TO_INDEX(bt, tag); - struct 
bt_wait_state *bs; - int wait_cnt; - - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) -{ - struct blk_mq_tags *tags = hctx->tags; - - if (tag >= tags->nr_reserved_tags) { - const int real_tag = tag - tags->nr_reserved_tags; - - BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; - } else { - BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); - } -} +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) { - struct request *rq; - int bit, i; - - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + struct bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; - off += (1 << bt->bits_per_word); - } + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -496,11 +296,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) 
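Both the busy iterators above (via sbitmap_for_each_set()) and the allocation paths below now lean on the generic sbitmap code. A condensed sketch of the sbitmap_queue calls this file uses, with the depth and node values being arbitrary illustration:

#include <linux/sbitmap.h>
#include <linux/smp.h>

static int example_sbq(void)
{
	struct sbitmap_queue sbq;
	int nr;

	/* shift of -1 lets sbitmap pick the per-word size; no round-robin */
	if (sbitmap_queue_init_node(&sbq, 256, -1, false, GFP_KERNEL, NUMA_NO_NODE))
		return -ENOMEM;

	nr = __sbitmap_queue_get(&sbq);		/* returns -1 when the map is exhausted */
	if (nr >= 0)
		sbitmap_queue_clear(&sbq, nr, raw_smp_processor_id());

	sbitmap_queue_wake_all(&sbq);		/* wake any sleepers, e.g. after a resize */
	sbitmap_queue_free(&sbq);
	return 0;
}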
struct blk_mq_tags *tags = set->tags[i]; for (j = 0; j < tags->nr_tags; j++) { - if (!tags->rqs[j]) + if (!tags->static_rqs[j]) continue; ret = set->ops->reinit_request(set->driver_data, - tags->rqs[j]); + tags->static_rqs[j]); if (ret) goto out; } @@ -529,124 +329,35 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } } -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); - } - - return bt->depth - used; -} - -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) -{ - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; -} - -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) -{ - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. 
- */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -666,11 +377,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { - kfree(tags); - return NULL; - } - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; @@ -679,31 +385,61 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); - free_cpumask_var(tags->cpumask); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tagsptr, unsigned int tdepth, + bool can_grow) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + struct blk_mq_tags *tags = *tagsptr; - *tag = prandom_u32() % depth; -} + if (tdepth <= tags->nr_reserved_tags) + return -EINVAL; -int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) -{ tdepth -= tags->nr_reserved_tags; - if (tdepth > tags->nr_tags) - return -EINVAL; /* - * Don't need (or can't) update reserved tags here, they remain - * static and should never need resizing. + * If we are allowed to grow beyond the original size, allocate + * a new set of tags before freeing the old one. */ - bt_update_count(&tags->bitmap_tags, tdepth); - blk_mq_tag_wakeup_all(tags, false); + if (tdepth > tags->nr_tags) { + struct blk_mq_tag_set *set = hctx->queue->tag_set; + struct blk_mq_tags *new; + bool ret; + + if (!can_grow) + return -EINVAL; + + /* + * We need some sort of upper limit, set it high enough that + * no valid use cases should require more. 
+ */ + if (tdepth > 16 * BLKDEV_MAX_RQ) + return -EINVAL; + + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + if (!new) + return -ENOMEM; + ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); + if (ret) { + blk_mq_free_rq_map(new); + return -ENOMEM; + } + + blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); + blk_mq_free_rq_map(*tagsptr); + *tagsptr = new; + } else { + /* + * Don't need (or can't) update reserved tags here, they + * remain static and should never need resizing. + */ + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + } + return 0; } @@ -726,7 +462,7 @@ u32 blk_mq_unique_tag(struct request *rq) int hwq = 0; if (q->mq_ops) { - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); hwq = hctx->queue_num; } @@ -734,25 +470,3 @@ u32 blk_mq_unique_tag(struct request *rq) (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); - -ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) -{ - char *orig_page = page; - unsigned int free, res; - - if (!tags) - return 0; - - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " - "bits_per_word=%u\n", - tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); - - free = bt_unused_tags(&tags->bitmap_tags); - res = bt_unused_tags(&tags->breserved_tags); - - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); - page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); - - return page - orig_page; -} diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a..63497423c5cd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. 
*/ @@ -37,14 +12,12 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; + struct request **static_rqs; struct list_head page_list; - - int alloc_policy; - cpumask_var_t cpumask; }; @@ -52,15 +25,24 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, + struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); -extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); -extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, + unsigned int depth, bool can_grow); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index c207fa9870eb..b29e7dc7b309 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include <linux/sched/sysctl.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/prefetch.h> #include <trace/events/block.h> @@ -29,53 +30,37 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" +#include "blk-wbt.h" +#include "blk-mq-sched.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) -{ - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; -} - -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; + return sbitmap_any_bit_set(&hctx->ctx_map) || + !list_empty_careful(&hctx->dispatch) || + blk_mq_sched_has_work(hctx); } -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, 
ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -135,6 +120,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/** + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. Additionally, it is not prevented that + * new queue_rq() calls occur unless the queue has been stopped first. + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + blk_mq_stop_hw_queues(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(&hctx->queue_rq_srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -158,18 +170,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op) { - if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; + if (blk_queue_io_stat(q)) + rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); @@ -189,13 +199,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->special = NULL; /* tag was already set */ rq->errors = 0; - - rq->cmd = rq->__cmd; - rq->extra_len = 0; - rq->sense_len = 0; - rq->resid_len = 0; - rq->sense = NULL; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -204,65 +208,60 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } +EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op) { struct request *rq; unsigned int tag; tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->rqs[tag]; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); - if (blk_mq_tag_busy(data->hctx)) { - rq->cmd_flags = REQ_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + rq = tags->static_rqs[tag]; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; } - rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } return NULL; } +EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_ctx *ctx; - struct blk_mq_hw_ctx *hctx; + struct 
blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; int ret; ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); + blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } - blk_mq_put_ctx(ctx); - if (!rq) { - blk_queue_exit(q); + if (!rq) return ERR_PTR(-EWOULDBLOCK); - } rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -308,7 +307,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -322,38 +321,45 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct request *rq) +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { - const int tag = rq->tag; + const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; - if (rq->cmd_flags & REQ_MQ_INFLIGHT) + if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); - rq->cmd_flags = 0; + + wbt_done(q->rq_wb, &rq->issue_stat); + rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); +} +void blk_mq_finish_request(struct request *rq) +{ + blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } -EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; - - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); - blk_mq_free_hctx_request(hctx, rq); + blk_mq_sched_put_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -362,6 +368,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->issue_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -412,10 +419,27 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + if (rq->rq_flags & RQF_STATS) { + /* + * We could rq->mq_ctx here, but there's less of a risk + * of races if we have the completion event 
add the stats + * to the local software queue. + */ + struct blk_mq_ctx *ctx; + + ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); + blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + } +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -453,11 +477,15 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_sched_started_request(rq); + trace_block_rq_issue(q, rq); - rq->resid_len = blk_rq_bytes(rq); - if (unlikely(blk_bidi_rq(rq))) - rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + blk_stat_set_issue_time(&rq->issue_stat); + rq->rq_flags |= RQF_STATS; + wbt_issue(q->rq_wb, &rq->issue_stat); + } blk_add_timer(rq); @@ -494,6 +522,8 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); + blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -501,19 +531,19 @@ static void __blk_mq_requeue_request(struct request *rq) } } -void blk_mq_requeue_request(struct request *rq) +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { __blk_mq_requeue_request(rq); BUG_ON(blk_queued_rq(rq)); - blk_mq_add_to_requeue_list(rq, true); + blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -523,28 +553,25 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irqrestore(&q->requeue_lock, flags); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + if (!(rq->rq_flags & RQF_SOFTBARRIER)) continue; - rq->cmd_flags &= ~REQ_SOFTBARRIER; + rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false, true); } - /* - * Use the start variant of queue running here, so that running - * the requeue work will kick stopped queues. - */ - blk_mq_start_hw_queues(q); + blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; @@ -553,30 +580,35 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) * We abuse this flag that is otherwise used by the I/O scheduler to * request head insertation from the workqueue. 
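Because queueing and kicking are now decoupled, a driver that hits a transient resource shortage can park a started request and ask for a delayed re-run of the requeue list. A small sketch; the 100 ms delay is an arbitrary value picked for illustration:

#include <linux/blk-mq.h>

/* in a driver's error/completion handling, for a request it wants retried */
static void example_retry_later(struct request *rq)
{
	blk_mq_requeue_request(rq, false);		/* queue it, do not kick yet */
	blk_mq_delay_kick_requeue_list(rq->q, 100);	/* re-run in ~100 ms */
}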
*/ - BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); } spin_unlock_irqrestore(&q->requeue_lock, flags); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_add_to_requeue_list); -void blk_mq_cancel_requeue_work(struct request_queue *q) +void blk_mq_kick_requeue_list(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } -EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); +EXPORT_SYMBOL(blk_mq_kick_requeue_list); -void blk_mq_kick_requeue_list(struct request_queue *q) +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); } -EXPORT_SYMBOL(blk_mq_kick_requeue_list); +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); void blk_mq_abort_requeue_list(struct request_queue *q) { @@ -600,8 +632,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } @@ -614,7 +648,7 @@ struct blk_mq_timeout_data { void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_ops *ops = req->q->mq_ops; + const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* @@ -729,7 +763,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q, int checked = 8; list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { - int el_ret; + bool merged = false; if (!checked--) break; @@ -737,98 +771,147 @@ static bool blk_mq_attempt_merge(struct request_queue *q, if (!blk_rq_merge_ok(rq, bio)) continue; - el_ret = blk_try_merge(rq, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - if (bio_attempt_back_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(q, rq, bio); break; - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - if (bio_attempt_front_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(q, rq, bio); break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; } + + if (merged) + ctx->rq_merged++; + return merged; } return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head 
*list) { - struct blk_mq_ctx *ctx; - int i; + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); - if (!bm->word) - continue; +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); +} - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait) +{ + struct blk_mq_alloc_data data = { + .q = rq->q, + .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + }; - bit++; - } while (1); + if (rq->tag != -1) { +done: + if (hctx) + *hctx = data.hctx; + return true; + } + + rq->tag = blk_mq_get_tag(&data); + if (rq->tag >= 0) { + if (blk_mq_tag_busy(data.hctx)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&data.hctx->nr_active); + } + data.hctx->tags->rqs[rq->tag] = rq; + goto done; } + + return false; } -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) { - struct request_queue *q = hctx->queue; - struct request *rq; - LIST_HEAD(rq_list); - LIST_HEAD(driver_list); - struct list_head *dptr; - int queued; - - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (rq->tag == -1 || rq->internal_tag == -1) return; - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; - hctx->run++; + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} - /* - * Touch any software queue that has pending entries. - */ - flush_busy_ctxs(hctx, &rq_list); +/* + * If we fail getting a driver tag because all the driver tags are already + * assigned and on the dispatch list, BUT the first entry does not have a + * tag, then we could deadlock. For that case, move entries with assigned + * driver tags to the front, leaving the set of tagged requests in the + * same order, and the untagged set in the same order. + */ +static bool reorder_tags_to_front(struct list_head *list) +{ + struct request *rq, *tmp, *first = NULL; - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. 
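Concretely, if the list holds A (no tag), B (tagged), C (no tag), D (tagged), the reordering below yields B, D, A, C: tagged requests move to the front while both the tagged and the untagged subsets keep their relative order, so whenever at least one request owns a driver tag it ends up at the head and dispatch can make progress.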
- */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); + list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { + if (rq == first) + break; + if (rq->tag != -1) { + list_move(&rq->queuelist, list); + if (!first) + first = rq; + } } + return first != NULL; +} + +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ + struct request_queue *q = hctx->queue; + struct request *rq; + LIST_HEAD(driver_list); + struct list_head *dptr; + int queued, ret = BLK_MQ_RQ_QUEUE_OK; + /* * Start off with dptr being NULL, so we start the first request * immediately, even if we have more pending. @@ -839,16 +922,29 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * Now process all the entries, sending them to the driver. */ queued = 0; - while (!list_empty(&rq_list)) { + while (!list_empty(list)) { struct blk_mq_queue_data bd; - int ret; - rq = list_first_entry(&rq_list, struct request, queuelist); + rq = list_first_entry(list, struct request, queuelist); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!queued && reorder_tags_to_front(list)) + continue; + + /* + * We failed getting a driver tag. Mark the queue(s) + * as needing a restart. Retry getting a tag again, + * in case the needed IO completed right before we + * marked the queue as needing a restart. + */ + blk_mq_sched_mark_restart(hctx); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; + } list_del_init(&rq->queuelist); bd.rq = rq; bd.list = dptr; - bd.last = list_empty(&rq_list); + bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -856,7 +952,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - list_add(&rq->queuelist, &rq_list); + blk_mq_put_driver_tag(hctx, rq); + list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; default: @@ -874,23 +971,21 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * We've done the first request. If we have more than 1 * left in the list, set dptr to defer issue. */ - if (!dptr && rq_list.next != rq_list.prev) + if (!dptr && list->next != list->prev) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ - if (!list_empty(&rq_list)) { + if (!list_empty(list)) { spin_lock(&hctx->lock); - list_splice(&rq_list, &hctx->dispatch); + list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but * it's possible the queue is stopped and restarted again @@ -899,8 +994,32 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) * the requests in rq_list might get lost. * * blk_mq_run_hw_queue() already checks the STOPPED bit - **/ - blk_mq_run_hw_queue(hctx, true); + * + * If RESTART is set, then let completion restart the queue + * instead of potentially looping here. 
+ */ + if (!blk_mq_sched_needs_restart(hctx)) + blk_mq_run_hw_queue(hctx, true); + } + + return queued != 0; +} + +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + int srcu_idx; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)); + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_sched_dispatch_requests(hctx); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + blk_mq_sched_dispatch_requests(hctx); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -916,7 +1035,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int cpu = hctx->next_cpu, next_cpu; + int next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -924,8 +1043,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; - - return cpu; } return hctx->next_cpu; @@ -933,11 +1050,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || - !blk_mq_hw_queue_mapped(hctx))) + if (unlikely(blk_mq_hctx_stopped(hctx) || + !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); @@ -948,8 +1065,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -958,9 +1074,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if ((!blk_mq_hctx_has_pending(hctx) && - list_empty_careful(&hctx->dispatch)) || - test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + if (!blk_mq_hctx_has_pending(hctx) || + blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); @@ -968,9 +1083,29 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queues); +/** + * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped + * @q: request queue. + * + * The caller is responsible for serializing this function against + * blk_mq_{start,stop}_hw_queue(). 
+ */ +bool blk_mq_queue_stopped(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hctx_stopped(hctx)) + return true; + + return false; +} +EXPORT_SYMBOL(blk_mq_queue_stopped); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1004,18 +1139,23 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); +void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (!blk_mq_hctx_stopped(hctx)) + return; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, async); +} +EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); + void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) - continue; - - clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, async); - } + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_stopped_hw_queue(hctx, async); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -1023,7 +1163,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1043,6 +1183,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + blk_mq_stop_hw_queue(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } @@ -1062,8 +1203,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -1071,36 +1212,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, blk_mq_hctx_mark_pending(hctx, ctx); } -void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static void blk_mq_insert_requests(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct list_head *list, - int depth, - bool from_schedule) +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) { - struct blk_mq_hw_ctx *hctx; - - trace_block_unplug(q, depth, !from_schedule); - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1116,8 +1231,6 @@ static void blk_mq_insert_requests(struct request_queue *q, } blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, from_schedule); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1153,9 +1266,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool 
from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, - &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, + &ctx_list, + from_schedule); } this_ctx = rq->mq_ctx; @@ -1172,8 +1286,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + from_schedule); } } @@ -1181,7 +1296,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - blk_account_io_start(rq, 1); + blk_account_io_start(rq, true); } static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) @@ -1211,68 +1326,38 @@ insert_rq: } spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); return true; } } -struct blk_map_ctx { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; -}; - -static struct request *blk_mq_map_request(struct request_queue *q, - struct bio *bio, - struct blk_map_ctx *data) +static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; - struct blk_mq_alloc_data alloc_data; - - blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); + if (rq->tag != -1) + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } - - hctx->queued++; - data->hctx = hctx; - data->ctx = ctx; - return rq; + return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { - int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, - rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, .last = 1 }; - blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + struct blk_mq_hw_ctx *hctx; + blk_qc_t new_cookie; + int ret; + + if (q->elevator) + goto insert; + + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + goto insert; + + new_cookie = request_to_qc_t(hctx, rq); /* * For OK queue, we are done. For error, kill it. 
Any other @@ -1282,7 +1367,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) { *cookie = new_cookie; - return 0; + return; } __blk_mq_requeue_request(rq); @@ -1291,10 +1376,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) *cookie = BLK_QC_T_NONE; rq->errors = -EIO; blk_mq_end_request(rq, rq->errors); - return 0; + return; } - return -1; +insert: + blk_mq_sched_insert_request(rq, false, true, true, false); } /* @@ -1304,14 +1390,15 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_map_ctx data; + const int is_sync = op_is_sync(bio->bi_opf); + const int is_flush_fua = op_is_flush(bio->bi_opf); + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; - unsigned int request_count = 0; + unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1326,13 +1413,26 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; - rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); + return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); + + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1351,7 +1451,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * We do limited pluging. If the bio can be merged, do that. + * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. So the plug list will have one request at most */ @@ -1371,12 +1471,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - if (!blk_mq_direct_issue_request(old_rq, &cookie)) - goto done; - blk_mq_insert_request(old_rq, false, true, true); + + if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_try_issue_directly(old_rq, &cookie); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); + blk_mq_try_issue_directly(old_rq, &cookie); + srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + } goto done; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. 
For @@ -1398,13 +1513,14 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_sync = op_is_sync(bio->bi_opf); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_map_ctx data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1421,13 +1537,26 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); - rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + wbt_track(&rq->issue_stat, wb_acct); + + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1440,13 +1569,25 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) */ plug = current->plug; if (plug) { + struct request *last = NULL; + blk_mq_bio_to_request(rq, bio); + + /* + * @request_count may become stale because of schedule + * out, so check the list again. + */ + if (list_empty(&plug->mq_list)) + request_count = 0; if (!request_count) trace_block_plug(q); + else + last = list_entry_rq(plug->mq_list.prev); blk_mq_put_ctx(data.ctx); - if (request_count >= BLK_MAX_REQUEST_COUNT) { + if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); } @@ -1455,6 +1596,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) return cookie; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1467,20 +1616,12 @@ run_queue: } blk_mq_put_ctx(data.ctx); +done: return cookie; } -/* - * Default mapping to a software queue, since we use one per CPU. 
- */ -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) -{ - return q->queue_hw_ctx[q->mq_map[cpu]]; -} -EXPORT_SYMBOL(blk_mq_map_queue); - -static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1488,11 +1629,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, int i; for (i = 0; i < tags->nr_tags; i++) { - if (!tags->rqs[i]) + struct request *rq = tags->static_rqs[i]; + + if (!rq) continue; - set->ops->exit_request(set->driver_data, tags->rqs[i], + set->ops->exit_request(set->driver_data, rq, hctx_idx, i); - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; } } @@ -1506,49 +1649,73 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, kmemleak_free(page_address(page)); __free_pages(page, page->private); } +} +void blk_mq_free_rq_map(struct blk_mq_tags *tags) +{ kfree(tags->rqs); + tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; blk_mq_free_tags(tags); } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - -static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->numa_node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; - INIT_LIST_HEAD(&tags->page_list); - - tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), - GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, set->numa_node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; } + tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags); + return NULL; + } + + return tags; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + INIT_LIST_HEAD(&tags->page_list); + /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); - left = rq_size * set->queue_depth; + left = rq_size * depth; - for (i = 0; i < set->queue_depth; ) { + for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; @@ -1559,7 +1726,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, do { page = alloc_pages_node(set->numa_node, - GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; @@ -1580,17 +1747,19 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). 
*/ - kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, set->queue_depth - i); + to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - tags->rqs[i] = p; + struct request *rq = p; + + tags->static_rqs[i] = rq; if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, - tags->rqs[i], hctx_idx, i, + rq, hctx_idx, i, set->numa_node)) { - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; goto fail; } } @@ -1599,37 +1768,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; + return 0; fail: - blk_mq_free_rq_map(set, tags, hctx_idx); - return NULL; -} - -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; } /* @@ -1637,11 +1780,13 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) * software queue to the hw queue dispatch list, and ensure that it * gets run. */ -static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { + struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); spin_lock(&ctx->lock); @@ -1652,30 +1797,20 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return NOTIFY_OK; + return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); - return NOTIFY_OK; + return 0; } -static int blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx = data; - - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - return blk_mq_hctx_cpu_offline(hctx, cpu); - - /* - * In case of CPU online, tags may be reallocated - * in blk_mq_map_swqueue() after mapping is updated. 
- */ - - return NOTIFY_OK; + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); } /* hctx->ctxs will be freed in queue's release handler */ @@ -1695,9 +1830,12 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + if (hctx->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(&hctx->queue_rq_srcu); + + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1734,7 +1872,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); @@ -1742,9 +1880,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -1757,7 +1893,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1776,6 +1913,9 @@ static int blk_mq_init_hctx(struct request_queue *q, flush_start_tag + hctx_idx, node)) goto free_fq; + if (hctx->flags & BLK_MQ_F_BLOCKING) + init_srcu_struct(&hctx->queue_rq_srcu); + return 0; free_fq: @@ -1784,12 +1924,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - + blk_mq_remove_cpuhp(hctx); return -1; } @@ -1807,12 +1946,14 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[BLK_STAT_READ]); + blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) continue; - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); /* * Set local node, IFF we have more than one hw queue. 
If @@ -1823,10 +1964,39 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } +static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +{ + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + } +} + static void blk_mq_map_swqueue(struct request_queue *q, const struct cpumask *online_mask) { - unsigned int i; + unsigned int i, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -1849,8 +2019,21 @@ static void blk_mq_map_swqueue(struct request_queue *q, if (!cpumask_test_cpu(i, online_mask)) continue; + hctx_idx = q->mq_map[i]; + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_rq_map(set, hctx_idx)) { + /* + * If tags initialization fail for some hctx, + * that hctx won't be brought online. In this + * case, remap the current ctx to hctx[0] which + * is guaranteed to always have tags allocated + */ + q->mq_map[i] = 0; + } + ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; @@ -1860,34 +2043,31 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { - if (set->tags[i]) { - blk_mq_free_rq_map(set, set->tags[i], i); - set->tags[i] = NULL; - } + /* Never unmap queue 0. We need it as a + * fallback in case of a new remap fails + * allocation + */ + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + hctx->tags = NULL; continue; } - /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[i]) - set->tags[i] = blk_mq_init_rq_map(set, i); hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); - cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. 
*/ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts @@ -1967,6 +2147,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; + blk_mq_sched_teardown(q); + /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) @@ -1975,7 +2157,6 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } - kfree(q->mq_map); q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2042,10 +2223,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) { - blk_mq_free_rq_map(set, hctx->tags, j); - set->tags[j] = NULL; - } + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); free_cpumask_var(hctx->cpumask); kobject_put(&hctx->kobj); @@ -2074,9 +2253,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_percpu; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; + q->mq_map = set->mq_map; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2094,7 +2271,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -2108,6 +2285,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->nr_requests = set->queue_depth; + /* + * Default to classic polling + */ + q->poll_nsec = -1; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -2123,11 +2305,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, mutex_unlock(&all_q_mutex); put_online_cpus(); + if (!(set->flags & BLK_MQ_F_NO_SCHED)) { + int ret; + + ret = blk_mq_sched_init(q); + if (ret) + return ERR_PTR(ret); + } + return q; err_hctxs: - kfree(q->mq_map); -err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); @@ -2145,6 +2333,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); @@ -2159,8 +2349,6 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); - /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe * we should change hctx numa_node according to new topology (this @@ -2172,50 +2360,18 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_register(q); } -static int blk_mq_queue_reinit_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +/* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ +static struct cpumask cpuhp_online_new; + +static void blk_mq_queue_reinit_work(void) { struct request_queue *q; - int cpu = (unsigned long)hcpu; - /* - * New online cpumask which is going to be set in this hotplug event. - * Declare this cpumasks as global as cpu-hotplug operation is invoked - * one-by-one and dynamically allocating this could result in a failure. 
- */ - static struct cpumask online_new; - - /* - * Before hotadded cpu starts handling requests, new mappings must - * be established. Otherwise, these requests in hw queue might - * never be dispatched. - * - * For example, there is a single hw queue (hctx) and two CPU queues - * (ctx0 for CPU0, and ctx1 for CPU1). - * - * Now CPU1 is just onlined and a request is inserted into - * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is - * still zero. - * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is - * set in pending bitmap and tries to retrieve requests in - * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, - * so the request in ctx1->rq_list is ignored. - */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - case CPU_UP_CANCELED: - cpumask_copy(&online_new, cpu_online_mask); - break; - case CPU_UP_PREPARE: - cpumask_copy(&online_new, cpu_online_mask); - cpumask_set_cpu(cpu, &online_new); - break; - default: - return NOTIFY_OK; - } mutex_lock(&all_q_mutex); - /* * We need to freeze and reinit all existing queues. Freezing * involves synchronous wait for an RCU grace period and doing it @@ -2225,41 +2381,62 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_start(q); - list_for_each_entry(q, &all_q_list, all_q_node) { + list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); - /* - * timeout handler can't touch hw queue during the - * reinitialization - */ - del_timer_sync(&q->timeout); - } - list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q, &online_new); + blk_mq_queue_reinit(q, &cpuhp_online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); mutex_unlock(&all_q_mutex); - return NOTIFY_OK; +} + +static int blk_mq_queue_reinit_dead(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + blk_mq_queue_reinit_work(); + return 0; +} + +/* + * Before hotadded cpu starts handling requests, new mappings must be + * established. Otherwise, these requests in hw queue might never be + * dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues (ctx0 + * for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list + * and set bit0 in pending bitmap as ctx1->index_hw is still zero. + * + * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is + * ignored. 
+ */ +static int blk_mq_queue_reinit_prepare(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &cpuhp_online_new); + blk_mq_queue_reinit_work(); + return 0; } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) + for (i = 0; i < set->nr_hw_queues; i++) + if (!__blk_mq_alloc_rq_map(set, i)) goto out_unwind; - } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); + blk_mq_free_rq_map(set->tags[i]); return -ENOMEM; } @@ -2299,12 +2476,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) -{ - return tags->cpumask; -} -EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2313,6 +2484,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + int ret; + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) @@ -2322,7 +2495,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { @@ -2351,17 +2524,35 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; - if (blk_mq_alloc_rq_maps(set)) - goto enomem; + ret = -ENOMEM; + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); + if (!set->mq_map) + goto out_free_tags; + + if (set->ops->map_queues) + ret = set->ops->map_queues(set); + else + ret = blk_mq_map_queues(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_rq_maps(set); + if (ret) + goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -enomem: + +out_free_mq_map: + kfree(set->mq_map); + set->mq_map = NULL; +out_free_tags: kfree(set->tags); set->tags = NULL; - return -ENOMEM; + return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -2369,10 +2560,11 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < nr_cpu_ids; i++) { - if (set->tags[i]) - blk_mq_free_rq_map(set, set->tags[i], i); - } + for (i = 0; i < nr_cpu_ids; i++) + blk_mq_free_map_and_requests(set, i); + + kfree(set->mq_map); + set->mq_map = NULL; kfree(set->tags); set->tags = NULL; @@ -2385,14 +2577,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_hw_ctx *hctx; int i, ret; - if (!set || nr > set->queue_depth) + if (!set) return -EINVAL; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - ret = blk_mq_tag_update_depth(hctx->tags, nr); + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. 
+ */ + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, + min(nr, set->queue_depth), + false); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } if (ret) break; } @@ -2400,6 +2606,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; } @@ -2419,10 +2628,14 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + /* + * Manually set the make_request_fn as blk_queue_make_request + * resets a lot of the queue settings. + */ if (q->nr_hw_queues > 1) - blk_queue_make_request(q, blk_mq_make_request); + q->make_request_fn = blk_mq_make_request; else - blk_queue_make_request(q, blk_sq_make_request); + q->make_request_fn = blk_sq_make_request; blk_mq_queue_reinit(q, cpu_online_mask); } @@ -2432,6 +2645,168 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static unsigned long blk_mq_poll_nsecs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_rq_stat stat[2]; + unsigned long ret = 0; + + /* + * If stats collection isn't on, don't sleep but turn it on for + * future users + */ + if (!blk_stat_enable(q)) + return 0; + + /* + * We don't have to do this once per IO, should optimize this + * to just use the current window of stats until it changes + */ + memset(&stat, 0, sizeof(stat)); + blk_hctx_stat_get(hctx, stat); + + /* + * As an optimistic guess, use half of the mean service time + * for this type of request. We can (and should) make this smarter. + * For instance, if the completion latencies are tight, we can + * get closer than just half the mean. This is especially + * important on devices where the completion latencies are longer + * than ~10 usec. + */ + if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) + ret = (stat[BLK_STAT_READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) + ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + + return ret; +} + +static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct hrtimer_sleeper hs; + enum hrtimer_mode mode; + unsigned int nsecs; + ktime_t kt; + + if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + /* + * poll_nsec can be: + * + * -1: don't ever hybrid sleep + * 0: use half of prev avg + * >0: use this specific value + */ + if (q->poll_nsec == -1) + return false; + else if (q->poll_nsec > 0) + nsecs = q->poll_nsec; + else + nsecs = blk_mq_poll_nsecs(q, hctx, rq); + + if (!nsecs) + return false; + + set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + + /* + * This will be replaced with the stats tracking code, using + * 'avg_completion_time / 2' as the pre-sleep target. 
+ */ + kt = nsecs; + + mode = HRTIMER_MODE_REL; + hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&hs.timer, kt); + + hrtimer_init_sleeper(&hs, current); + do { + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_start_expires(&hs.timer, mode); + if (hs.task) + io_schedule(); + hrtimer_cancel(&hs.timer); + mode = HRTIMER_MODE_ABS; + } while (hs.task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&hs.timer); + return true; +} + +static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct request_queue *q = hctx->queue; + long state; + + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. + */ + if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) + return true; + + hctx->poll_considered++; + + state = current->state; + while (!need_resched()) { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, rq->tag); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_plug *plug; + struct request *rq; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); + + return __blk_mq_poll(hctx, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_poll); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); @@ -2444,10 +2819,12 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { - blk_mq_cpu_init(); - - hotcpu_notifier(blk_mq_queue_reinit_notify, 0); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, + blk_mq_hctx_notify_dead); + cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare", + blk_mq_queue_reinit_prepare, + blk_mq_queue_reinit_dead); return 0; } subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..24b2256186f3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -12,14 +14,13 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long rq_dispatched[2]; unsigned long rq_merged; /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; @@ -30,28 +31,49 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void 
blk_mq_wake_waiters(struct request_queue *q); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait); + +/* + * Internal helpers for allocating/freeing the request map + */ +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags); +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth); /* + * Internal helpers for request insertion into sw queues + */ +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head); +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list); +/* * CPU hotplug helpers */ -struct blk_mq_cpu_notifier; -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data); -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_cpu_init(void); void blk_mq_enable_hotplug(void); void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} + /* * sysfs helpers */ @@ -59,19 +81,39 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +/* + * debugfs helpers + */ +#ifdef CONFIG_BLK_DEBUG_FS +int blk_mq_debugfs_register(struct request_queue *q, const char *name); +void blk_mq_debugfs_unregister(struct request_queue *q); +int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); +#else +static inline int blk_mq_debugfs_register(struct request_queue *q, + const char *name) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister(struct request_queue *q) +{ +} + +static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ +} +#endif + extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. 
- */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { @@ -114,6 +156,30 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) +{ + if (data->flags & BLK_MQ_REQ_INTERNAL) + return data->hctx->sched_tags; + + return data->hctx->tags; +} + +/* + * Internal helpers for request allocation/init/free + */ +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op); +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq); +void blk_mq_finish_request(struct request *rq); +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op); + +static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); +} + static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..1e7174ffc9d4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include "blk.h" +#include "blk-wbt.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -87,6 +88,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_discard_segments = 1; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; @@ -95,6 +97,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -107,6 +110,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->cluster = 1; + lim->zoned = BLK_ZONED_NONE; } EXPORT_SYMBOL(blk_set_default_limits); @@ -125,11 +129,13 @@ void blk_set_stacking_limits(struct queue_limits *lim) /* Inherit limits from component devices */ lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; + lim->max_discard_segments = 1; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -249,6 +255,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -298,6 +305,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_write_same_sectors); /** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int 
max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + +/** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments @@ -319,6 +339,22 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments EXPORT_SYMBOL(blk_queue_max_segments); /** + * blk_queue_max_discard_segments - set max segments for discard requests + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * segments in a discard request. + **/ +void blk_queue_max_discard_segments(struct request_queue *q, + unsigned short max_segments) +{ + q->limits.max_discard_segments = max_segments; +} +EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments); + +/** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg * @q: the request queue for the device * @max_size: max size of segment in bytes @@ -525,6 +561,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, @@ -533,6 +571,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_discard_segments = min_not_zero(t->max_discard_segments, + b->max_discard_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, b->max_integrity_segments); @@ -630,6 +670,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + if (b->chunk_sectors) + t->chunk_sectors = min_not_zero(t->chunk_sectors, + b->chunk_sectors); + return ret; } EXPORT_SYMBOL(blk_stack_limits); @@ -832,6 +876,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); /** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); +} +EXPORT_SYMBOL(blk_set_queue_depth); + +/** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device * @wc: write back cache on or off @@ -851,6 +908,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..06cf9807f49a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); * Softirq action handler - move entries to local list and loop over them * while passing them to the queue registered handler. 
*/ -static void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { struct list_head *cpu_list, local_list; @@ -78,30 +78,21 @@ static int raise_blk_irq(int cpu, struct request *rq) } #endif -static int blk_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int blk_softirq_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block blk_cpu_notifier = { - .notifier_call = blk_cpu_notify, -}; - void __blk_complete_request(struct request *req) { int ccpu, cpu; @@ -180,7 +171,9 @@ static __init int blk_softirq_init(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - register_hotcpu_notifier(&blk_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); return 0; } subsys_initcall(blk_softirq_init); diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000000..9b43efb8933f --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,256 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/blk-mq.h> + +#include "blk-stat.h" +#include "blk-mq.h" + +static void blk_stat_flush_batch(struct blk_rq_stat *stat) +{ + const s32 nr_batch = READ_ONCE(stat->nr_batch); + const s32 nr_samples = READ_ONCE(stat->nr_samples); + + if (!nr_batch) + return; + if (!nr_samples) + stat->mean = div64_s64(stat->batch, nr_batch); + else { + stat->mean = div64_s64((stat->mean * nr_samples) + + stat->batch, + nr_batch + nr_samples); + } + + stat->nr_samples += nr_batch; + stat->nr_batch = stat->batch = 0; +} + +static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + blk_stat_flush_batch(src); + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + uint64_t latest = 0; + int i, j, nr; + + blk_stat_init(&dst[BLK_STAT_READ]); + blk_stat_init(&dst[BLK_STAT_WRITE]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + if (newest > latest) + 
latest = newest; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. + */ + } while (!nr); + + dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); + memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], + sizeof(struct blk_rq_stat)); + memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], + sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. 
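As an aside on blk_stat_sum() earlier in this file: merging two sample windows only needs the running mean, the min/max and the sample counts of each side. A stand-alone sketch of the same weighted-mean merge (plain C, illustrative only, not part of the patch):

#include <stdint.h>

struct stat_win {
    int64_t mean, min, max, nr_samples;
};

/* Combine 'src' into 'dst' the way blk_stat_sum() does: min/max are
 * simple comparisons, the mean is re-weighted by the sample counts.
 * 'dst' is assumed to start out as { 0, INT64_MAX, 0, 0 }. */
static void stat_merge(struct stat_win *dst, const struct stat_win *src)
{
    if (!src->nr_samples)
        return;

    dst->min = src->min < dst->min ? src->min : dst->min;
    dst->max = src->max > dst->max ? src->max : dst->max;

    if (!dst->nr_samples)
        dst->mean = src->mean;
    else
        dst->mean = (src->mean * src->nr_samples +
                     dst->mean * dst->nr_samples) /
                    (dst->nr_samples + src->nr_samples);

    dst->nr_samples += src->nr_samples;
}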
+ * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; + stat->time = time_now & BLK_STAT_NSEC_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) +{ + return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); +} + +bool blk_stat_is_current(struct blk_rq_stat *stat) +{ + return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + + value = now - blk_stat_time(&rq->issue_stat); + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); + + stat->batch += value; + stat->nr_batch++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + } + } else { + blk_stat_init(&q->rq_stats[BLK_STAT_READ]); + blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); + } +} + +void blk_stat_set_issue_time(struct blk_issue_stat *stat) +{ + stat->time = (stat->time & BLK_STAT_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); +} + +/* + * Enable stat tracking, return whether it was enabled + */ +bool blk_stat_enable(struct request_queue *q) +{ + if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + return false; + } + + return true; +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000000..a2050a0a5314 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,42 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) + +/* + * Upper 3 bits can be used elsewhere + */ +#define BLK_STAT_RES_BITS 3 +#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) +#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK + +enum { + BLK_STAT_READ = 0, + BLK_STAT_WRITE, +}; + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *); +void blk_stat_init(struct blk_rq_stat *); +bool blk_stat_is_current(struct blk_rq_stat *); +void blk_stat_set_issue_time(struct blk_issue_stat *); +bool blk_stat_enable(struct request_queue *); + +static inline u64 __blk_stat_time(u64 time) +{ + return time & BLK_STAT_TIME_MASK; +} + +static inline u64 blk_stat_time(struct blk_issue_stat *stat) +{ + return __blk_stat_time(stat->time); +} + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f87a7e747d36..002af836aa87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 
+13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -75,7 +89,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info.ra_pages << + unsigned long ra_kb = q->backing_dev_info->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, (page)); @@ -90,7 +104,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10); + q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -107,6 +121,12 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_discard_segments_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_max_discard_segments(q), (page)); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, (page)); @@ -130,6 +150,11 @@ static ssize_t queue_physical_block_size_show(struct request_queue *q, char *pag return queue_var_show(queue_physical_block_size(q), page); } +static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.chunk_sectors, page); +} + static ssize_t queue_io_min_show(struct request_queue *q, char *page) { return queue_var_show(queue_io_min(q), page); @@ -192,6 +217,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_same_sectors << 9); } +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) @@ -212,6 +242,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(q->queue_lock); return ret; @@ -257,6 +288,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); #undef QUEUE_SYSFS_BIT_FNS +static ssize_t queue_zoned_show(struct request_queue *q, char *page) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + return sprintf(page, "host-aware\n"); + case BLK_ZONED_HM: + return sprintf(page, "host-managed\n"); + default: + return sprintf(page, "none\n"); + } +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -319,6 +362,38 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) +{ + int val; + + if (q->poll_nsec == -1) + val = -1; + else + val = q->poll_nsec / 
1000; + + return sprintf(page, "%d\n", val); +} + +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, + size_t count) +{ + int err, val; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; + + if (val == -1) + q->poll_nsec = -1; + else + q->poll_nsec = val * 1000; + + return count; +} + static ssize_t queue_poll_show(struct request_queue *q, char *page) { return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); @@ -347,6 +422,50 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_wb *rwb; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rwb = q->rq_wb; + if (!rwb) { + ret = wbt_init(q); + if (ret) + return ret; + + rwb = q->rq_wb; + if (!rwb) + return -EINVAL; + } + + if (val == -1) + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + else if (val >= 0) + rwb->min_lat_nsec = val * 1000ULL; + + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->enable_state = WBT_STATE_ON_MANUAL; + + wbt_update_limits(rwb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -384,6 +503,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -412,6 +551,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_discard_segments_entry = { + .attr = {.name = "max_discard_segments", .mode = S_IRUGO }, + .show = queue_max_discard_segments_show, +}; + static struct queue_sysfs_entry queue_max_integrity_segments_entry = { .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, .show = queue_max_integrity_segments_show, @@ -443,6 +587,11 @@ static struct queue_sysfs_entry queue_physical_block_size_entry = { .show = queue_physical_block_size_show, }; +static struct queue_sysfs_entry queue_chunk_sectors_entry = { + .attr = {.name = "chunk_sectors", .mode = S_IRUGO }, + .show = queue_chunk_sectors_show, +}; + static struct queue_sysfs_entry queue_io_min_entry = { .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, .show = queue_io_min_show, @@ -479,12 +628,22 @@ static struct queue_sysfs_entry queue_write_same_max_entry = { .show = queue_write_same_max_show, }; +static struct queue_sysfs_entry queue_write_zeroes_max_entry = { + 
.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .show = queue_write_zeroes_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, .store = queue_store_nonrot, }; +static struct queue_sysfs_entry queue_zoned_entry = { + .attr = {.name = "zoned", .mode = S_IRUGO }, + .show = queue_zoned_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -515,6 +674,12 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_poll_delay_entry = { + .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_delay_show, + .store = queue_poll_delay_store, +}; + static struct queue_sysfs_entry queue_wc_entry = { .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, .show = queue_wc_show, @@ -526,18 +691,31 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, + &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, @@ -545,7 +723,9 @@ static struct attribute *default_attrs[] = { &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, &queue_nonrot_entry.attr, + &queue_zoned_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -553,6 +733,9 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, + &queue_stats_entry.attr, + &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, NULL, }; @@ -627,7 +810,8 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); - bdi_exit(&q->backing_dev_info); + wbt_exit(q); + bdi_put(q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -642,13 +826,19 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (!q->mq_ops) + if (!q->mq_ops) { + if (q->exit_rq_fn) + q->exit_rq_fn(q, q->fq->flush_rq); blk_free_flush_queue(q->fq); - else + } else { blk_mq_release(q); + } blk_trace_shutdown(q); + if (q->mq_ops) + blk_mq_debugfs_unregister(q); + if (q->bio_split) bioset_free(q->bio_split); @@ -667,6 +857,23 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_init(struct request_queue *q) +{ +#ifndef CONFIG_BLK_WBT_MQ + if (q->mq_ops) + return; +#endif +#ifndef CONFIG_BLK_WBT_SQ + if (q->request_fn) + return; +#endif + + /* + * If this fails, we don't 
get throttling + */ + wbt_init(q); +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -695,30 +902,36 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret; + if (q->mq_ops) + blk_mq_register_dev(dev, q); + + /* Prevent changes through sysfs until registration is completed. */ + mutex_lock(&q->sysfs_lock); + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { blk_trace_remove_sysfs(dev); - return ret; + goto unlock; } kobject_uevent(&q->kobj, KOBJ_ADD); - if (q->mq_ops) - blk_mq_register_disk(disk); - - if (!q->request_fn) - return 0; - - ret = elv_register_queue(q); - if (ret) { - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; + blk_wb_init(q); + + if (q->request_fn || (q->mq_ops && q->elevator)) { + ret = elv_register_queue(q); + if (ret) { + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + goto unlock; + } } - - return 0; + ret = 0; +unlock: + mutex_unlock(&q->sysfs_lock); + return ret; } void blk_unregister_queue(struct gendisk *disk) @@ -729,9 +942,9 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); - if (q->request_fn) + if (q->request_fn || (q->mq_ops && q->elevator)) elv_unregister_queue(q); kobject_uevent(&q->kobj, KOBJ_REMOVE); diff --git a/block/blk-tag.c b/block/blk-tag.c index f0344e6939d5..07cc329fa4b0 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -270,8 +270,9 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); - rq->cmd_flags &= ~REQ_QUEUED; + rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; + rq->internal_tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) printk(KERN_ERR "%s: tag %d is missing\n", @@ -316,7 +317,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; - if (unlikely((rq->cmd_flags & REQ_QUEUED))) { + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", __func__, rq, @@ -371,7 +372,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->cmd_flags |= REQ_QUEUED; + rq->rq_flags |= RQF_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a3ea8260c94c..82fd0cc394eb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -818,13 +818,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->io_disp[rw]++; /* - * REQ_THROTTLED is used to prevent the same bio to be throttled + * BIO_THROTTLED is used to prevent the same bio to be throttled * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. 
*/ - if (!(bio->bi_opf & REQ_THROTTLED)) - bio->bi_opf |= REQ_THROTTLED; + if (!bio_flagged(bio, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); } /** @@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg) unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; - if ((bio = throtl_peek_queued(&sq->queued[READ]))) + bio = throtl_peek_queued(&sq->queued[READ]); + if (bio) tg_may_dispatch(tg, bio, &read_wait); - if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) + bio = throtl_peek_queued(&sq->queued[WRITE]); + if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); @@ -1401,7 +1403,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) goto out; spin_lock_irq(q->queue_lock); @@ -1480,7 +1482,7 @@ out: * being issued. */ if (!throttled) - bio->bi_opf &= ~REQ_THROTTLED; + bio_clear_flag(bio, BIO_THROTTLED); return throttled; } diff --git a/block/blk-wbt.c b/block/blk-wbt.c new file mode 100644 index 000000000000..1aedb1f7ee0c --- /dev/null +++ b/block/blk-wbt.c @@ -0,0 +1,751 @@ +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include <linux/kernel.h> +#include <linux/blk_types.h> +#include <linux/slab.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> + +#include "blk-wbt.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/wbt.h> + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
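For readers less used to the cmpxchg idiom, a minimal user-space analogue of the bounded increment implemented just below (C11 atomics, illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

/* Take an in-flight slot only while the counter is still below the
 * limit.  A failed compare-exchange reloads 'cur', so a racing
 * increment is simply retried against the new value, mirroring the
 * atomic_cmpxchg() loop in atomic_inc_below(). */
bool inc_below(atomic_int *v, int below)
{
    int cur = atomic_load(v);

    while (cur < below) {
        if (atomic_compare_exchange_weak(v, &cur, cur + 1))
            return true;
    }
    return false;
}

The comparison and the increment have to be a single atomic step; a plain limit check followed by atomic_inc() could overshoot the limit when several submitters race.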
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +{ + return &rwb->rq_wait[is_kswapd]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (waitqueue_active(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +{ + struct rq_wait *rqw; + int inflight, limit; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + inflight = atomic_dec_return(&rqw->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + rwb_wake_all(rwb); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rqw->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_all(&rqw->wait); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_is_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb, wbt_stat_to_mask(stat)); + wbt_clear_state(stat); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + bool ret = false; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return false; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step > 0) + rwb->wb_max = rwb->wb_normal = 1; + else { + rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } + rwb->wb_background = 1; + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. 
If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = depth; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } + + return ret; +} + +static inline bool stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[BLK_STAT_READ].nr_samples >= 1 && + stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { + trace_wbt_lat(bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || + wb_recent_wait(rwb) || wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. 
+ */ + if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); + trace_wbt_stat(bdi, stat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_wbt_stat(bdi, stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + blk_queue_stat_get(rwb->queue, stat); + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + + trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * Hit max in previous round, stop here + */ + if (rwb->scaled_max) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + + rwb->scaled_max = calc_wb_limits(rwb); + + rwb_wake_all(rwb); + + rwb_trace_step(rwb, "step up"); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +static void scale_down(struct rq_wb *rwb, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + if (rwb->scale_step < 0 && hard_throttle) + rwb->scale_step = 0; + else + rwb->scale_step++; + + rwb->scaled_max = false; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + if (rwb->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + unsigned int inflight = wbt_inflight(rwb); + int status; + + status = latency_exceeded(rwb); + + trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + inflight); + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb, true); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN_WRITES: + /* + * We started a the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. + */ + scale_up(rwb); + break; + case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; + /* + * We get here when previously scaled reduced depth, and we + * currently don't have a valid read/write sample. For that + * case, slowly return to center state (step == 0). 
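To make the scaling behaviour concrete, a small stand-alone program (illustrative only; it mirrors calc_wb_limits() and rwb_arm_timer() above for a hypothetical device with queue depth 32 and the default 100 msec window):

#include <stdio.h>
#include <math.h>

int main(void)
{
    const unsigned int queue_depth = 32;              /* hypothetical */
    const unsigned long long win_nsec = 100ULL * 1000 * 1000;

    for (int step = 0; step <= 3; step++) {
        /* depth is recomputed from RWB_DEF_DEPTH (16) on every step */
        unsigned int depth = queue_depth < 16 ? queue_depth : 16;
        unsigned long long win = win_nsec;

        if (step > 0) {
            depth = 1 + ((depth - 1) >> step);
            win = (win_nsec << 4) /
                  (unsigned long long)sqrt((double)((step + 1) << 8));
        }
        printf("step %d: max=%2u normal=%2u background=%2u window=%llums\n",
               step, depth, (depth + 1) / 2, (depth + 3) / 4,
               win / 1000000);
    }
    return 0;
}

The output shows each positive step roughly halving the allowed write depth (16, 8, 4, 2) while the monitoring window shrinks as window / sqrt(step + 1) (100, 72, 59, 50 msec), which is the behaviour described in the file-header comment of this file.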
+ */ + if (rwb->scale_step > 0) + scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || inflight) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + rwb->scaled_max = false; + calc_wb_limits(rwb); + + rwb_wake_all(rwb); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + limit = rwb->wb_max; + else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, + wait_queue_t *wait, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + return true; + } + + /* + * If the waitqueue is already active and we are not the next + * in line to be woken up, wait for our turn. + */ + if (waitqueue_active(&rqw->wait) && + rqw->wait.task_list.next != &wait->task_list) + return false; + + return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) + __releases(lock) + __acquires(lock) +{ + struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); + DEFINE_WAIT(wait); + + if (may_queue(rwb, rqw, &wait, rw)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rqw, &wait, rw)) + break; + + if (lock) { + spin_unlock_irq(lock); + io_schedule(); + spin_lock_irq(lock); + } else + io_schedule(); + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +{ + const int op = bio_op(bio); + + /* + * If not a WRITE, do nothing + */ + if (op != REQ_OP_WRITE) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. 
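How a caller is expected to string the hooks together (the real call sites live in the block core and are not part of this hunk; this is only a kernel-style sketch using the helpers declared in blk-wbt.h later in this patch):

    /* submission side */
    enum wbt_flags flags;

    flags = wbt_wait(rwb, bio, lock);     /* may sleep on the write depth */
    wbt_track(&rq->issue_stat, flags);    /* stash the flags in issue_stat */

    wbt_issue(rwb, &rq->issue_stat);      /* when the request goes to the driver */

    /* completion side (or wbt_requeue() if the request is requeued) */
    wbt_done(rwb, &rq->issue_stat);       /* releases the in-flight slot */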
+ */ +enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + unsigned int ret = 0; + + if (!rwb_enabled(rwb)) + return 0; + + if (bio_op(bio) == REQ_OP_READ) + ret = WBT_READ; + + if (!wbt_should_throttle(rwb, bio)) { + if (ret & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return ret; + } + + __wbt_wait(rwb, bio->bi_opf, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + + return ret | WBT_TRACKED; +} + +void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + + /* + * Track sync issue, in case it takes a long time to complete. Allows + * us to react quicker, if a sync IO takes a long time to complete. + * Note that this is just a hint. 'stat' can go away when the + * request completes, so it's important we never dereference it. We + * only use the address to compare with, which is why we store the + * sync_issue time locally. + */ + if (wbt_is_read(stat) && !rwb->sync_issue) { + rwb->sync_cookie = stat; + rwb->sync_issue = blk_stat_time(stat); + } +} + +void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + if (stat == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ + if (rwb) { + rwb->queue_depth = depth; + wbt_update_limits(rwb); + } +} + +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +{ + if (rwb) + rwb->wc = write_cache_on; +} + + /* + * Disable wbt, if enabled by default. Only called from CFQ, if we have + * cgroups enabled + */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); + } +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + +u64 wbt_default_latency_nsec(struct request_queue *q) +{ + /* + * We default to 2msec for non-rotational storage, and 75msec + * for rotational storage. + */ + if (blk_queue_nonrot(q)) + return 2000000ULL; + else + return 75000000ULL; +} + +int wbt_init(struct request_queue *q) +{ + struct rq_wb *rwb; + int i; + + /* + * For now, we depend on the stats window being larger than + * our monitoring window. Ensure that this isn't inadvertently + * violated. 
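The second BUILD_BUG_ON() below encodes the layout contract between blk-stat and wbt: blk_issue_stat is a single u64 whose low 61 bits hold the issue time in nanoseconds, while the top BLK_STAT_RES_BITS (3) bits carry the wbt_flags stored by wbt_track(). A stand-alone illustration of that packing (not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define STAT_RES_BITS   3
#define STAT_SHIFT      (64 - STAT_RES_BITS)
#define STAT_TIME_MASK  ((1ULL << STAT_SHIFT) - 1)

int main(void)
{
    uint64_t now_ns = 123456789ULL;   /* example issue timestamp */
    uint64_t flags  = 0x5;            /* e.g. WBT_TRACKED | WBT_KSWAPD */
    uint64_t stat;

    /* wbt_track(): OR the flags into the top bits of the time stamp */
    stat = (now_ns & STAT_TIME_MASK) | (flags << STAT_SHIFT);

    /* blk_stat_time() / wbt_stat_to_mask(): split them apart again */
    printf("time=%llu flags=%llu\n",
           (unsigned long long)(stat & STAT_TIME_MASK),
           (unsigned long long)(stat >> STAT_SHIFT));
    return 0;
}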
+ */ + BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); + BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + atomic_set(&rwb->rq_wait[i].inflight, 0); + init_waitqueue_head(&rwb->rq_wait[i].wait); + } + + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); + rwb->wc = 1; + rwb->queue_depth = RWB_DEF_DEPTH; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->queue = q; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->enable_state = WBT_STATE_ON_DEFAULT; + wbt_update_limits(rwb); + + /* + * Assign rwb, and turn on stats tracking for this queue + */ + q->rq_wb = rwb; + blk_stat_enable(q); + + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + + return 0; +} + +void wbt_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + q->rq_wb = NULL; + kfree(rwb); + } +} diff --git a/block/blk-wbt.h b/block/blk-wbt.h new file mode 100644 index 000000000000..65f1de519f67 --- /dev/null +++ b/block/blk-wbt.h @@ -0,0 +1,171 @@ +#ifndef WB_THROTTLE_H +#define WB_THROTTLE_H + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/wait.h> +#include <linux/timer.h> +#include <linux/ktime.h> + +#include "blk-stat.h" + +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + + WBT_NR_BITS = 3, /* number of bits */ +}; + +enum { + WBT_NUM_RWQ = 2, +}; + +/* + * Enable states. Either off, or on by default (done at init time), + * or on through manual setup in sysfs. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, + WBT_STATE_ON_MANUAL = 2, +}; + +static inline void wbt_clear_state(struct blk_issue_stat *stat) +{ + stat->time &= BLK_STAT_TIME_MASK; +} + +static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) +{ + return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; +} + +static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) +{ + stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; +} + +static inline bool wbt_is_tracked(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; +} + +static inline bool wbt_is_read(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; +} + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + int scale_step; + bool scaled_max; + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
+ */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned int wc; + unsigned int queue_depth; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct request_queue *queue; + struct rq_wait rq_wait[WBT_NUM_RWQ]; +}; + +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + +#ifdef CONFIG_BLK_WBT + +void __wbt_done(struct rq_wb *, enum wbt_flags); +void wbt_done(struct rq_wb *, struct blk_issue_stat *); +enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); +int wbt_init(struct request_queue *); +void wbt_exit(struct request_queue *); +void wbt_update_limits(struct rq_wb *); +void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); +void wbt_issue(struct rq_wb *, struct blk_issue_stat *); +void wbt_disable_default(struct request_queue *); + +void wbt_set_queue_depth(struct rq_wb *, unsigned int); +void wbt_set_write_cache(struct rq_wb *, bool); + +u64 wbt_default_latency_nsec(struct request_queue *); + +#else + +static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) +{ +} +static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, + spinlock_t *lock) +{ + return 0; +} +static inline int wbt_init(struct request_queue *q) +{ + return -EINVAL; +} +static inline void wbt_exit(struct request_queue *q) +{ +} +static inline void wbt_update_limits(struct rq_wb *rwb) +{ +} +static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_disable_default(struct request_queue *q) +{ +} +static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ +} +static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +{ +} +static inline u64 wbt_default_latency_nsec(struct request_queue *q) +{ + return 0; +} + +#endif /* CONFIG_BLK_WBT */ + +#endif diff --git a/block/blk-zoned.c b/block/blk-zoned.c new file mode 100644 index 000000000000..3bd15d8095b1 --- /dev/null +++ b/block/blk-zoned.c @@ -0,0 +1,348 @@ +/* + * Zoned block device handling + * + * Copyright (c) 2015, Hannes Reinecke + * Copyright (c) 2015, SUSE Linux GmbH + * + * Copyright (c) 2016, Damien Le Moal + * Copyright (c) 2016, Western Digital + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/blkdev.h> + +static inline sector_t blk_zone_start(struct request_queue *q, + sector_t sector) +{ + sector_t zone_mask = blk_queue_zone_sectors(q) - 1; + + return sector & ~zone_mask; +} + +/* + * Check that a zone report belongs to the partition. + * If yes, fix its start sector and write pointer, copy it in the + * zone information array and return true. Return false otherwise. 
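For intuition, the remapping implemented just below with hypothetical numbers: on a drive with 524288-sector zones and a partition starting at sector 524288, a device zone reported as start = 1048576, wp = 1048832 is returned to the caller as start = 524288, wp = 524544; a conventional zone instead gets wp = start + len, and any zone that begins before the partition or extends past its nr_sects is dropped from the report.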
+ */ +static bool blkdev_report_zone(struct block_device *bdev, + struct blk_zone *rep, + struct blk_zone *zone) +{ + sector_t offset = get_start_sect(bdev); + + if (rep->start < offset) + return false; + + rep->start -= offset; + if (rep->start + rep->len > bdev->bd_part->nr_sects) + return false; + + if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) + rep->wp = rep->start + rep->len; + else + rep->wp -= offset; + memcpy(zone, rep, sizeof(struct blk_zone)); + + return true; +} + +/** + * blkdev_report_zones - Get zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @zones: Array of zone structures where to return the zones information + * @nr_zones: Number of zone structures in the zone array + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Get zone information starting from the zone containing @sector. + * The number of zone information reported may be less than the number + * requested by @nr_zones. The number of zones actually reported is + * returned in @nr_zones. + */ +int blkdev_report_zones(struct block_device *bdev, + sector_t sector, + struct blk_zone *zones, + unsigned int *nr_zones, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct blk_zone_report_hdr *hdr; + unsigned int nrz = *nr_zones; + struct page *page; + unsigned int nr_rep; + size_t rep_bytes; + unsigned int nr_pages; + struct bio *bio; + struct bio_vec *bv; + unsigned int i, n, nz; + unsigned int ofst; + void *addr; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (!nrz) + return 0; + + if (sector > bdev->bd_part->nr_sects) { + *nr_zones = 0; + return 0; + } + + /* + * The zone report has a header. So make room for it in the + * payload. Also make sure that the report fits in a single BIO + * that will not be split down the stack. + */ + rep_bytes = sizeof(struct blk_zone_report_hdr) + + sizeof(struct blk_zone) * nrz; + rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK; + if (rep_bytes > (queue_max_sectors(q) << 9)) + rep_bytes = queue_max_sectors(q) << 9; + + nr_pages = min_t(unsigned int, BIO_MAX_PAGES, + rep_bytes >> PAGE_SHIFT); + nr_pages = min_t(unsigned int, nr_pages, + queue_max_segments(q)); + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return -ENOMEM; + + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blk_zone_start(q, sector); + bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(gfp_mask); + if (!page) { + ret = -ENOMEM; + goto out; + } + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + __free_page(page); + break; + } + } + + if (i == 0) + ret = -ENOMEM; + else + ret = submit_bio_wait(bio); + if (ret) + goto out; + + /* + * Process the report result: skip the header and go through the + * reported zones to fixup and fixup the zone information for + * partitions. At the same time, return the zone information into + * the zone array. 
+ */ + n = 0; + nz = 0; + nr_rep = 0; + bio_for_each_segment_all(bv, bio, i) { + + if (!bv->bv_page) + break; + + addr = kmap_atomic(bv->bv_page); + + /* Get header in the first page */ + ofst = 0; + if (!nr_rep) { + hdr = (struct blk_zone_report_hdr *) addr; + nr_rep = hdr->nr_zones; + ofst = sizeof(struct blk_zone_report_hdr); + } + + /* Fixup and report zones */ + while (ofst < bv->bv_len && + n < nr_rep && nz < nrz) { + if (blkdev_report_zone(bdev, addr + ofst, &zones[nz])) + nz++; + ofst += sizeof(struct blk_zone); + n++; + } + + kunmap_atomic(addr); + + if (n >= nr_rep || nz >= nrz) + break; + + } + + *nr_zones = nz; +out: + bio_for_each_segment_all(bv, bio, i) + __free_page(bv->bv_page); + bio_put(bio); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); + +/** + * blkdev_reset_zones - Reset zones write pointer + * @bdev: Target block device + * @sector: Start sector of the first zone to reset + * @nr_sectors: Number of sectors, at least the length of one zone + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Reset the write pointer of the zones contained in the range + * @sector..@sector+@nr_sectors. Specifying the entire disk sector range + * is valid, but the specified range should not contain conventional zones. + */ +int blkdev_reset_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors; + sector_t end_sector = sector + nr_sectors; + struct bio *bio; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (end_sector > bdev->bd_part->nr_sects) + /* Out of range */ + return -EINVAL; + + /* Check alignment (handle eventual smaller last zone) */ + zone_sectors = blk_queue_zone_sectors(q); + if (sector & (zone_sectors - 1)) + return -EINVAL; + + if ((nr_sectors & (zone_sectors - 1)) && + end_sector != bdev->bd_part->nr_sects) + return -EINVAL; + + while (sector < end_sector) { + + bio = bio_alloc(gfp_mask, 0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + + ret = submit_bio_wait(bio); + bio_put(bio); + + if (ret) + return ret; + + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + + } + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_reset_zones); + +/** + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. 
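From user space the handler below is reached through the BLKREPORTZONE ioctl. A minimal sketch of a caller, assuming the struct blk_zone_report / struct blk_zone definitions from <linux/blkzoned.h> that accompany this series (the zone array follows the report header in memory, exactly as the copy_to_user() calls below expect):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
    unsigned int nr = 16, i;
    struct blk_zone_report *rep;
    int fd;

    if (argc < 2)
        return 1;
    fd = open(argv[1], O_RDONLY);
    if (fd < 0)
        return 1;

    rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
    if (!rep)
        return 1;
    rep->sector = 0;      /* start reporting from the first zone */
    rep->nr_zones = nr;   /* room for at most 16 zones */

    if (ioctl(fd, BLKREPORTZONE, rep) == 0) {
        /* nr_zones now holds the number of zones actually reported */
        for (i = 0; i < rep->nr_zones; i++)
            printf("zone %u: start=%llu len=%llu wp=%llu type=%u\n", i,
                   (unsigned long long)rep->zones[i].start,
                   (unsigned long long)rep->zones[i].len,
                   (unsigned long long)rep->zones[i].wp,
                   (unsigned int)rep->zones[i].type);
    }

    free(rep);
    close(fd);
    return 0;
}

Resetting write pointers works the same way through BLKRESETZONE with a struct blk_zone_range, handled by blkdev_reset_zones_ioctl() further down.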
+ */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_report rep; + struct blk_zone *zones; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) + return -ENOMEM; + + ret = blkdev_report_zones(bdev, rep.sector, + zones, &rep.nr_zones, + GFP_KERNEL); + if (ret) + goto out; + + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { + ret = -EFAULT; + goto out; + } + + if (rep.nr_zones) { + if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, + sizeof(struct blk_zone) * rep.nr_zones)) + ret = -EFAULT; + } + + out: + kfree(zones); + + return ret; +} + +/** + * BLKRESETZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); +} diff --git a/block/blk.h b/block/blk.h index c37492f5edaa..d1ea4bd9b9a3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,10 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +#ifdef CONFIG_DEBUG_FS +extern struct dentry *blk_debugfs_root; +#endif + struct blk_flush_queue { unsigned int flush_queue_delayed:1; unsigned int flush_pending_idx:1; @@ -39,14 +43,9 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops) - return q->fq; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - return hctx->fq; + if (q->mq_ops) + return blk_mq_map_queue(q, ctx->cpu)->fq; + return q->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -101,6 +100,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio); bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); +bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, + struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq); @@ -116,6 +117,7 @@ void blk_account_io_done(struct request *req); enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; /* @@ -135,7 +137,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) +#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); @@ -171,7 +173,7 @@ static inline struct request *__elv_next_request(struct 
request_queue *q) return NULL; } if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) return NULL; } } @@ -180,16 +182,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_activate_req_fn) - e->type->ops.elevator_activate_req_fn(q, rq); + if (e->type->ops.sq.elevator_activate_req_fn) + e->type->ops.sq.elevator_activate_req_fn(q, rq); } static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_deactivate_req_fn) - e->type->ops.elevator_deactivate_req_fn(q, rq); + if (e->type->ops.sq.elevator_deactivate_req_fn) + e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -208,14 +210,14 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); -int attempt_back_merge(struct request_queue *q, struct request *rq); -int attempt_front_merge(struct request_queue *q, struct request *rq); +struct request *attempt_back_merge(struct request_queue *q, struct request *rq); +struct request *attempt_front_merge(struct request_queue *q, struct request *rq); int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); -int blk_try_merge(struct request *rq, struct bio *bio); +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); @@ -252,8 +254,15 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && - (rq->cmd_flags & REQ_IO_STAT) && - (rq->cmd_type == REQ_TYPE_FS); + (rq->rq_flags & RQF_IO_STAT) && + !blk_rq_is_passthrough(rq); +} + +static inline void req_set_nomerge(struct request_queue *q, struct request *req) +{ + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; } /* @@ -268,6 +277,22 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. 
+ */ +static inline struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask * @node: allocation node diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915b..cd15f9dbb147 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -32,8 +32,13 @@ * bsg_destroy_job - routine to teardown/delete a bsg job * @job: bsg_job that is to be torn down */ -static void bsg_destroy_job(struct bsg_job *job) +static void bsg_destroy_job(struct kref *kref) { + struct bsg_job *job = container_of(kref, struct bsg_job, kref); + struct request *rq = job->req; + + blk_end_request_all(rq, rq->errors); + put_device(job->dev); /* release reference for the request */ kfree(job->request_payload.sg_list); @@ -41,6 +46,18 @@ static void bsg_destroy_job(struct bsg_job *job) kfree(job); } +void bsg_job_put(struct bsg_job *job) +{ + kref_put(&job->kref, bsg_destroy_job); +} +EXPORT_SYMBOL_GPL(bsg_job_put); + +int bsg_job_get(struct bsg_job *job) +{ + return kref_get_unless_zero(&job->kref); +} +EXPORT_SYMBOL_GPL(bsg_job_get); + /** * bsg_job_done - completion routine for bsg requests * @job: bsg_job that is complete @@ -54,22 +71,24 @@ void bsg_job_done(struct bsg_job *job, int result, { struct request *req = job->req; struct request *rsp = req->next_rq; + struct scsi_request *rq = scsi_req(req); int err; err = job->req->errors = result; if (err < 0) /* we're only returning the result field in the reply */ - job->req->sense_len = sizeof(u32); + rq->sense_len = sizeof(u32); else - job->req->sense_len = job->reply_len; + rq->sense_len = job->reply_len; /* we assume all request payload was transferred, residual == 0 */ - req->resid_len = 0; + rq->resid_len = 0; if (rsp) { - WARN_ON(reply_payload_rcv_len > rsp->resid_len); + WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len); /* set reply (bidi) residual */ - rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + scsi_req(rsp)->resid_len -= + min(reply_payload_rcv_len, scsi_req(rsp)->resid_len); } blk_complete_request(req); } @@ -83,8 +102,7 @@ static void bsg_softirq_done(struct request *rq) { struct bsg_job *job = rq->special; - blk_end_request_all(rq, rq->errors); - bsg_destroy_job(job); + bsg_job_put(job); } static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) @@ -97,6 +115,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); + scsi_req(req)->resid_len = blk_rq_bytes(req); buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; @@ -111,6 +130,7 @@ static int bsg_create_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; struct request_queue *q = req->q; + struct scsi_request *rq = scsi_req(req); struct bsg_job *job; int ret; @@ -124,9 +144,9 @@ static int bsg_create_job(struct device *dev, struct request *req) job->req = req; if (q->bsg_job_size) job->dd_data = (void *)&job[1]; - job->request = req->cmd; - job->request_len = req->cmd_len; - job->reply = req->sense; + job->request = rq->cmd; + job->request_len = rq->cmd_len; + job->reply = rq->sense; job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer * allocated */ if (req->bio) { @@ -142,6 +162,7 @@ static int bsg_create_job(struct device *dev, struct request *req) job->dev = dev; /* take a 
reference for the request */ get_device(job->dev); + kref_init(&job->kref); return 0; failjob_rls_rqst_payload: @@ -160,7 +181,9 @@ failjob_rls_job: * * Drivers/subsys should pass this to the queue init function. */ -void bsg_request_fn(struct request_queue *q) +static void bsg_request_fn(struct request_queue *q) + __releases(q->queue_lock) + __acquires(q->queue_lock) { struct device *dev = q->queuedata; struct request *req; @@ -195,24 +218,30 @@ void bsg_request_fn(struct request_queue *q) put_device(dev); spin_lock_irq(q->queue_lock); } -EXPORT_SYMBOL_GPL(bsg_request_fn); /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to - * @q: request queue setup by caller * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job - * - * The caller should have setup the reuqest queue with bsg_request_fn - * as the request_fn. */ -int bsg_setup_queue(struct device *dev, struct request_queue *q, - char *name, bsg_job_fn *job_fn, int dd_job_size) +struct request_queue *bsg_setup_queue(struct device *dev, char *name, + bsg_job_fn *job_fn, int dd_job_size) { + struct request_queue *q; int ret; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return ERR_PTR(-ENOMEM); + q->cmd_size = sizeof(struct scsi_request); + q->request_fn = bsg_request_fn; + + ret = blk_init_allocated_queue(q); + if (ret) + goto out_cleanup_queue; + q->queuedata = dev; q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; @@ -224,9 +253,12 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q, if (ret) { printk(KERN_ERR "%s: bsg interface failed to " "initialize - register queue\n", dev->kobj.name); - return ret; + goto out_cleanup_queue; } - return 0; + return q; +out_cleanup_queue: + blk_cleanup_queue(q); + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c index d214e929ce18..a9a8b8e0446f 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -85,7 +85,6 @@ struct bsg_command { struct bio *bidi_bio; int err; struct sg_io_v4 hdr; - char sense[SCSI_SENSE_BUFFERSIZE]; }; static void bsg_free_command(struct bsg_command *bc) @@ -140,18 +139,20 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) { + struct scsi_request *req = scsi_req(rq); + if (hdr->request_len > BLK_MAX_CDB) { - rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); - if (!rq->cmd) + req->cmd = kzalloc(hdr->request_len, GFP_KERNEL); + if (!req->cmd) return -ENOMEM; } - if (copy_from_user(rq->cmd, (void __user *)(unsigned long)hdr->request, + if (copy_from_user(req->cmd, (void __user *)(unsigned long)hdr->request, hdr->request_len)) return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, has_write_perm)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -159,7 +160,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, /* * fill in request structure */ - rq->cmd_len = hdr->request_len; + req->cmd_len = hdr->request_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -176,7 +177,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * Check if sg_io_v4 from user is allowed and valid */ static int -bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) +bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int 
*op) { int ret = 0; @@ -197,7 +198,7 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) ret = -EINVAL; } - *rw = hdr->dout_xfer_len ? WRITE : READ; + *op = hdr->dout_xfer_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN; return ret; } @@ -205,13 +206,12 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, - u8 *sense) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; - int ret, rw; - unsigned int dxfer_len; + int ret; + unsigned int op, dxfer_len; void __user *dxferp = NULL; struct bsg_class_device *bcd = &q->bsg_dev; @@ -226,36 +226,35 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); - ret = bsg_validate_sgv4_hdr(q, hdr, &rw); + ret = bsg_validate_sgv4_hdr(hdr, &op); if (ret) return ERR_PTR(ret); /* * map scatter-gather elements separately and string them to request */ - rq = blk_get_request(q, rw, GFP_KERNEL); + rq = blk_get_request(q, op, GFP_KERNEL); if (IS_ERR(rq)) return rq; - blk_rq_set_block_pc(rq); + scsi_req_init(rq); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; - if (rw == WRITE && hdr->din_xfer_len) { + if (op == REQ_OP_SCSI_OUT && hdr->din_xfer_len) { if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) { ret = -EOPNOTSUPP; goto out; } - next_rq = blk_get_request(q, READ, GFP_KERNEL); + next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); if (IS_ERR(next_rq)) { ret = PTR_ERR(next_rq); next_rq = NULL; goto out; } rq->next_rq = next_rq; - next_rq->cmd_type = rq->cmd_type; dxferp = (void __user *)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, NULL, dxferp, @@ -280,13 +279,9 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, goto out; } - rq->sense = sense; - rq->sense_len = 0; - return rq; out: - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(scsi_req(rq)); blk_put_request(rq); if (next_rq) { blk_rq_unmap_user(next_rq->bio); @@ -393,6 +388,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct bio *bio, struct bio *bidi_bio) { + struct scsi_request *req = scsi_req(rq); int ret = 0; dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); @@ -407,12 +403,12 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, hdr->info |= SG_INFO_CHECK; hdr->response_len = 0; - if (rq->sense_len && hdr->response) { + if (req->sense_len && hdr->response) { int len = min_t(unsigned int, hdr->max_response_len, - rq->sense_len); + req->sense_len); ret = copy_to_user((void __user *)(unsigned long)hdr->response, - rq->sense, len); + req->sense, len); if (!ret) hdr->response_len = len; else @@ -420,14 +416,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, } if (rq->next_rq) { - hdr->dout_resid = rq->resid_len; - hdr->din_resid = rq->next_rq->resid_len; + hdr->dout_resid = req->resid_len; + hdr->din_resid = scsi_req(rq->next_rq)->resid_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) - hdr->din_resid = rq->resid_len; + hdr->din_resid = req->resid_len; else - hdr->dout_resid = rq->resid_len; + 
hdr->dout_resid = req->resid_len; /* * If the request generated a negative error number, return it @@ -439,8 +435,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, ret = rq->errors; blk_rq_unmap_user(bio); - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(req); blk_put_request(rq); return ret; @@ -625,7 +620,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense); + rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,6 +650,9 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EINVAL; + bsg_set_block(bd, file); bytes_written = 0; @@ -908,12 +906,11 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct bio *bio, *bidi_bio = NULL; struct sg_io_v4 hdr; int at_head; - u8 sense[SCSI_SENSE_BUFFERSIZE]; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense); + rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd4303..137944777859 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -16,6 +16,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-cgroup.h> #include "blk.h" +#include "blk-wbt.h" /* * tunables @@ -667,10 +668,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +685,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +786,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, 
args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -913,15 +913,6 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) } /* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). - */ -static inline bool cfq_bio_sync(struct bio *bio) -{ - return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); -} - -/* * scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing */ @@ -1596,7 +1587,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { struct cfq_group_data *cgd; - cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + cgd = kzalloc(sizeof(*cgd), gfp); if (!cgd) return NULL; return &cgd->cpd; @@ -2474,10 +2465,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2491,7 +2482,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) if (!cic) return NULL; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); if (cfqq) return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); @@ -2530,14 +2521,14 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; } } -static int cfq_merge(struct request_queue *q, struct request **req, +static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; @@ -2553,7 +2544,7 @@ static int cfq_merge(struct request_queue *q, struct request **req, } static void cfq_merged_request(struct request_queue *q, struct request *req, - int type) + enum elv_merge type) { if (type == ELEVATOR_FRONT_MERGE) { struct cfq_queue *cfqq = RQ_CFQQ(req); @@ -2565,7 +2556,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), 
bio->bi_opf); } static void @@ -2588,7 +2579,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -2605,13 +2596,14 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); struct cfq_io_cq *cic; struct cfq_queue *cfqq; /* * Disallow merge of a sync bio into an async request. */ - if (cfq_bio_sync(bio) && !rq_is_sync(rq)) + if (is_sync && !rq_is_sync(rq)) return false; /* @@ -2622,7 +2614,7 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, if (!cic) return false; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, is_sync); return cfqq == RQ_CFQQ(rq); } @@ -2757,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqg) return NULL; - for_each_cfqg_st(cfqg, i, j, st) - if ((cfqq = cfq_rb_first(st)) != NULL) + for_each_cfqg_st(cfqg, i, j, st) { + cfqq = cfq_rb_first(st); + if (cfqq) return cfqq; + } return NULL; } @@ -3042,7 +3036,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3413,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3507,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3759,14 +3760,16 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; + bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; + nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3774,7 +3777,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) * spuriously on a newly created cic but there's no harm. */ if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return; + return nonroot_cg; /* * Drop reference to queues. 
New queues will be assigned in new @@ -3795,9 +3798,13 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) } cic->blkcg_serial_nr = serial_nr; + return nonroot_cg; } #else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } +static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +{ + return false; +} #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue ** @@ -3847,13 +3854,16 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, cfqd->queue->node); if (!cfqq) { cfqq = &cfqd->oom_cfqq; goto out; } + /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ + cfqq->ioprio_class = IOPRIO_CLASS_NONE; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); @@ -3916,6 +3926,12 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } +static inline bool req_noidle(struct request *req) +{ + return req_op(req) == REQ_OP_WRITE && + (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; +} + /* * Disable idle window if the process thinks too long or seeks so much that * it doesn't matter @@ -3937,7 +3953,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + if (cfqq->next_rq && req_noidle(cfqq->next_rq)) enable_idle = 0; else if (!atomic_read(&cic->icq.ioc->active_ref) || !cfqd->cfq_slice_idle || @@ -3989,7 +4005,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. 
*/ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* @@ -4135,7 +4151,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4222,8 +4238,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) const int sync = rq_is_sync(rq); u64 now = ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", - !!(rq->cmd_flags & REQ_NOIDLE)); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); cfq_update_hw_tag(cfqd); @@ -4233,8 +4248,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4312,14 +4326,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4340,7 +4354,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4357,10 +4371,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } @@ -4434,11 +4448,12 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; + bool disable_wbt; spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); - check_blkcg_changed(cic, bio); + disable_wbt = check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { @@ -4474,6 +4489,10 @@ new_queue: rq->elv.priv[0] = cfqq; rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); + + if (disable_wbt) + wbt_disable_default(q); + return 0; } @@ -4823,7 +4842,7 @@ static struct elv_fs_entry cfq_attrs[] = { }; static struct elevator_type iosched_cfq = { - .ops = { + .ops.sq = { .elevator_merge_fn = cfq_merge, .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 556826ac7cb4..570021a0dc1c 100644 --- a/block/compat_ioctl.c +++ 
b/block/compat_ioctl.c @@ -661,7 +661,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = inode->i_bdev; struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - struct backing_dev_info *bdi; loff_t size; unsigned int max_sectors; @@ -708,9 +707,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); return compat_put_long(arg, - (bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); case BLKROGET: /* compatible */ return compat_put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -728,8 +726,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 55e0bb6d7da7..c68f6bbc0dcd 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -120,12 +120,11 @@ static void deadline_remove_request(struct request_queue *q, struct request *rq) deadline_del_rq_rb(dd, rq); } -static int +static enum elv_merge deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; - int ret; /* * check for front merge @@ -138,20 +137,17 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; + *req = __rq; + return ELEVATOR_FRONT_MERGE; } } } return ELEVATOR_NO_MERGE; -out: - *req = __rq; - return ret; } static void deadline_merged_request(struct request_queue *q, - struct request *req, int type) + struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; @@ -439,7 +435,7 @@ static struct elv_fs_entry deadline_attrs[] = { }; static struct elevator_type iosched_deadline = { - .ops = { + .ops.sq = { .elevator_merge_fn = deadline_merge, .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, diff --git a/block/elevator.c b/block/elevator.c index f7d973a56fd7..699d10f71a2c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -40,6 +40,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -58,8 +59,10 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_bio_merge_fn) - return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->uses_mq && e->type->ops.mq.allow_merge) + return e->type->ops.mq.allow_merge(q, rq, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) + return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); return 1; } @@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->uses_mq = e->uses_mq; return eq; } @@ -203,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name) } /* 
- * Use the default elevator specified by config boot param or - * config option. Don't try to load modules as we could be running - * off async and request_module() isn't allowed from async. + * Use the default elevator specified by config boot param for + * non-mq devices, or by config option. Don't try to load modules + * as we could be running off async and request_module() isn't + * allowed from async. */ - if (!e && *chosen_elevator) { + if (!e && !q->mq_ops && *chosen_elevator) { e = elevator_get(chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", @@ -215,18 +220,32 @@ int elevator_init(struct request_queue *q, char *name) } if (!e) { - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (q->mq_ops && q->nr_hw_queues == 1) + e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false); + else if (q->mq_ops) + e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false); + else + e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + if (!e) { printk(KERN_ERR "Default I/O scheduler not found. " \ - "Using noop.\n"); + "Using noop/none.\n"); e = elevator_get("noop", false); } } - err = e->ops.elevator_init_fn(q, e); - if (err) + if (e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = e->ops.mq.init_sched(q, e); + } else + err = e->ops.sq.elevator_init_fn(q, e); + if (err) { + if (e->uses_mq) + blk_mq_sched_teardown(q); elevator_put(e); + } return err; } EXPORT_SYMBOL(elevator_init); @@ -234,8 +253,10 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->type->ops.elevator_exit_fn) - e->type->ops.elevator_exit_fn(e); + if (e->uses_mq && e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) + e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -245,31 +266,33 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); - rq->cmd_flags &= ~REQ_HASHED; + rq->rq_flags &= ~RQF_HASHED; } -static void elv_rqhash_del(struct request_queue *q, struct request *rq) +void elv_rqhash_del(struct request_queue *q, struct request *rq) { if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } +EXPORT_SYMBOL_GPL(elv_rqhash_del); -static void elv_rqhash_add(struct request_queue *q, struct request *rq) +void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); - rq->cmd_flags |= REQ_HASHED; + rq->rq_flags |= RQF_HASHED; } +EXPORT_SYMBOL_GPL(elv_rqhash_add); -static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) +void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { __elv_rqhash_del(rq); elv_rqhash_add(q, rq); } -static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; struct hlist_node *next; @@ -352,7 +375,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) { sector_t boundary; struct list_head *entry; - int stop_flags; if (q->last_merge == rq) q->last_merge = NULL; @@ -362,7 +384,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -370,7 +391,7 @@ void 
elv_dispatch_sort(struct request_queue *q, struct request *rq) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; - if (pos->cmd_flags & stop_flags) + if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) break; if (blk_rq_pos(rq) >= boundary) { if (blk_rq_pos(pos) < boundary) @@ -407,11 +428,11 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(elv_dispatch_add_tail); -int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) +enum elv_merge elv_merge(struct request_queue *q, struct request **req, + struct bio *bio) { struct elevator_queue *e = q->elevator; struct request *__rq; - int ret; /* * Levels of merges: @@ -426,7 +447,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * First try one-hit cache. */ if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { - ret = blk_try_merge(q->last_merge, bio); + enum elv_merge ret = blk_try_merge(q->last_merge, bio); + if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; @@ -445,8 +467,10 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_BACK_MERGE; } - if (e->type->ops.elevator_merge_fn) - return e->type->ops.elevator_merge_fn(q, req, bio); + if (e->uses_mq && e->type->ops.mq.request_merge) + return e->type->ops.mq.request_merge(q, req, bio); + else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) + return e->type->ops.sq.elevator_merge_fn(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -458,8 +482,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * * Returns true if we merged, false otherwise */ -static bool elv_attempt_insert_merge(struct request_queue *q, - struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) { struct request *__rq; bool ret; @@ -493,12 +516,15 @@ static bool elv_attempt_insert_merge(struct request_queue *q, return ret; } -void elv_merged_request(struct request_queue *q, struct request *rq, int type) +void elv_merged_request(struct request_queue *q, struct request *rq, + enum elv_merge type) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_merged_fn) - e->type->ops.elevator_merged_fn(q, rq, type); + if (e->uses_mq && e->type->ops.mq.request_merged) + e->type->ops.mq.request_merged(q, rq, type); + else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) + e->type->ops.sq.elevator_merged_fn(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -510,10 +536,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - const int next_sorted = next->cmd_flags & REQ_SORTED; - - if (next_sorted && e->type->ops.elevator_merge_req_fn) - e->type->ops.elevator_merge_req_fn(q, rq, next); + bool next_sorted = false; + + if (e->uses_mq && e->type->ops.mq.requests_merged) + e->type->ops.mq.requests_merged(q, rq, next); + else if (e->type->ops.sq.elevator_merge_req_fn) { + next_sorted = (__force bool)(next->rq_flags & RQF_SORTED); + if (next_sorted) + e->type->ops.sq.elevator_merge_req_fn(q, rq, next); + } elv_rqhash_reposition(q, rq); @@ -530,20 +561,23 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_bio_merged_fn) - e->type->ops.elevator_bio_merged_fn(q, rq, bio); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_bio_merged_fn) + 
e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); } #ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; } static void blk_pm_add_request(struct request_queue *q, struct request *rq) { - if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 && + if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 && (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) pm_request_resume(q->dev); } @@ -563,11 +597,11 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_deactivate_rq(q, rq); } - rq->cmd_flags &= ~REQ_STARTED; + rq->rq_flags &= ~RQF_STARTED; blk_pm_requeue_request(rq); @@ -576,11 +610,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) void elv_drain_elevator(struct request_queue *q) { + struct elevator_queue *e = q->elevator; static int printed; + if (WARN_ON_ONCE(e->uses_mq)) + return; + lockdep_assert_held(q->queue_lock); - while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) + while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " @@ -597,13 +635,13 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) rq->q = q; - if (rq->cmd_flags & REQ_SOFTBARRIER) { + if (rq->rq_flags & RQF_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS) { + if (!blk_rq_is_passthrough(rq)) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + } else if (!(rq->rq_flags & RQF_ELVPRIV) && (where == ELEVATOR_INSERT_SORT || where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; @@ -611,12 +649,12 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_BACK: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; elv_drain_elevator(q); list_add_tail(&rq->queuelist, &q->queue_head); /* @@ -641,8 +679,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) if (elv_attempt_insert_merge(q, rq)) break; case ELEVATOR_INSERT_SORT: - BUG_ON(rq->cmd_type != REQ_TYPE_FS); - rq->cmd_flags |= REQ_SORTED; + BUG_ON(blk_rq_is_passthrough(rq)); + rq->rq_flags |= RQF_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -655,11 +693,11 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) * rq cannot be accessed after calling * elevator_add_req_fn. 
*/ - q->elevator->type->ops.elevator_add_req_fn(q, rq); + q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); break; case ELEVATOR_INSERT_FLUSH: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; blk_insert_flush(rq); break; default: @@ -684,8 +722,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_latter_req_fn) - return e->type->ops.elevator_latter_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.next_request) + return e->type->ops.mq.next_request(q, rq); + else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) + return e->type->ops.sq.elevator_latter_req_fn(q, rq); + return NULL; } @@ -693,8 +734,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_former_req_fn) - return e->type->ops.elevator_former_req_fn(q, rq); + if (e->uses_mq && e->type->ops.mq.former_request) + return e->type->ops.mq.former_request(q, rq); + if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) + return e->type->ops.sq.elevator_former_req_fn(q, rq); return NULL; } @@ -703,8 +746,11 @@ int elv_set_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_set_req_fn) + return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -712,16 +758,22 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_put_req_fn) - e->type->ops.elevator_put_req_fn(rq); + if (WARN_ON_ONCE(e->uses_mq)) + return; + + if (e->type->ops.sq.elevator_put_req_fn) + e->type->ops.sq.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + if (WARN_ON_ONCE(e->uses_mq)) + return 0; + + if (e->type->ops.sq.elevator_may_queue_fn) + return e->type->ops.sq.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } @@ -730,14 +782,17 @@ void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + if (WARN_ON_ONCE(e->uses_mq)) + return; + /* * request is released from the driver, io must be done */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if ((rq->cmd_flags & REQ_SORTED) && - e->type->ops.elevator_completed_req_fn) - e->type->ops.elevator_completed_req_fn(q, rq); + if ((rq->rq_flags & RQF_SORTED) && + e->type->ops.sq.elevator_completed_req_fn) + e->type->ops.sq.elevator_completed_req_fn(q, rq); } } @@ -805,8 +860,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (e->type->ops.elevator_registered_fn) - e->type->ops.elevator_registered_fn(q); + if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) + e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -893,9 +948,14 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old = q->elevator; - bool registered = old->registered; + bool old_registered = false; int err; + if 
(q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + } + /* * Turn on BYPASS and drain all requests w/ elevator private data. * Block layer doesn't call into a quiesced elevator - all requests @@ -903,42 +963,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) * using INSERT_BACK. All requests have SOFTBARRIER set and no * merge happens either. */ - blk_queue_bypass_start(q); + if (old) { + old_registered = old->registered; + + if (old->uses_mq) + blk_mq_sched_teardown(q); + + if (!q->mq_ops) + blk_queue_bypass_start(q); - /* unregister and clear all auxiliary data of the old elevator */ - if (registered) - elv_unregister_queue(q); + /* unregister and clear all auxiliary data of the old elevator */ + if (old_registered) + elv_unregister_queue(q); - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); + } /* allocate, init and register new elevator */ - err = new_e->ops.elevator_init_fn(q, new_e); - if (err) - goto fail_init; + if (new_e) { + if (new_e->uses_mq) { + err = blk_mq_sched_setup(q); + if (!err) + err = new_e->ops.mq.init_sched(q, new_e); + } else + err = new_e->ops.sq.elevator_init_fn(q, new_e); + if (err) + goto fail_init; - if (registered) { err = elv_register_queue(q); if (err) goto fail_register; - } + } else + q->elevator = NULL; /* done, kill the old one and finish */ - elevator_exit(old); - blk_queue_bypass_end(q); + if (old) { + elevator_exit(old); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); return 0; fail_register: + if (q->mq_ops) + blk_mq_sched_teardown(q); elevator_exit(q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); + if (old) { + q->elevator = old; + elv_register_queue(q); + if (!q->mq_ops) + blk_queue_bypass_end(q); + } + if (q->mq_ops) { + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + } return err; } @@ -951,8 +1045,11 @@ static int __elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; - if (!q->elevator) - return -ENXIO; + /* + * Special case for mq, turn off scheduling + */ + if (q->mq_ops && !strncmp(name, "none", 4)) + return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name), true); @@ -961,11 +1058,21 @@ static int __elevator_change(struct request_queue *q, const char *name) return -EINVAL; } - if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && + !strcmp(elevator_name, q->elevator->type->elevator_name)) { elevator_put(e); return 0; } + if (!e->uses_mq && q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + if (e->uses_mq && !q->mq_ops) { + elevator_put(e); + return -EINVAL; + } + return elevator_switch(q, e); } @@ -987,7 +1094,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!q->elevator) + if (!(q->mq_ops || q->request_fn)) return count; ret = __elevator_change(q, name); @@ -1001,24 +1108,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char 
*name, ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *e = q->elevator; - struct elevator_type *elv; + struct elevator_type *elv = NULL; struct elevator_type *__e; int len = 0; - if (!q->elevator || !blk_queue_stackable(q)) + if (!blk_queue_stackable(q)) return sprintf(name, "none\n"); - elv = e->type; + if (!q->elevator) + len += sprintf(name+len, "[none] "); + else + elv = e->type; spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (!strcmp(elv->elevator_name, __e->elevator_name)) + if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); - else + continue; + } + if (__e->uses_mq && q->mq_ops) + len += sprintf(name+len, "%s ", __e->elevator_name); + else if (!__e->uses_mq && !q->mq_ops) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); + if (q->mq_ops && q->elevator) + len += sprintf(name+len, "none"); + len += sprintf(len+name, "\n"); return len; } diff --git a/block/genhd.c b/block/genhd.c index fcd6d4fae657..3631cd480295 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -572,6 +572,20 @@ exit: disk_part_iter_exit(&piter); } +void put_disk_devt(struct disk_devt *disk_devt) +{ + if (disk_devt && atomic_dec_and_test(&disk_devt->count)) + disk_devt->release(disk_devt); +} +EXPORT_SYMBOL(put_disk_devt); + +void get_disk_devt(struct disk_devt *disk_devt) +{ + if (disk_devt) + atomic_inc(&disk_devt->count); +} +EXPORT_SYMBOL(get_disk_devt); + /** * device_add_disk - add partitioning information to kernel list * @parent: parent device for the disk @@ -612,8 +626,15 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_alloc_events(disk); + /* + * Take a reference on the devt and assign it to queue since it + * must not be reallocated while the bdi is registered + */ + disk->queue->disk_devt = disk->disk_devt; + get_disk_devt(disk->disk_devt); + /* Register BDI before referencing it from bdev */ - bdi = &disk->queue->backing_dev_info; + bdi = disk->queue->backing_dev_info; bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, @@ -648,6 +669,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { + bdev_unhash_inode(MKDEV(disk->major, + disk->first_minor + part->partno)); invalidate_partition(disk, part->partno); delete_partition(disk, part->partno); } diff --git a/block/ioctl.c b/block/ioctl.c index ed2397f8de9d..7b88820b93d9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -8,7 +8,7 @@ #include <linux/fs.h> #include <linux/blktrace_api.h> #include <linux/pr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { @@ -45,6 +45,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user || pstart < 0 || plength < 0 || partno > 65535) return -EINVAL; } + /* check if partition is aligned to blocksize */ + if (p.start & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; mutex_lock(&bdev->bd_mutex); @@ -225,7 +228,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + struct address_space *mapping; + uint64_t start, end, len; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -235,18 +239,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, start = 
range[0]; len = range[1]; + end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) return -EINVAL; + if (end < start) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); + return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + false); } static int put_ushort(unsigned long arg, unsigned short val) @@ -496,7 +505,6 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - struct backing_dev_info *bdi; void __user *argp = (void __user *)arg; loff_t size; unsigned int max_sectors; @@ -513,14 +521,17 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); - return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512); + return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -547,8 +558,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); diff --git a/block/mq-deadline.c b/block/mq-deadline.c new file mode 100644 index 000000000000..236121633ca0 --- /dev/null +++ b/block/mq-deadline.c @@ -0,0 +1,556 @@ +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/elevator.h> +#include <linux/bio.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/rbtree.h> +#include <linux/sbitmap.h> + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * See Documentation/block/deadline-iosched.txt + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +struct deadline_data { + /* + * run time data + */ + + /* + * requests (deadline_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + /* + * next in sort order. 
read, write or both are NULL
+ */
+ struct request *next_rq[2];
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[2];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+
+ spinlock_t lock;
+ struct list_head dispatch;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+ return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(dd, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ if (dd->next_rq[data_dir] == rq)
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ list_del_init(&rq->queuelist);
+
+ /*
+ * We might not be on the rbtree, if we are doing an insert merge
+ */
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ deadline_del_rq_rb(dd, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+ enum elv_merge type)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ /*
+ * if the merge was a front merge, we need to reposition request
+ */
+ if (type == ELEVATOR_FRONT_MERGE) {
+ elv_rb_del(deadline_rb_root(dd, req), req);
+ deadline_add_rq_rb(dd, req);
+ }
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ dd->next_rq[READ] = NULL;
+ dd->next_rq[WRITE] = NULL;
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+ struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+ bool reads, writes;
+ int data_dir;
+
+ if (!list_empty(&dd->dispatch)) {
+ rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ reads = !list_empty(&dd->fifo_list[READ]);
+ writes = !list_empty(&dd->fifo_list[WRITE]);
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ if (dd->next_rq[WRITE])
+ rq = dd->next_rq[WRITE];
+ else
+ rq = dd->next_rq[READ];
+
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (reads) {
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+ if (writes && (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (writes) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = dd->next_rq[data_dir];
+ }
+
+ dd->batching = 0;
+
+dispatch_request:
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, rq);
+done:
+ rq->rq_flags |= RQF_STARTED;
+ return rq;
+}
+
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+
+ spin_lock(&dd->lock);
+ rq = __dd_dispatch_request(hctx);
+ spin_unlock(&dd->lock);
+
+ return rq;
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+ struct deadline_data *dd = e->elevator_data;
+
+ BUG_ON(!list_empty(&dd->fifo_list[READ]));
+ BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+ kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct deadline_data *dd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = dd;
+
+ INIT_LIST_HEAD(&dd->fifo_list[READ]);
+ INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+ dd->sort_list[READ] = RB_ROOT;
+ dd->sort_list[WRITE] = RB_ROOT;
+ dd->fifo_expire[READ] = read_expire;
+ dd->fifo_expire[WRITE] = write_expire;
+ dd->writes_starved = writes_starved;
+ dd->front_merges = 1;
+ dd->fifo_batch = fifo_batch;
+ spin_lock_init(&dd->lock);
+ INIT_LIST_HEAD(&dd->dispatch);
+
+ q->elevator = eq;
+ return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+ struct bio *bio)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ sector_t sector = bio_end_sector(bio);
+ struct request *__rq;
+
+ if (!dd->front_merges)
+ return ELEVATOR_NO_MERGE;
+
+ __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+ if (__rq) {
+ BUG_ON(sector != blk_rq_pos(__rq));
+
+ if (elv_bio_merge_ok(__rq, bio)) {
+ *rq = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ bool ret;
+
+ spin_lock(&dd->lock);
+ ret = blk_mq_sched_try_merge(q, bio, &free);
+ spin_unlock(&dd->lock);
+
+ if (free)
+ blk_mq_free_request(free);
+
+ return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const int data_dir = rq_data_dir(rq);
+
+ if (blk_mq_sched_try_insert_merge(q, rq))
+ return;
+
+ blk_mq_sched_request_inserted(rq);
+
+ if (at_head || blk_rq_is_passthrough(rq)) {
+ if (at_head)
+ list_add(&rq->queuelist, &dd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &dd->dispatch);
+ } else {
+ deadline_add_rq_rb(dd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+ }
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ spin_lock(&dd->lock);
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ dd_insert_request(hctx, rq, at_head);
+ }
+ spin_unlock(&dd->lock);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+ return !list_empty_careful(&dd->dispatch) ||
+ !list_empty_careful(&dd->fifo_list[0]) ||
+ !list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+ return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+ char *p = (char *) page;
+
+ *var = simple_strtol(p, &p, 10);
+ return count;
+}
+
+#define 
SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data; \ + int ret = deadline_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +#define DD_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ + deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +static struct elevator_type mq_deadline = { + .ops.mq = { + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_queue, + .exit_sched = dd_exit_queue, + }, + + .uses_mq = true, + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_owner = THIS_MODULE, +}; + +static int __init deadline_init(void) +{ + return elv_register(&mq_deadline); +} + +static void __exit deadline_exit(void) +{ + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index a163c487cf38..2d1b15d89b45 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e) } static struct elevator_type elevator_noop = { - .ops = { + .ops.sq = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, diff --git a/block/opal_proto.h b/block/opal_proto.h new file mode 100644 index 000000000000..f40c9acf8895 --- /dev/null +++ b/block/opal_proto.h @@ -0,0 +1,452 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli <rafael.antognolli@intel.com> + * Scott Bauer <scott.bauer@intel.com> + * + * This program is free 
software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/types.h> + +#ifndef _OPAL_PROTO_H +#define _OPAL_PROTO_H + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +#define DTAERROR_NO_METHOD_STATUS 0x89 +#define GENERIC_HOST_SESSION_NUM 0x41 + +#define TPER_SYNC_SUPPORTED 0x01 + +#define TINY_ATOM_DATA_MASK 0x3F +#define TINY_ATOM_SIGNED 0x40 + +#define SHORT_ATOM_ID 0x80 +#define SHORT_ATOM_BYTESTRING 0x20 +#define SHORT_ATOM_SIGNED 0x10 +#define SHORT_ATOM_LEN_MASK 0xF + +#define MEDIUM_ATOM_ID 0xC0 +#define MEDIUM_ATOM_BYTESTRING 0x10 +#define MEDIUM_ATOM_SIGNED 0x8 +#define MEDIUM_ATOM_LEN_MASK 0x7 + +#define LONG_ATOM_ID 0xe0 +#define LONG_ATOM_BYTESTRING 0x2 +#define LONG_ATOM_SIGNED 0x1 + +/* Derived from TCG Core spec 2.01 Section: + * 3.2.2.1 + * Data Type + */ +#define TINY_ATOM_BYTE 0x7F +#define SHORT_ATOM_BYTE 0xBF +#define MEDIUM_ATOM_BYTE 0xDF +#define LONG_ATOM_BYTE 0xE3 + +#define OPAL_INVAL_PARAM 12 +#define OPAL_MANUFACTURED_INACTIVE 0x08 +#define OPAL_DISCOVERY_COMID 0x0001 + +#define LOCKING_RANGE_NON_GLOBAL 0x03 +/* + * User IDs used in the TCG storage SSCs + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +#define OPAL_UID_LENGTH 8 +#define OPAL_METHOD_LENGTH 8 +#define OPAL_MSID_KEYLEN 15 +#define OPAL_UID_LENGTH_HALF 4 + +/* Enum to index OPALUID array */ +enum opal_uid { + /* users */ + OPAL_SMUID_UID, + OPAL_THISSP_UID, + OPAL_ADMINSP_UID, + OPAL_LOCKINGSP_UID, + OPAL_ENTERPRISE_LOCKINGSP_UID, + OPAL_ANYBODY_UID, + OPAL_SID_UID, + OPAL_ADMIN1_UID, + OPAL_USER1_UID, + OPAL_USER2_UID, + OPAL_PSID_UID, + OPAL_ENTERPRISE_BANDMASTER0_UID, + OPAL_ENTERPRISE_ERASEMASTER_UID, + /* tables */ + OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + OPAL_LOCKINGRANGE_ACE_WRLOCKED, + OPAL_MBRCONTROL, + OPAL_MBR, + OPAL_AUTHORITY_TABLE, + OPAL_C_PIN_TABLE, + OPAL_LOCKING_INFO_TABLE, + OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + /* C_PIN_TABLE object ID's */ + OPAL_C_PIN_MSID, + OPAL_C_PIN_SID, + OPAL_C_PIN_ADMIN1, + /* half UID's (only first 4 bytes used) */ + OPAL_HALF_UID_AUTHORITY_OBJ_REF, + OPAL_HALF_UID_BOOLEAN_ACE, + /* omitted optional parameter */ + OPAL_UID_HEXFF, +}; + +#define OPAL_METHOD_LENGTH 8 + +/* Enum for indexing the OPALMETHOD array */ +enum opal_method { + OPAL_PROPERTIES, + OPAL_STARTSESSION, + OPAL_REVERT, + OPAL_ACTIVATE, + OPAL_EGET, + OPAL_ESET, + OPAL_NEXT, + OPAL_EAUTHENTICATE, + OPAL_GETACL, + OPAL_GENKEY, + OPAL_REVERTSP, + OPAL_GET, + OPAL_SET, + OPAL_AUTHENTICATE, + OPAL_RANDOM, + OPAL_ERASE, +}; + +enum opal_token { + /* Boolean */ + OPAL_TRUE = 0x01, + OPAL_FALSE = 0x00, + OPAL_BOOLEAN_EXPR = 0x03, + /* cellblocks */ + OPAL_TABLE = 0x00, + 
OPAL_STARTROW = 0x01, + OPAL_ENDROW = 0x02, + OPAL_STARTCOLUMN = 0x03, + OPAL_ENDCOLUMN = 0x04, + OPAL_VALUES = 0x01, + /* authority table */ + OPAL_PIN = 0x03, + /* locking tokens */ + OPAL_RANGESTART = 0x03, + OPAL_RANGELENGTH = 0x04, + OPAL_READLOCKENABLED = 0x05, + OPAL_WRITELOCKENABLED = 0x06, + OPAL_READLOCKED = 0x07, + OPAL_WRITELOCKED = 0x08, + OPAL_ACTIVEKEY = 0x0A, + /* locking info table */ + OPAL_MAXRANGES = 0x04, + /* mbr control */ + OPAL_MBRENABLE = 0x01, + OPAL_MBRDONE = 0x02, + /* properties */ + OPAL_HOSTPROPERTIES = 0x00, + /* atoms */ + OPAL_STARTLIST = 0xf0, + OPAL_ENDLIST = 0xf1, + OPAL_STARTNAME = 0xf2, + OPAL_ENDNAME = 0xf3, + OPAL_CALL = 0xf8, + OPAL_ENDOFDATA = 0xf9, + OPAL_ENDOFSESSION = 0xfa, + OPAL_STARTTRANSACTON = 0xfb, + OPAL_ENDTRANSACTON = 0xfC, + OPAL_EMPTYATOM = 0xff, + OPAL_WHERE = 0x00, +}; + +/* Locking state for a locking range */ +enum opal_lockingstate { + OPAL_LOCKING_READWRITE = 0x01, + OPAL_LOCKING_READONLY = 0x02, + OPAL_LOCKING_LOCKED = 0x03, +}; + +/* Packets derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Secion: 3.2.3 ComPackets, Packets & Subpackets + */ + +/* Comm Packet (header) for transmissions. */ +struct opal_compacket { + __be32 reserved0; + u8 extendedComID[4]; + __be32 outstandingData; + __be32 minTransfer; + __be32 length; +}; + +/* Packet structure. */ +struct opal_packet { + __be32 tsn; + __be32 hsn; + __be32 seq_number; + __be16 reserved0; + __be16 ack_type; + __be32 acknowledgment; + __be32 length; +}; + +/* Data sub packet header */ +struct opal_data_subpacket { + u8 reserved0[6]; + __be16 kind; + __be32 length; +}; + +/* header of a response */ +struct opal_header { + struct opal_compacket cp; + struct opal_packet pkt; + struct opal_data_subpacket subpkt; +}; + +#define FC_TPER 0x0001 +#define FC_LOCKING 0x0002 +#define FC_GEOMETRY 0x0003 +#define FC_ENTERPRISE 0x0100 +#define FC_DATASTORE 0x0202 +#define FC_SINGLEUSER 0x0201 +#define FC_OPALV100 0x0200 +#define FC_OPALV200 0x0203 + +/* + * The Discovery 0 Header. As defined in + * Opal SSC Documentation + * Section: 3.3.5 Capability Discovery + */ +struct d0_header { + __be32 length; /* the length of the header 48 in 2.00.100 */ + __be32 revision; /**< revision of the header 1 in 2.00.100 */ + __be32 reserved01; + __be32 reserved02; + /* + * the remainder of the structure is vendor specific and will not be + * addressed now + */ + u8 ignored[32]; +}; + +/* + * TPer Feature Descriptor. Contains flags indicating support for the + * TPer features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x001 in 2.00.100 + */ +struct d0_tper_features { + /* + * supported_features bits: + * bit 7: reserved + * bit 6: com ID management + * bit 5: reserved + * bit 4: streaming support + * bit 3: buffer management + * bit 2: ACK/NACK + * bit 1: async + * bit 0: sync + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Locking Feature Descriptor. Contains flags indicating support for the + * locking features described in the OPAL specification. 
The names match the + * OPAL terminology + * + * code == 0x0002 in 2.00.100 + */ +struct d0_locking_features { + /* + * supported_features bits: + * bits 6-7: reserved + * bit 5: MBR done + * bit 4: MBR enabled + * bit 3: media encryption + * bit 2: locked + * bit 1: locking enabled + * bit 0: locking supported + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Geometry Feature Descriptor. Contains flags indicating support for the + * geometry features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0003 in 2.00.100 + */ +struct d0_geometry_features { + /* + * skip 32 bits from header, needed to align the struct to 64 bits. + */ + u8 header[4]; + /* + * reserved01: + * bits 1-6: reserved + * bit 0: align + */ + u8 reserved01; + u8 reserved02[7]; + __be32 logical_block_size; + __be64 alignment_granularity; + __be64 lowest_aligned_lba; +}; + +/* + * Enterprise SSC Feature + * + * code == 0x0100 + */ +struct d0_enterprise_ssc { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + u8 reserved01; + __be16 reserved02; + __be32 reserved03; + __be32 reserved04; +}; + +/* + * Opal V1 feature + * + * code == 0x0200 + */ +struct d0_opal_v100 { + __be16 baseComID; + __be16 numComIDs; +}; + +/* + * Single User Mode feature + * + * code == 0x0201 + */ +struct d0_single_user_mode { + __be32 num_locking_objects; + /* reserved01: + * bit 0: any + * bit 1: all + * bit 2: policy + * bits 3-7: reserved + */ + u8 reserved01; + u8 reserved02; + __be16 reserved03; + __be32 reserved04; +}; + +/* + * Additonal Datastores feature + * + * code == 0x0202 + */ +struct d0_datastore_table { + __be16 reserved01; + __be16 max_tables; + __be32 max_size_tables; + __be32 table_size_alignment; +}; + +/* + * OPAL 2.0 feature + * + * code == 0x0203 + */ +struct d0_opal_v200 { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + /* num_locking_admin_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_admin_auth[2]; + /* num_locking_user_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_user_auth[2]; + u8 initialPIN; + u8 revertedPIN; + u8 reserved01; + __be32 reserved02; +}; + +/* Union of features used to parse the discovery 0 response */ +struct d0_features { + __be16 code; + /* + * r_version bits: + * bits 4-7: version + * bits 0-3: reserved + */ + u8 r_version; + u8 length; + u8 features[]; +}; + +#endif /* _OPAL_PROTO_H */ diff --git a/block/partition-generic.c b/block/partition-generic.c index 71d9ed9df8da..7afb9907821f 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -430,6 +430,56 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } +static bool part_zone_aligned(struct gendisk *disk, + struct block_device *bdev, + sector_t from, sector_t size) +{ + unsigned int zone_sectors = bdev_zone_sectors(bdev); + + /* + * If this function is called, then the disk is a zoned block device + * (host-aware or host-managed). This can be detected even if the + * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not + * set). 
In this case, however, only host-aware devices will be seen + * as a block device is not created for host-managed devices. Without + * zoned block device support, host-aware drives can still be used as + * regular block devices (no zone operation) and their zone size will + * be reported as 0. Allow this case. + */ + if (!zone_sectors) + return true; + + /* + * Check partition start and size alignement. If the drive has a + * smaller last runt zone, ignore it and allow the partition to + * use it. Check the zone size too: it should be a power of 2 number + * of sectors. + */ + if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) { + u32 rem; + + div_u64_rem(from, zone_sectors, &rem); + if (rem) + return false; + if ((from + size) < get_capacity(disk)) { + div_u64_rem(size, zone_sectors, &rem); + if (rem) + return false; + } + + } else { + + if (from & (zone_sectors - 1)) + return false; + if ((from + size) < get_capacity(disk) && + (size & (zone_sectors - 1))) + return false; + + } + + return true; +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state = NULL; @@ -529,6 +579,21 @@ rescan: } } + /* + * On a zoned block device, partitions should be aligned on the + * device zone size (i.e. zone boundary crossing not allowed). + * Otherwise, resetting the write pointer of the last zone of + * one partition may impact the following partition. + */ + if (bdev_is_zoned(bdev) && + !part_zone_aligned(disk, bdev, from, size)) { + printk(KERN_WARNING + "%s: p%d start %llu+%llu is not zone aligned\n", + disk->disk_name, p, (unsigned long long) from, + (unsigned long long) size); + continue; + } + part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bcd86e5cd546..39f70d968754 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partition Entry Array CRC check failed.\n"); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 47a61474e795..14b081af8d61 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <asm/dasd.h> #include <asm/ebcdic.h> -#include <asm/uaccess.h> 
+#include <linux/uaccess.h> #include <asm/vtoc.h> #include "check.h" diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 0774799942e0..2a2fc768b27a 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -28,7 +28,7 @@ #include <linux/slab.h> #include <linux/times.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> @@ -182,6 +182,9 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(WRITE_16, filter->write_ok); __set_bit(WRITE_LONG, filter->write_ok); __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(WRITE_SAME, filter->write_ok); + __set_bit(WRITE_SAME_16, filter->write_ok); + __set_bit(WRITE_SAME_32, filter->write_ok); __set_bit(ERASE, filter->write_ok); __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); __set_bit(MODE_SELECT, filter->write_ok); @@ -227,15 +230,17 @@ EXPORT_SYMBOL(blk_verify_command); static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, struct sg_io_hdr *hdr, fmode_t mode) { - if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) + struct scsi_request *req = scsi_req(rq); + + if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) return -EPERM; /* * fill in request structure */ - rq->cmd_len = hdr->cmd_len; + req->cmd_len = hdr->cmd_len; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -251,6 +256,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, struct bio *bio) { + struct scsi_request *req = scsi_req(rq); int r, ret = 0; /* @@ -264,13 +270,13 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; - hdr->resid = rq->resid_len; + hdr->resid = req->resid_len; hdr->sb_len_wr = 0; - if (rq->sense_len && hdr->sbp) { - int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); + if (req->sense_len && hdr->sbp) { + int len = min((unsigned int) hdr->mx_sb_len, req->sense_len); - if (!copy_to_user(hdr->sbp, rq->sense, len)) + if (!copy_to_user(hdr->sbp, req->sense, len)) hdr->sb_len_wr = len; else ret = -EFAULT; @@ -291,7 +297,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, int writing = 0; int at_head = 0; struct request *rq; - char sense[SCSI_SENSE_BUFFERSIZE]; + struct scsi_request *req; struct bio *bio; if (hdr->interface_id != 'S') @@ -315,14 +321,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, at_head = 1; ret = -ENOMEM; - rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); + rq = blk_get_request(q, writing ? 
REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + GFP_KERNEL); if (IS_ERR(rq)) return PTR_ERR(rq); - blk_rq_set_block_pc(rq); + req = scsi_req(rq); + scsi_req_init(rq); if (hdr->cmd_len > BLK_MAX_CDB) { - rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); - if (!rq->cmd) + req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!req->cmd) goto out_put_request; } @@ -354,9 +362,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; bio = rq->bio; - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; rq->retries = 0; start_time = jiffies; @@ -372,8 +377,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, ret = blk_complete_sghdr_rq(rq, hdr, bio); out_free_cdb: - if (rq->cmd != rq->__cmd) - kfree(rq->cmd); + scsi_req_free_cmd(req); out_put_request: blk_put_request(rq); return ret; @@ -417,9 +421,10 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, struct scsi_ioctl_command __user *sic) { struct request *rq; + struct scsi_request *req; int err; unsigned int in_len, out_len, bytes, opcode, cmdlen; - char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; + char *buffer = NULL; if (!sic) return -EINVAL; @@ -444,12 +449,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); + rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; } - blk_rq_set_block_pc(rq); + req = scsi_req(rq); + scsi_req_init(rq); cmdlen = COMMAND_SIZE(opcode); @@ -457,14 +464,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, * get command and data to send to device, if any */ err = -EFAULT; - rq->cmd_len = cmdlen; - if (copy_from_user(rq->cmd, sic->data, cmdlen)) + req->cmd_len = cmdlen; + if (copy_from_user(req->cmd, sic->data, cmdlen)) goto error; if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode & FMODE_WRITE); if (err) goto error; @@ -500,18 +507,14 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; } - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - blk_execute_rq(q, disk, rq, 0); err = rq->errors & 0xff; /* only 8 bit SCSI status */ if (err) { - if (rq->sense_len && rq->sense) { - bytes = (OMAX_SB_LEN > rq->sense_len) ? - rq->sense_len : OMAX_SB_LEN; - if (copy_to_user(sic->data, rq->sense, bytes)) + if (req->sense_len && req->sense) { + bytes = (OMAX_SB_LEN > req->sense_len) ? 
+ req->sense_len : OMAX_SB_LEN; + if (copy_to_user(sic->data, req->sense, bytes)) err = -EFAULT; } } else { @@ -536,14 +539,14 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_RECLAIM); + rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - blk_rq_set_block_pc(rq); + scsi_req_init(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - rq->cmd[0] = cmd; - rq->cmd[4] = data; - rq->cmd_len = 6; + scsi_req(rq)->cmd[0] = cmd; + scsi_req(rq)->cmd[4] = data; + scsi_req(rq)->cmd_len = 6; err = blk_execute_rq(q, bd_disk, rq, 0); blk_put_request(rq); @@ -740,6 +743,17 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, } EXPORT_SYMBOL(scsi_cmd_blk_ioctl); +void scsi_req_init(struct request *rq) +{ + struct scsi_request *req = scsi_req(rq); + + memset(req->__cmd, 0, sizeof(req->__cmd)); + req->cmd = req->__cmd; + req->cmd_len = BLK_MAX_CDB; + req->sense_len = 0; +} +EXPORT_SYMBOL(scsi_req_init); + static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); diff --git a/block/sed-opal.c b/block/sed-opal.c new file mode 100644 index 000000000000..d1c52ba4d62d --- /dev/null +++ b/block/sed-opal.c @@ -0,0 +1,2488 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Scott Bauer <scott.bauer@intel.com> + * Rafael Antognolli <rafael.antognolli@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/genhd.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <uapi/linux/sed-opal.h> +#include <linux/sed-opal.h> +#include <linux/string.h> +#include <linux/kdev_t.h> + +#include "opal_proto.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +typedef int (*opal_step)(struct opal_dev *dev); + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. 
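+ * For example, a bare method-status list such as F0 00 00 00 F1
+ * (STARTLIST, three tiny unsigned atoms, ENDLIST) parses into five
+ * opal_resp_tok entries whose pos pointers reference the response
+ * buffer directly; token bytes are never copied out of it.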
+ */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +struct opal_dev { + bool supported; + + void *data; + sec_send_recv *send_recv; + + const opal_step *funcs; + void **func_data; + int state; + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 cmd[IO_BUFFER_LENGTH]; + u8 resp[IO_BUFFER_LENGTH]; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + + +static const u8 opaluid[][OPAL_UID_LENGTH] = { + /* users */ + [OPAL_SMUID_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [OPAL_THISSP_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ADMINSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [OPAL_ENTERPRISE_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_ANYBODY_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_SID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ADMIN1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_USER1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [OPAL_USER2_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + [OPAL_PSID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + [OPAL_ENTERPRISE_BANDMASTER0_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 }, + [OPAL_ENTERPRISE_ERASEMASTER_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, + + /* tables */ + + [OPAL_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [OPAL_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [OPAL_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [OPAL_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + + /* C_PIN_TABLE object ID's */ + + [OPAL_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [OPAL_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [OPAL_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + + [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [OPAL_HALF_UID_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, + + /* special value for omitted optional parameter */ + [OPAL_UID_HEXFF] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; + +/* + * TCG Storage SSC Methods. 
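+ * Each entry below is the 8-byte UID of the method being invoked; it is
+ * sent immediately after the invoking object's UID in an OPAL_CALL.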
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +static const u8 opalmethod[][OPAL_UID_LENGTH] = { + [OPAL_PROPERTIES] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [OPAL_STARTSESSION] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [OPAL_REVERT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [OPAL_ACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [OPAL_EGET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ESET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 }, + [OPAL_NEXT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [OPAL_EAUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c }, + [OPAL_GETACL] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [OPAL_GENKEY] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [OPAL_REVERTSP] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [OPAL_GET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [OPAL_SET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [OPAL_AUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [OPAL_RANDOM] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [OPAL_ERASE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +typedef int (cont_fn)(struct opal_dev *dev); + +static int end_opal_session_error(struct opal_dev *dev); + +struct opal_suspend_data { + struct opal_lock_unlock unlk; + u8 lr; + struct list_head node; +}; + +/* + * Derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 5.1.5 Method Status Codes + */ +static const char * const opal_errors[] = { + "Success", + "Not Authorized", + "Unknown Error", + "SP Busy", + "SP Failed", + "SP Disabled", + "SP Frozen", + "No Sessions Available", + "Uniqueness Conflict", + "Insufficient Space", + "Insufficient Rows", + "Invalid Function", + "Invalid Parameter", + "Invalid Reference", + "Unknown Error", + "TPER Malfunction", + "Transaction Failure", + "Response Overflow", + "Authority Locked Out", +}; + +static const char *opal_error_to_human(int error) +{ + if (error == 0x3f) + return "Failed"; + + if (error >= ARRAY_SIZE(opal_errors) || error < 0) + return "Unknown Error"; + + return opal_errors[error]; +} + +static void print_buffer(const u8 *ptr, u32 length) +{ +#ifdef DEBUG + print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length); + pr_debug("\n"); +#endif +} + +static bool check_tper(const void *data) +{ + const struct d0_tper_features *tper = data; + u8 flags = tper->supported_features; + + if (!(flags & TPER_SYNC_SUPPORTED)) { + pr_err("TPer sync not supported. 
flags = %d\n", + tper->supported_features); + return false; + } + + return true; +} + +static bool check_sum(const void *data) +{ + const struct d0_single_user_mode *sum = data; + u32 nlo = be32_to_cpu(sum->num_locking_objects); + + if (nlo == 0) { + pr_err("Need at least one locking object.\n"); + return false; + } + + pr_debug("Number of locking objects: %d\n", nlo); + + return true; +} + +static u16 get_comid_v100(const void *data) +{ + const struct d0_opal_v100 *v100 = data; + + return be16_to_cpu(v100->baseComID); +} + +static u16 get_comid_v200(const void *data) +{ + const struct d0_opal_v200 *v200 = data; + + return be16_to_cpu(v200->baseComID); +} + +static int opal_send_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->cmd, IO_BUFFER_LENGTH, + true); +} + +static int opal_recv_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->resp, IO_BUFFER_LENGTH, + false); +} + +static int opal_recv_check(struct opal_dev *dev) +{ + size_t buflen = IO_BUFFER_LENGTH; + void *buffer = dev->resp; + struct opal_header *hdr = buffer; + int ret; + + do { + pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n", + hdr->cp.outstandingData, + hdr->cp.minTransfer); + + if (hdr->cp.outstandingData == 0 || + hdr->cp.minTransfer != 0) + return 0; + + memset(buffer, 0, buflen); + ret = opal_recv_cmd(dev); + } while (!ret); + + return ret; +} + +static int opal_send_recv(struct opal_dev *dev, cont_fn *cont) +{ + int ret; + + ret = opal_send_cmd(dev); + if (ret) + return ret; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + ret = opal_recv_check(dev); + if (ret) + return ret; + return cont(dev); +} + +static void check_geometry(struct opal_dev *dev, const void *data) +{ + const struct d0_geometry_features *geo = data; + + dev->align = geo->alignment_granularity; + dev->lowest_lba = geo->lowest_aligned_lba; +} + +static int next(struct opal_dev *dev) +{ + opal_step func; + int error = 0; + + do { + func = dev->funcs[dev->state]; + if (!func) + break; + + error = func(dev); + if (error) { + pr_err("Error on step function: %d with error %d: %s\n", + dev->state, error, + opal_error_to_human(error)); + + /* For each OPAL command we do a discovery0 then we + * start some sort of session. + * If we haven't passed state 1 then there was an error + * on discovery0 or during the attempt to start a + * session. Therefore we shouldn't attempt to terminate + * a session, as one has not yet been created. 
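+ * (A step at index N only runs once steps 0..N-1 have succeeded, so a
+ * failure with dev->state > 1 means discovery0 and the session start
+ * both completed and the open session must be torn down.)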
+ */
+ if (dev->state > 1)
+ return end_opal_session_error(dev);
+ }
+ dev->state++;
+ } while (!error);
+
+ return error;
+}
+
+static int opal_discovery0_end(struct opal_dev *dev)
+{
+ bool found_com_id = false, supported = true, single_user = false;
+ const struct d0_header *hdr = (struct d0_header *)dev->resp;
+ const u8 *epos = dev->resp, *cpos = dev->resp;
+ u16 comid = 0;
+
+ print_buffer(dev->resp, be32_to_cpu(hdr->length));
+
+ epos += be32_to_cpu(hdr->length); /* end of buffer */
+ cpos += sizeof(*hdr); /* current position on buffer */
+
+ while (cpos < epos && supported) {
+ const struct d0_features *body =
+ (const struct d0_features *)cpos;
+
+ switch (be16_to_cpu(body->code)) {
+ case FC_TPER:
+ supported = check_tper(body->features);
+ break;
+ case FC_SINGLEUSER:
+ single_user = check_sum(body->features);
+ break;
+ case FC_GEOMETRY:
+ check_geometry(dev, body);
+ break;
+ case FC_LOCKING:
+ case FC_ENTERPRISE:
+ case FC_DATASTORE:
+ /* some ignored properties */
+ pr_debug("Found OPAL feature description: %d\n",
+ be16_to_cpu(body->code));
+ break;
+ case FC_OPALV100:
+ comid = get_comid_v100(body->features);
+ found_com_id = true;
+ break;
+ case FC_OPALV200:
+ comid = get_comid_v200(body->features);
+ found_com_id = true;
+ break;
+ case 0xbfff ... 0xffff:
+ /* vendor specific, just ignore */
+ break;
+ default:
+ pr_debug("OPAL Unknown feature: %d\n",
+ be16_to_cpu(body->code));
+
+ }
+ cpos += body->length + 4;
+ }
+
+ if (!supported) {
+ pr_debug("This device is not Opal enabled. Not Supported!\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!single_user)
+ pr_debug("Device doesn't support single user mode\n");
+
+
+ if (!found_com_id) {
+ pr_debug("Could not find OPAL comid for device. Returning early\n");
+ return -EOPNOTSUPP;
+ }
+
+ dev->comid = comid;
+
+ return 0;
+}
+
+static int opal_discovery0(struct opal_dev *dev)
+{
+ int ret;
+
+ memset(dev->resp, 0, IO_BUFFER_LENGTH);
+ dev->comid = OPAL_DISCOVERY_COMID;
+ ret = opal_recv_cmd(dev);
+ if (ret)
+ return ret;
+ return opal_discovery0_end(dev);
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+ if (*err)
+ return;
+ if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
+ pr_err("Error adding u8: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+ cmd->cmd[cmd->pos++] = tok;
+}
+
+static void add_short_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 atom;
+ int err = 0;
+
+ atom = SHORT_ATOM_ID;
+ atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0;
+ atom |= has_sign ? SHORT_ATOM_SIGNED : 0;
+ atom |= len & SHORT_ATOM_LEN_MASK;
+
+ add_token_u8(&err, cmd, atom);
+}
+
+static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 header0;
+
+ header0 = MEDIUM_ATOM_ID;
+ header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
+ header0 |= has_sign ?
MEDIUM_ATOM_SIGNED : 0; + header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; + cmd->cmd[cmd->pos++] = len; +} + +static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) +{ + + size_t len; + int msb; + u8 n; + + if (!(number & ~TINY_ATOM_DATA_MASK)) { + add_token_u8(err, cmd, number); + return; + } + + msb = fls(number); + len = DIV_ROUND_UP(msb, 4); + + if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { + pr_err("Error adding u64: end of buffer.\n"); + *err = -ERANGE; + return; + } + add_short_atom_header(cmd, false, false, len); + while (len--) { + n = number >> (len * 8); + add_token_u8(err, cmd, n); + } +} + +static void add_token_bytestring(int *err, struct opal_dev *cmd, + const u8 *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) + return; + + if (len & ~SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { + pr_err("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) + add_short_atom_header(cmd, true, false, len); + else + add_medium_atom_header(cmd, true, false, len); + + memcpy(&cmd->cmd[cmd->pos], bytestring, len); + cmd->pos += len; + +} + +static int build_locking_range(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range. Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + if (lr == 0) + return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = lr; + + return 0; +} + +static int build_locking_user(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_err("Can't build locking range user, Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + buffer[7] = lr + 1; + + return 0; +} + +static void set_comid(struct opal_dev *cmd, u16 comid) +{ + struct opal_header *hdr = (struct opal_header *)cmd->cmd; + + hdr->cp.extendedComID[0] = comid >> 8; + hdr->cp.extendedComID[1] = comid; + hdr->cp.extendedComID[2] = 0; + hdr->cp.extendedComID[3] = 0; +} + +static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) +{ + struct opal_header *hdr; + int err = 0; + + add_token_u8(&err, cmd, OPAL_ENDOFDATA); + add_token_u8(&err, cmd, OPAL_STARTLIST); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, OPAL_ENDLIST); + + if (err) { + pr_err("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct opal_header *) cmd->cmd; + + hdr->pkt.tsn = cpu_to_be32(tsn); + hdr->pkt.hsn = cpu_to_be32(hsn); + + hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); + while (cmd->pos % 4) { + if (cmd->pos >= IO_BUFFER_LENGTH) { + pr_err("Error: Buffer overrun\n"); + return -ERANGE; + } + cmd->cmd[cmd->pos++] = 0; + } + hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) - + sizeof(hdr->pkt)); + hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp)); + + return 0; +} + +static enum opal_response_token token_type(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return OPAL_DTA_TOKENID_INVALID; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return OPAL_DTA_TOKENID_INVALID; + } + + return tok->type; +} + +/* + * This function returns 0 in case of invalid 
token. One should call + * token_type() first to find out if the token is valid or not. + */ +static enum opal_token response_get_token(const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (n >= resp->num) { + pr_err("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return 0; + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_err("Token length must be non-zero\n"); + return 0; + } + + return tok->pos[0]; +} + +static size_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->width = OPAL_WIDTH_TINY; + + if (pos[0] & TINY_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + tok->type = OPAL_DTA_TOKENID_UINT; + tok->stored.u = pos[0] & 0x3f; + } + + return tok->len; +} + +static size_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; + tok->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SHORT_ATOM_BYTESTRING) { + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SHORT_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + u64 u_integer = 0; + int i, b = 0; + + tok->type = OPAL_DTA_TOKENID_UINT; + if (tok->len > 9) { + pr_warn("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = tok->len - 1; i > 0; i--) { + u_integer |= ((u64)pos[i] << (8 * b)); + b++; + } + tok->stored.u = u_integer; + } + + return tok->len; +} + +static size_t response_parse_medium(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; + tok->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & MEDIUM_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & MEDIUM_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; + tok->width = OPAL_WIDTH_LONG; + + if (pos[0] & LONG_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & LONG_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static size_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->type = OPAL_DTA_TOKENID_TOKEN; + tok->width = OPAL_WIDTH_TOKEN; + + return tok->len; +} + +static int response_parse(const u8 *buf, size_t length, + struct parsed_resp *resp) +{ + const struct opal_header *hdr; + struct opal_resp_tok *iter; + int num_entries = 0; + int total; + size_t token_length; + const u8 *pos; + + if (!buf) + return -EFAULT; + + if (!resp) + return -EFAULT; + + hdr = (struct opal_header *)buf; + pos = buf; + pos += sizeof(*hdr); + + pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", + be32_to_cpu(hdr->cp.length), + be32_to_cpu(hdr->pkt.length), + be32_to_cpu(hdr->subpkt.length)); + + if (hdr->cp.length == 0 || hdr->pkt.length == 0 || + hdr->subpkt.length == 0) { + pr_err("Bad header length. 
cp: %d, pkt: %d, subpkt: %d\n",
+ be32_to_cpu(hdr->cp.length),
+ be32_to_cpu(hdr->pkt.length),
+ be32_to_cpu(hdr->subpkt.length));
+ print_buffer(pos, sizeof(*hdr));
+ return -EINVAL;
+ }
+
+ if (pos > buf + length)
+ return -EFAULT;
+
+ iter = resp->toks;
+ total = be32_to_cpu(hdr->subpkt.length);
+ print_buffer(pos, total);
+ while (total > 0) {
+ if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
+ token_length = response_parse_tiny(iter, pos);
+ else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */
+ token_length = response_parse_short(iter, pos);
+ else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */
+ token_length = response_parse_medium(iter, pos);
+ else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
+ token_length = response_parse_long(iter, pos);
+ else /* TOKEN */
+ token_length = response_parse_token(iter, pos);
+
+ if (token_length == -EINVAL)
+ return -EINVAL;
+
+ pos += token_length;
+ total -= token_length;
+ iter++;
+ num_entries++;
+ }
+
+ if (num_entries == 0) {
+ pr_err("Couldn't parse response.\n");
+ return -EINVAL;
+ }
+ resp->num = num_entries;
+
+ return 0;
+}
+
+static size_t response_get_string(const struct parsed_resp *resp, int n,
+ const char **store)
+{
+ *store = NULL;
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+ pr_err("Token is not a byte string!\n");
+ return 0;
+ }
+
+ *store = resp->toks[n].pos + 1;
+ return resp->toks[n].len - 1;
+}
+
+static u64 response_get_u64(const struct parsed_resp *resp, int n)
+{
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
+ pr_err("Token is not unsigned int: %d\n",
+ resp->toks[n].type);
+ return 0;
+ }
+
+ if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
+ resp->toks[n].width == OPAL_WIDTH_SHORT)) {
+ pr_err("Atom is not short or tiny: %d\n",
+ resp->toks[n].width);
+ return 0;
+ }
+
+ return resp->toks[n].stored.u;
+}
+
+static u8 response_status(const struct parsed_resp *resp)
+{
+ if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
+ response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+ return 0;
+ }
+
+ if (resp->num < 5)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
+ token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
+ response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
+ response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ return response_get_u64(resp, resp->num - 4);
+}
+
+/* Parses and checks for errors */
+static int parse_and_check_status(struct opal_dev *dev)
+{
+ int error;
+
+ print_buffer(dev->cmd, dev->pos);
+
+ error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
+ if (error) {
+ pr_err("Couldn't parse response.\n");
+ return error;
+ }
+
+ return response_status(&dev->parsed);
+}
+
+static void clear_opal_cmd(struct opal_dev *dev)
+{
+ dev->pos = sizeof(struct opal_header);
+ memset(dev->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static int start_opal_session_cont(struct opal_dev *dev)
+{
+ u32 hsn, tsn;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+
+ hsn = response_get_u64(&dev->parsed, 4);
+ tsn = response_get_u64(&dev->parsed, 5);
+
+ if (hsn == 0 && tsn == 0) {
+
pr_err("Couldn't authenticate session\n"); + return -EPERM; + } + + dev->hsn = hsn; + dev->tsn = tsn; + return 0; +} + +static void add_suspend_info(struct opal_dev *dev, + struct opal_suspend_data *sus) +{ + struct opal_suspend_data *iter; + + list_for_each_entry(iter, &dev->unlk_lst, node) { + if (iter->lr == sus->lr) { + list_del(&iter->node); + kfree(iter); + break; + } + } + list_add_tail(&sus->node, &dev->unlk_lst); +} + +static int end_session_cont(struct opal_dev *dev) +{ + dev->hsn = 0; + dev->tsn = 0; + return parse_and_check_status(dev); +} + +static int finalize_and_send(struct opal_dev *dev, cont_fn cont) +{ + int ret; + + ret = cmd_finalize(dev, dev->hsn, dev->tsn); + if (ret) { + pr_err("Error finalizing command buffer: %d\n", ret); + return ret; + } + + print_buffer(dev->cmd, dev->pos); + + return opal_send_recv(dev, cont); +} + +static int gen_key(struct opal_dev *dev) +{ + const u8 *method; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); + method = opalmethod[OPAL_GENKEY]; + kfree(dev->prev_data); + dev->prev_data = NULL; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building gen key command\n"); + return err; + + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_active_key_cont(struct opal_dev *dev) +{ + const char *activekey; + size_t keylen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); + if (!activekey) { + pr_err("%s: Couldn't extract the Activekey from the response\n", + __func__); + return OPAL_INVAL_PARAM; + } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); + + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = keylen; + + return 0; +} + +static int get_active_key(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + u8 *lr; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + lr = dev->func_data[dev->state]; + + err = build_locking_range(uid, sizeof(uid), *lr); + if (err) + return err; + + err = 0; + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* startCloumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* endColumn */ + add_token_u8(&err, dev, 10); /* ActiveKey */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building get active key command\n"); + return err; + } + + return finalize_and_send(dev, get_active_key_cont); +} + +static int generic_lr_enable_disable(struct opal_dev *dev, + u8 *uid, bool rle, bool wle, + bool rl, bool wl) +{ + int err = 0; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + + add_token_u8(&err, 
dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* ReadLockEnabled */ + add_token_u8(&err, dev, rle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /* WriteLockEnabled */ + add_token_u8(&err, dev, wle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, rl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, wl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + return err; +} + +static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, + struct opal_user_lr_setup *setup) +{ + int err; + + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, + 0, 0); + if (err) + pr_err("Failed to create enable global lr command\n"); + return err; +} + +static int setup_locking_range(struct opal_dev *dev) +{ + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup; + u8 lr; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + setup = dev->func_data[dev->state]; + lr = setup->session.opal_key.lr; + err = build_locking_range(uid, sizeof(uid), lr); + if (err) + return err; + + if (lr == 0) + err = enable_global_lr(dev, uid, setup); + else { + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Ranges Start */ + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* Ranges length */ + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /*ReadLockEnabled */ + add_token_u64(&err, dev, !!setup->RLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 6); /*WriteLockEnabled*/ + add_token_u64(&err, dev, !!setup->WLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } + if (err) { + pr_err("Error building Setup Locking range command.\n"); + return err; + + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int start_generic_opal_session(struct opal_dev *dev, + enum opal_uid auth, + enum opal_uid sp_type, + const char *key, + u8 key_len) +{ + u32 hsn; + int err = 0; + + if (key == NULL && auth != OPAL_ANYBODY_UID) { + pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \ + "Challenge, and not as the Anybody UID\n", __func__); + return OPAL_INVAL_PARAM; + } + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, 
dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + + switch (auth) { + case OPAL_ANYBODY_UID: + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + case OPAL_ADMIN1_UID: + case OPAL_SID_UID: + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); /* HostChallenge */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* HostSignAuth */ + add_token_bytestring(&err, dev, opaluid[auth], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + break; + default: + pr_err("Cannot start Admin SP session with auth %d\n", auth); + return OPAL_INVAL_PARAM; + } + + if (err) { + pr_err("Error building start adminsp session command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int start_anybodyASP_opal_session(struct opal_dev *dev) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_ADMINSP_UID, NULL, 0); +} + +static int start_SIDASP_opal_session(struct opal_dev *dev) +{ + int ret; + const u8 *key = dev->prev_data; + struct opal_key *okey; + + if (!key) { + okey = dev->func_data[dev->state]; + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); + } else { + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + key, dev->prev_d_len); + kfree(key); + dev->prev_data = NULL; + } + return ret; +} + +static inline int start_admin1LSP_opal_session(struct opal_dev *dev) +{ + struct opal_key *key = dev->func_data[dev->state]; + + return start_generic_opal_session(dev, OPAL_ADMIN1_UID, + OPAL_LOCKINGSP_UID, + key->key, key->key_len); +} + +static int start_auth_opal_session(struct opal_dev *dev) +{ + u8 lk_ul_user[OPAL_UID_LENGTH]; + int err = 0; + + struct opal_session_info *session = dev->func_data[dev->state]; + size_t keylen = session->opal_key.key_len; + u8 *key = session->opal_key.key; + u32 hsn = GENERIC_HOST_SESSION_NUM; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + if (session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->opal_key.lr); + if (err) + return err; + + } else if (session->who != OPAL_ADMIN1 && !session->sum) { + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->who - 1); + if (err) + return err; + } else + memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); + add_token_bytestring(&err, dev, key, keylen); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error 
building STARTSESSION command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int revert_tper(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + if (err) { + pr_err("Error building REVERT TPER command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int internal_activate_user(struct opal_dev *dev) +{ + struct opal_session_info *session = dev->func_data[dev->state]; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + uid[7] = session->who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* Enabled */ + add_token_u8(&err, dev, OPAL_TRUE); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Activate UserN command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int erase_locking_range(struct opal_dev *dev) +{ + struct opal_session_info *session; + u8 uid[OPAL_UID_LENGTH]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + session = dev->func_data[dev->state]; + + if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) + return -ERANGE; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Erase Locking Range Command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_done(struct opal_dev *dev) +{ + u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 2); /* Done */ + add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR Done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_enable_disable(struct opal_dev *dev) +{ + u8 
mbr_en_dis = *(u8 *)dev->func_data[dev->state]; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, mbr_en_dis); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building set MBR done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, + struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* PIN */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + return err; +} + +static int set_new_pw(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_session_info *usr = dev->func_data[dev->state]; + + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH); + + if (usr->who != OPAL_ADMIN1) { + cpin_uid[5] = 0x03; + if (usr->sum) + cpin_uid[7] = usr->opal_key.lr + 1; + else + cpin_uid[7] = usr->who; + } + + if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, + cpin_uid, dev)) { + pr_err("Error building set password command.\n"); + return -ERANGE; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_sid_cpin_pin(struct opal_dev *dev) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_key *key = dev->func_data[dev->state]; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); + + if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { + pr_err("Error building Set SID cpin\n"); + return -ERANGE; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int add_user_to_lr(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 user_uid[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + lkul = dev->func_data[dev->state]; + + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], + OPAL_UID_LENGTH); + + if (lkul->l_state == OPAL_RW) + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], + OPAL_UID_LENGTH); + + lr_buffer[7] = lkul->session.opal_key.lr; + + memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + user_uid[7] = lkul->session.who; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], + OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + 
add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building add user to locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int lock_unlock_locking_range(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + const u8 *method; + struct opal_lock_unlock *lkul; + u8 read_locked = 1, write_locked = 1; + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state... 
returning to uland\n"); + return OPAL_INVAL_PARAM; + } + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, read_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, write_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building SET command.\n"); + return err; + } + return finalize_and_send(dev, parse_and_check_status); +} + + +static int lock_unlock_locking_range_sum(struct opal_dev *dev) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 read_locked = 1, write_locked = 1; + const u8 *method; + struct opal_lock_unlock *lkul; + int ret; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + method = opalmethod[OPAL_SET]; + lkul = dev->func_data[dev->state]; + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initalized to locked */ + break; + default: + pr_err("Tried to set an invalid locking state.\n"); + return OPAL_INVAL_PARAM; + } + ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, + read_locked, write_locked); + + if (ret < 0) { + pr_err("Error building SET command.\n"); + return ret; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int activate_lsp(struct opal_dev *dev) +{ + struct opal_lr_act *opal_act; + u8 user_lr[OPAL_UID_LENGTH]; + u8 uint_3 = 0x83; + int err = 0, i; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + opal_act = dev->func_data[dev->state]; + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE], + OPAL_UID_LENGTH); + + + if (opal_act->sum) { + err = build_locking_range(user_lr, sizeof(user_lr), + opal_act->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, uint_3); + add_token_u8(&err, dev, 6); + add_token_u8(&err, dev, 0); + add_token_u8(&err, dev, 0); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_act->num_lrs; i++) { + user_lr[7] = opal_act->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + } else { + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + } + + if (err) { + pr_err("Error building Activate LockingSP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_lsp_lifecycle_cont(struct opal_dev *dev) +{ + u8 lc_status; + int error = 0; + + error = parse_and_check_status(dev); + if 
(error) + return error; + + lc_status = response_get_u64(&dev->parsed, 4); + /* 0x08 is Manufacured Inactive */ + /* 0x09 is Manufactured */ + if (lc_status != OPAL_MANUFACTURED_INACTIVE) { + pr_err("Couldn't determine the status of the Lifcycle state\n"); + return -ENODEV; + } + + return 0; +} + +/* Determine if we're in the Manufactured Inactive or Active state */ +static int get_lsp_lifecycle(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 6); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error Building GET Lifecycle Status command\n"); + return err; + } + + return finalize_and_send(dev, get_lsp_lifecycle_cont); +} + +static int get_msid_cpin_pin_cont(struct opal_dev *dev) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + strlen = response_get_string(&dev->parsed, 4, &msid_pin); + if (!msid_pin) { + pr_err("%s: Couldn't extract PIN from response\n", __func__); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = strlen; + + return 0; +} + +static int get_msid_cpin_pin(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID], + OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* Start Column */ + add_token_u8(&err, dev, 3); /* PIN */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 4); /* End Column */ + add_token_u8(&err, dev, 3); /* Lifecycle Column */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_err("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + return finalize_and_send(dev, get_msid_cpin_pin_cont); +} + +static int build_end_opal_session(struct opal_dev *dev) +{ + int err = 0; + + clear_opal_cmd(dev); + + set_comid(dev, dev->comid); + add_token_u8(&err, dev, OPAL_ENDOFSESSION); + return err; +} + +static int end_opal_session(struct opal_dev *dev) +{ + int ret = build_end_opal_session(dev); + + if (ret < 0) + return ret; + return finalize_and_send(dev, end_session_cont); +} + +static int end_opal_session_error(struct opal_dev *dev) +{ + const opal_step error_end_session[] = { + end_opal_session, + NULL, + }; + dev->funcs = error_end_session; + dev->state = 0; + return next(dev); +} + +static inline void setup_opal_dev(struct 
opal_dev *dev, + const opal_step *funcs) +{ + dev->state = 0; + dev->funcs = funcs; + dev->tsn = 0; + dev->hsn = 0; + dev->func_data = NULL; + dev->prev_data = NULL; +} + +static int check_opal_support(struct opal_dev *dev) +{ + static const opal_step funcs[] = { + opal_discovery0, + NULL + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + ret = next(dev); + dev->supported = !ret; + mutex_unlock(&dev->dev_lock); + return ret; +} + +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) +{ + struct opal_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + INIT_LIST_HEAD(&dev->unlk_lst); + mutex_init(&dev->dev_lock); + dev->data = data; + dev->send_recv = send_recv; + if (check_opal_support(dev) != 0) { + pr_debug("Opal is not supported on this device\n"); + kfree(dev); + return NULL; + } + return dev; +} +EXPORT_SYMBOL(init_opal_dev); + +static int opal_secure_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + get_active_key, + gen_key, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = &opal_session->opal_key.lr; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + void *data[3] = { NULL }; + static const opal_step erase_funcs[] = { + opal_discovery0, + start_auth_opal_session, + erase_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, erase_funcs); + + dev->func_data = data; + dev->func_data[1] = opal_session; + dev->func_data[2] = opal_session; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, + struct opal_mbr_data *opal_mbr) +{ + void *func_data[6] = { NULL }; + static const opal_step mbr_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + set_mbr_done, + end_opal_session, + start_admin1LSP_opal_session, + set_mbr_enable_disable, + end_opal_session, + NULL, + }; + int ret; + + if (opal_mbr->enable_disable != OPAL_MBR_ENABLE && + opal_mbr->enable_disable != OPAL_MBR_DISABLE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, mbr_funcs); + dev->func_data = func_data; + dev->func_data[1] = &opal_mbr->key; + dev->func_data[2] = &opal_mbr->enable_disable; + dev->func_data[4] = &opal_mbr->key; + dev->func_data[5] = &opal_mbr->enable_disable; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *suspend; + + suspend = kzalloc(sizeof(*suspend), GFP_KERNEL); + if (!suspend) + return -ENOMEM; + + suspend->unlk = *lk_unlk; + suspend->lr = lk_unlk->session.opal_key.lr; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + add_suspend_info(dev, suspend); + mutex_unlock(&dev->dev_lock); + return 0; +} + +static int opal_add_user_to_lr(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + static const opal_step funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + add_user_to_lr, + end_opal_session, + NULL + }; + int ret; + + if (lk_unlk->l_state 
!= OPAL_RO && + lk_unlk->l_state != OPAL_RW) { + pr_err("Locking state was not RO or RW\n"); + return -EINVAL; + } + if (lk_unlk->session.who < OPAL_USER1 && + lk_unlk->session.who > OPAL_USER9) { + pr_err("Authority was not within the range of users: %d\n", + lk_unlk->session.who); + return -EINVAL; + } + if (lk_unlk->session.sum) { + pr_err("%s not supported in sum. Use setup locking range\n", + __func__); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, funcs); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session.opal_key; + dev->func_data[2] = lk_unlk; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) +{ + void *data[2] = { NULL }; + static const opal_step revert_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, + revert_tper, /* controller will terminate session */ + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, revert_funcs); + dev->func_data = data; + dev->func_data[1] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int __opal_lock_unlock_sum(struct opal_dev *dev) +{ + static const opal_step ulk_funcs_sum[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range_sum, + end_opal_session, + NULL + }; + + dev->funcs = ulk_funcs_sum; + return next(dev); +} + +static int __opal_lock_unlock(struct opal_dev *dev) +{ + static const opal_step _unlock_funcs[] = { + opal_discovery0, + start_auth_opal_session, + lock_unlock_locking_range, + end_opal_session, + NULL + }; + + dev->funcs = _unlock_funcs; + return next(dev); +} + +static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + void *func_data[3] = { NULL }; + int ret; + + if (lk_unlk->session.who < OPAL_ADMIN1 || + lk_unlk->session.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + dev->func_data[1] = &lk_unlk->session; + dev->func_data[2] = lk_unlk; + + if (lk_unlk->session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) +{ + static const opal_step owner_funcs[] = { + opal_discovery0, + start_anybodyASP_opal_session, + get_msid_cpin_pin, + end_opal_session, + start_SIDASP_opal_session, + set_sid_cpin_pin, + end_opal_session, + NULL + }; + void *data[6] = { NULL }; + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, owner_funcs); + dev->func_data = data; + dev->func_data[4] = opal; + dev->func_data[5] = opal; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act) +{ + void *data[4] = { NULL }; + static const opal_step active_funcs[] = { + opal_discovery0, + start_SIDASP_opal_session, /* Open session as SID auth */ + get_lsp_lifecycle, + activate_lsp, + end_opal_session, + NULL + }; + int ret; + + if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, active_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lr_act->key; + dev->func_data[3] = opal_lr_act; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_setup_locking_range(struct opal_dev *dev, + struct 
opal_user_lr_setup *opal_lrs) +{ + void *data[3] = { NULL }; + static const opal_step lr_funcs[] = { + opal_discovery0, + start_auth_opal_session, + setup_locking_range, + end_opal_session, + NULL, + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, lr_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_lrs->session; + dev->func_data[2] = opal_lrs; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + static const opal_step pw_funcs[] = { + opal_discovery0, + start_auth_opal_session, + set_new_pw, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + if (opal_pw->session.who < OPAL_ADMIN1 || + opal_pw->session.who > OPAL_USER9 || + opal_pw->new_user_pw.who < OPAL_ADMIN1 || + opal_pw->new_user_pw.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, pw_funcs); + dev->func_data = data; + dev->func_data[1] = (void *) &opal_pw->session; + dev->func_data[2] = (void *) &opal_pw->new_user_pw; + + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +static int opal_activate_user(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + static const opal_step act_funcs[] = { + opal_discovery0, + start_admin1LSP_opal_session, + internal_activate_user, + end_opal_session, + NULL + }; + void *data[3] = { NULL }; + int ret; + + /* We can't activate Admin1 it's active as manufactured */ + if (opal_session->who < OPAL_USER1 && + opal_session->who > OPAL_USER9) { + pr_err("Who was not a valid user: %d\n", opal_session->who); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, act_funcs); + dev->func_data = data; + dev->func_data[1] = &opal_session->opal_key; + dev->func_data[2] = opal_session; + ret = next(dev); + mutex_unlock(&dev->dev_lock); + return ret; +} + +bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + struct opal_suspend_data *suspend; + void *func_data[3] = { NULL }; + bool was_failure = false; + int ret = 0; + + if (!dev) + return false; + if (!dev->supported) + return false; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev, NULL); + dev->func_data = func_data; + + list_for_each_entry(suspend, &dev->unlk_lst, node) { + dev->state = 0; + dev->func_data[1] = &suspend->unlk.session; + dev->func_data[2] = &suspend->unlk; + dev->tsn = 0; + dev->hsn = 0; + + if (suspend->unlk.session.sum) + ret = __opal_lock_unlock_sum(dev); + else + ret = __opal_lock_unlock(dev); + if (ret) { + pr_warn("Failed to unlock LR %hhu with sum %d\n", + suspend->unlk.session.opal_key.lr, + suspend->unlk.session.sum); + was_failure = true; + } + } + mutex_unlock(&dev->dev_lock); + return was_failure; +} +EXPORT_SYMBOL(opal_unlock_from_suspend); + +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) +{ + void *p; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!dev) + return -ENOTSUPP; + if (!dev->supported) { + pr_err("Not supported\n"); + return -ENOTSUPP; + } + + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (cmd) { + case IOC_OPAL_SAVE: + ret = opal_save(dev, p); + break; + case IOC_OPAL_LOCK_UNLOCK: + ret = opal_lock_unlock(dev, p); + break; + case IOC_OPAL_TAKE_OWNERSHIP: + ret = opal_take_ownership(dev, p); + break; + case IOC_OPAL_ACTIVATE_LSP: + ret = opal_activate_lsp(dev, p); + break; + case IOC_OPAL_SET_PW: + ret = opal_set_new_pw(dev, p); + break; + case IOC_OPAL_ACTIVATE_USR: + 
ret = opal_activate_user(dev, p); + break; + case IOC_OPAL_REVERT_TPR: + ret = opal_reverttper(dev, p); + break; + case IOC_OPAL_LR_SETUP: + ret = opal_setup_locking_range(dev, p); + break; + case IOC_OPAL_ADD_USR_TO_LR: + ret = opal_add_user_to_lr(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_MBR: + ret = opal_enable_disable_shadow_mbr(dev, p); + break; + case IOC_OPAL_ERASE_LR: + ret = opal_erase_locking_range(dev, p); + break; + case IOC_OPAL_SECURE_ERASE_LR: + ret = opal_secure_erase_locking_range(dev, p); + break; + default: + pr_warn("No such Opal Ioctl %u\n", cmd); + } + + kfree(p); + return ret; +} +EXPORT_SYMBOL_GPL(sed_ioctl);
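
The sed_ioctl() dispatcher above is the entry point a block driver forwards its unrecognized ioctls to after registering the device with init_opal_dev(). A minimal userspace sketch of how that interface might be exercised follows; it is not part of this patch. It assumes the IOC_OPAL_LOCK_UNLOCK command and the struct opal_lock_unlock / opal_session_info / opal_key layouts from the uapi header <linux/sed-opal.h> that accompanies this feature, and the device path and passphrase are purely illustrative.

/*
 * Illustrative only: unlock locking range 0 for read/write as Admin1
 * through the new SED Opal ioctl interface.  Requires CAP_SYS_ADMIN;
 * the kernel copies the argument with memdup_user() and returns
 * -ENOTSUPP if the device did not advertise Opal support.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sed-opal.h>

int main(void)
{
	struct opal_lock_unlock lkul;
	const char *passwd = "example-passphrase";	/* illustrative */
	int fd, ret;

	memset(&lkul, 0, sizeof(lkul));
	lkul.l_state = OPAL_RW;				/* unlock for read and write */
	lkul.session.sum = 0;				/* not single-user mode */
	lkul.session.who = OPAL_ADMIN1;
	lkul.session.opal_key.lr = 0;			/* global locking range */
	lkul.session.opal_key.key_len = strlen(passwd);
	memcpy(lkul.session.opal_key.key, passwd,
	       lkul.session.opal_key.key_len);

	fd = open("/dev/nvme0n1", O_RDWR);		/* device path is illustrative */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lkul);
	if (ret)
		perror("IOC_OPAL_LOCK_UNLOCK");

	close(fd);
	return ret ? 1 : 0;
}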