diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 6 | ||||
-rw-r--r-- | block/Makefile | 4 | ||||
-rw-r--r-- | block/bio.c | 5 | ||||
-rw-r--r-- | block/blk-cgroup.c | 4 | ||||
-rw-r--r-- | block/blk-core.c | 16 | ||||
-rw-r--r-- | block/blk-flush.c | 6 | ||||
-rw-r--r-- | block/blk-lib.c | 15 | ||||
-rw-r--r-- | block/blk-mq-cpu.c | 67 | ||||
-rw-r--r-- | block/blk-mq-cpumap.c | 25 | ||||
-rw-r--r-- | block/blk-mq-pci.c | 47 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 40 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 514 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 44 | ||||
-rw-r--r-- | block/blk-mq.c | 389 | ||||
-rw-r--r-- | block/blk-mq.h | 28 | ||||
-rw-r--r-- | block/blk-softirq.c | 27 | ||||
-rw-r--r-- | block/blk-sysfs.c | 4 | ||||
-rw-r--r-- | block/blk.h | 11 | ||||
-rw-r--r-- | block/cfq-iosched.c | 13 | ||||
-rw-r--r-- | block/ioctl.c | 18 |
20 files changed, 449 insertions, 834 deletions
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..1d4d624492fc 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP help Provide block layer support for the kernel. @@ -124,4 +125,9 @@ config BLOCK_COMPAT depends on BLOCK && COMPAT default y +config BLK_MQ_PCI + bool + depends on BLOCK && PCI + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..36acdd7545be 100644 --- a/block/Makefile +++ b/block/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o \ - blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ @@ -22,4 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o - +obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o diff --git a/block/bio.c b/block/bio.c index aa7354088008..db85c5753a76 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio @@ -1274,7 +1275,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dd38e5ced4a3..b08ccbb9393a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1340,10 +1340,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - mutex_unlock(&blkcg_pol_mutex); + if (!cpd) goto err_free_cpds; - } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c..14d7c0740dc0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { @@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { @@ -3301,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_plug *plug; long state; + unsigned int queue_num; + struct blk_mq_hw_ctx *hctx; if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; + queue_num = blk_qc_t_to_queue_num(cookie); + hctx = q->queue_hw_ctx[queue_num]; + hctx->poll_considered++; + plug = current->plug; if (plug) blk_flush_plug_list(plug, false); state = current->state; while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; int ret; hctx->poll_invoked++; diff --git a/block/blk-flush.c b/block/blk-flush.c index d308def812db..6a14b68b9135 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, int error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } @@ -325,7 +325,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) flush_rq->tag = first_rq->tag; fq->orig_rq = first_rq; - hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } @@ -358,7 +358,7 @@ static void mq_flush_data_end_io(struct request *rq, int error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); /* * After populating an empty queue, kick it to avoid stall. Read diff --git a/block/blk-lib.c b/block/blk-lib.c index 083e56f72308..46fe9248410d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -31,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int granularity; enum req_op op; int alignment; + sector_t bs_mask; if (!q) return -ENXIO; @@ -50,6 +51,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, op = REQ_OP_DISCARD; } + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; @@ -150,10 +155,15 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, unsigned int max_write_same_sectors; struct bio *bio = NULL; int ret = 0; + sector_t bs_mask; if (!q) return -ENXIO; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -202,6 +212,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, int ret; struct bio *bio = NULL; unsigned int sz; + sector_t bs_mask; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c deleted file mode 100644 index bb3ed488f7b5..000000000000 --- a/block/blk-mq-cpu.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CPU notifier helper code for blk-mq - * - * Copyright (C) 2013-2014 Jens Axboe - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/list.h> -#include <linux/llist.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <linux/blk-mq.h> -#include "blk-mq.h" - -static LIST_HEAD(blk_mq_cpu_notify_list); -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock); - -static int blk_mq_main_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long) hcpu; - struct blk_mq_cpu_notifier *notify; - int ret = NOTIFY_OK; - - raw_spin_lock(&blk_mq_cpu_notify_lock); - - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { - ret = notify->notify(notify->data, action, cpu); - if (ret != NOTIFY_OK) - break; - } - - raw_spin_unlock(&blk_mq_cpu_notify_lock); - return ret; -} - -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - BUG_ON(!notifier->notify); - - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) -{ - raw_spin_lock(&blk_mq_cpu_notify_lock); - list_del(¬ifier->list); - raw_spin_unlock(&blk_mq_cpu_notify_lock); -} - -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data) -{ - notifier->notify = fn; - notifier->data = data; -} - -void __init blk_mq_cpu_init(void) -{ - hotcpu_notifier(blk_mq_main_cpu_notify, 0); -} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index d0634bcf322f..19b1d9c5f07e 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -31,14 +31,16 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask) +int blk_mq_map_queues(struct blk_mq_tag_set *set) { + unsigned int *map = set->mq_map; + unsigned int nr_queues = set->nr_hw_queues; + const struct cpumask *online_mask = cpu_online_mask; unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return 1; + return -ENOMEM; cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; @@ -86,23 +88,6 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, return 0; } -unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) -{ - unsigned int *map; - - /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, - set->numa_node); - if (!map) - return NULL; - - if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) - return map; - - kfree(map); - return NULL; -} - /* * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c new file mode 100644 index 000000000000..966c2169762e --- /dev/null +++ b/block/blk-mq-pci.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/kobject.h> +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/blk-mq-pci.h> +#include <linux/pci.h> +#include <linux/module.h> + +/** + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device + * @set: tagset to provide the mapping for + * @pdev: PCI device associated with @set. + * + * This function assumes the PCI device @pdev has at least as many available + * interrupt vetors as @set has queues. It will then queuery the vector + * corresponding to each queue for it's affinity mask and built queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector. + */ +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue); + if (!mask) + return -EINVAL; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4..01fb455d3377 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -176,7 +176,17 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) { - return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); + return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", + hctx->poll_considered, hctx->poll_invoked, + hctx->poll_success); +} + +static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t size) +{ + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + + return size; } static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, @@ -198,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); } + page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), + hctx->dispatched[i]); return page - start_page; } @@ -301,8 +313,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .show = blk_mq_hw_sysfs_cpus_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, + .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, + .store = blk_mq_hw_sysfs_poll_store, }; static struct attribute *default_hw_ctx_attrs[] = { @@ -380,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -400,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -430,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -454,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -462,7 +472,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b..dcf5ce3ba4bf 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,58 +1,24 @@ /* - * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. - */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,142 +86,36 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) -{ - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. - */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { - unsigned int last_tag, org_last_tag; - int index, i, tag; - if (!hctx_may_queue(hctx, bt)) return -1; - - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); - - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. - */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->depth - 1) - last_tag = 0; - - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt); } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) +static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) { - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - -static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) -{ - struct bt_wait_state *bs; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) return tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, hctx); do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -292,7 +132,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -301,20 +141,18 @@ static int bt_get(struct blk_mq_alloc_data *data, io_schedule(); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = data->q->mq_ops->map_queue(data->q, - data->ctx->cpu); + data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { - last_tag = &data->ctx->last_tag; hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, hctx); } while (1); - finish_wait(&bs->wait, &wait); + finish_wait(&ws->wait, &wait); return tag; } @@ -323,7 +161,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); + data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -332,15 +170,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) { - int tag, zero = 0; + int tag; if (unlikely(!data->hctx->tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -354,55 +192,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) -{ - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } - - wake_index = bt_index_inc(wake_index); - } - - return NULL; -} - -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) -{ - const int index = TAG_TO_INDEX(bt, tag); - struct bt_wait_state *bs; - int wait_cnt; - - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); - - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } -} - -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag) { struct blk_mq_tags *tags = hctx->tags; @@ -410,67 +201,92 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; +} - off += (1 << bt->bits_per_word); - } +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) +{ + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; +} + +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -529,124 +345,40 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } } -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) { - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); - } - - return bt->depth - used; + return bt->sb.depth - sbitmap_weight(&bt->sb); } -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; -} - -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) -{ - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. - */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -666,11 +398,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { - kfree(tags); - return NULL; - } - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; @@ -679,19 +406,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); - free_cpumask_var(tags->cpumask); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) -{ - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - - *tag = prandom_u32() % depth; -} - int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { tdepth -= tags->nr_reserved_tags; @@ -702,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) * Don't need (or can't) update reserved tags here, they remain * static and should never need resizing. */ - bt_update_count(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags, false); return 0; } @@ -726,7 +446,7 @@ u32 blk_mq_unique_tag(struct request *rq) int hwq = 0; if (q->mq_ops) { - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); hwq = hctx->queue_num; } @@ -746,7 +466,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " "bits_per_word=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); + 1U << tags->bitmap_tags.sb.shift); free = bt_unused_tags(&tags->bitmap_tags); res = bt_unused_tags(&tags->breserved_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a..d1662734dc53 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. */ @@ -37,14 +12,11 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; struct list_head page_list; - - int alloc_policy; - cpumask_var_t cpumask; }; @@ -52,15 +24,23 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index c207fa9870eb..ddc2eed64771 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include <linux/sched/sysctl.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/prefetch.h> #include <trace/events/block.h> @@ -33,49 +34,28 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) -{ - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; -} - -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -244,21 +224,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } blk_mq_put_ctx(ctx); + if (!rq) { blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); @@ -333,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } @@ -349,11 +319,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; - - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); - blk_mq_free_hctx_request(hctx, rq); + blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -513,7 +479,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -568,16 +534,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_cancel_requeue_work(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); } EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + void blk_mq_abort_requeue_list(struct request_queue *q) { unsigned long flags; @@ -600,8 +574,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } @@ -756,38 +732,44 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - struct blk_mq_ctx *ctx; - int i; - - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; - - if (!bm->word) - continue; + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; - bit++; - } while (1); - } + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } /* @@ -878,10 +860,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, @@ -937,7 +916,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); @@ -948,8 +927,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -970,7 +948,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1023,7 +1001,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1076,9 +1054,7 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); @@ -1095,12 +1071,10 @@ static void blk_mq_insert_requests(struct request_queue *q, bool from_schedule) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); trace_block_unplug(q, depth, !from_schedule); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1234,26 +1208,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); if (rw_is_sync(bio_op(bio), bio->bi_opf)) op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } hctx->queued++; data->hctx = hctx; @@ -1265,8 +1227,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, - rq->mq_ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, @@ -1470,15 +1431,6 @@ run_queue: return cookie; } -/* - * Default mapping to a software queue, since we use one per CPU. - */ -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) -{ - return q->queue_hw_ctx[q->mq_map[cpu]]; -} -EXPORT_SYMBOL(blk_mq_map_queue); - static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -1606,42 +1558,18 @@ fail: return NULL; } -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; -} - /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ -static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { + struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); spin_lock(&ctx->lock); @@ -1652,30 +1580,20 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return NOTIFY_OK; + return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); - return NOTIFY_OK; + return 0; } -static int blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx = data; - - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - return blk_mq_hctx_cpu_offline(hctx, cpu); - - /* - * In case of CPU online, tags may be reallocated - * in blk_mq_map_swqueue() after mapping is updated. - */ - - return NOTIFY_OK; + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); } /* hctx->ctxs will be freed in queue's release handler */ @@ -1695,9 +1613,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1734,7 +1652,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); @@ -1742,9 +1660,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -1757,7 +1673,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1784,12 +1701,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - + blk_mq_remove_cpuhp(hctx); return -1; } @@ -1812,7 +1728,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, if (!cpu_online(i)) continue; - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); /* * Set local node, IFF we have more than one hw queue. If @@ -1850,7 +1766,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; @@ -1860,8 +1776,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1881,13 +1795,12 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); - cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts @@ -1975,7 +1888,6 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } - kfree(q->mq_map); q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2074,9 +1986,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_percpu; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; + q->mq_map = set->mq_map; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2094,7 +2004,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -2126,8 +2036,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(q->mq_map); -err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); @@ -2159,8 +2067,6 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); - /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe * we should change hctx numa_node according to new topology (this @@ -2172,50 +2078,18 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_register(q); } -static int blk_mq_queue_reinit_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +/* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ +static struct cpumask cpuhp_online_new; + +static void blk_mq_queue_reinit_work(void) { struct request_queue *q; - int cpu = (unsigned long)hcpu; - /* - * New online cpumask which is going to be set in this hotplug event. - * Declare this cpumasks as global as cpu-hotplug operation is invoked - * one-by-one and dynamically allocating this could result in a failure. - */ - static struct cpumask online_new; - - /* - * Before hotadded cpu starts handling requests, new mappings must - * be established. Otherwise, these requests in hw queue might - * never be dispatched. - * - * For example, there is a single hw queue (hctx) and two CPU queues - * (ctx0 for CPU0, and ctx1 for CPU1). - * - * Now CPU1 is just onlined and a request is inserted into - * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is - * still zero. - * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is - * set in pending bitmap and tries to retrieve requests in - * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, - * so the request in ctx1->rq_list is ignored. - */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - case CPU_UP_CANCELED: - cpumask_copy(&online_new, cpu_online_mask); - break; - case CPU_UP_PREPARE: - cpumask_copy(&online_new, cpu_online_mask); - cpumask_set_cpu(cpu, &online_new); - break; - default: - return NOTIFY_OK; - } mutex_lock(&all_q_mutex); - /* * We need to freeze and reinit all existing queues. Freezing * involves synchronous wait for an RCU grace period and doing it @@ -2236,13 +2110,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q, &online_new); + blk_mq_queue_reinit(q, &cpuhp_online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); mutex_unlock(&all_q_mutex); - return NOTIFY_OK; +} + +static int blk_mq_queue_reinit_dead(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + blk_mq_queue_reinit_work(); + return 0; +} + +/* + * Before hotadded cpu starts handling requests, new mappings must be + * established. Otherwise, these requests in hw queue might never be + * dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues (ctx0 + * for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list + * and set bit0 in pending bitmap as ctx1->index_hw is still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in + * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list + * is ignored. + */ +static int blk_mq_queue_reinit_prepare(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &cpuhp_online_new); + blk_mq_queue_reinit_work(); + return 0; } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) @@ -2299,12 +2203,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) -{ - return tags->cpumask; -} -EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2313,6 +2211,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + int ret; + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) @@ -2322,7 +2222,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { @@ -2351,17 +2251,35 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; - if (blk_mq_alloc_rq_maps(set)) - goto enomem; + ret = -ENOMEM; + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); + if (!set->mq_map) + goto out_free_tags; + + if (set->ops->map_queues) + ret = set->ops->map_queues(set); + else + ret = blk_mq_map_queues(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_rq_maps(set); + if (ret) + goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -enomem: + +out_free_mq_map: + kfree(set->mq_map); + set->mq_map = NULL; +out_free_tags: kfree(set->tags); set->tags = NULL; - return -ENOMEM; + return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -2374,6 +2292,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) blk_mq_free_rq_map(set, set->tags[i], i); } + kfree(set->mq_map); + set->mq_map = NULL; + kfree(set->tags); set->tags = NULL; } @@ -2444,10 +2365,12 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { - blk_mq_cpu_init(); - - hotcpu_notifier(blk_mq_queue_reinit_notify, 0); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, + blk_mq_hctx_notify_dead); + cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare", + blk_mq_queue_reinit_prepare, + blk_mq_queue_reinit_dead); return 0; } subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..e5d25249028c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -12,8 +12,6 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long rq_dispatched[2]; unsigned long rq_merged; @@ -34,24 +32,21 @@ void blk_mq_wake_waiters(struct request_queue *q); /* * CPU hotplug helpers */ -struct blk_mq_cpu_notifier; -void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - int (*fn)(void *, unsigned long, unsigned int), - void *data); -void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); -void blk_mq_cpu_init(void); void blk_mq_enable_hotplug(void); void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, - const struct cpumask *online_mask); +int blk_mq_map_queues(struct blk_mq_tag_set *set); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + int cpu) +{ + return q->queue_hw_ctx[q->mq_map[cpu]]; +} + /* * sysfs helpers */ @@ -63,15 +58,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. - */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 489eab825a8a..06cf9807f49a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -78,30 +78,21 @@ static int raise_blk_irq(int cpu, struct request *rq) } #endif -static int blk_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int blk_softirq_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block blk_cpu_notifier = { - .notifier_call = blk_cpu_notify, -}; - void __blk_complete_request(struct request *req) { int ccpu, cpu; @@ -180,7 +171,9 @@ static __init int blk_softirq_init(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - register_hotcpu_notifier(&blk_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); return 0; } subsys_initcall(blk_softirq_init); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f87a7e747d36..9cc8d7c5439a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); if (q->mq_ops) - blk_mq_register_disk(disk); + blk_mq_register_dev(dev, q); if (!q->request_fn) return 0; @@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); if (q->request_fn) elv_unregister_queue(q); diff --git a/block/blk.h b/block/blk.h index c37492f5edaa..74444c49078f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -39,14 +39,9 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops) - return q->fq; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - return hctx->fq; + if (q->mq_ops) + return blk_mq_map_queue(q, ctx->cpu)->fq; + return q->fq; } static inline void __blk_get_queue(struct request_queue *q) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd4303..5e24d880306c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* diff --git a/block/ioctl.c b/block/ioctl.c index ed2397f8de9d..755119c3c1b9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -225,7 +225,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + struct address_space *mapping; + uint64_t start, end, len; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -235,18 +236,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, start = range[0]; len = range[1]; + end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + return -EINVAL; + if (end < start) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); + /* Invalidate the page cache, including dirty pages */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); + + return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + false); } static int put_ushort(unsigned long arg, unsigned short val) |