diff options
-rw-r--r-- | block/bio.c | 50 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 9 | ||||
-rw-r--r-- | block/blk-mq.c | 184 | ||||
-rw-r--r-- | block/blk-mq.h | 1 | ||||
-rw-r--r-- | block/cfq-iosched.c | 43 | ||||
-rw-r--r-- | fs/block_dev.c | 6 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 2 |
7 files changed, 185 insertions, 110 deletions
diff --git a/block/bio.c b/block/bio.c index cf7591551b17..f124a0a624fc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -296,13 +296,19 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio) +static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_endio(parent); + if (!parent->bi_error) + parent->bi_error = bio->bi_error; bio_put(bio); + return parent; +} + +static void bio_chain_endio(struct bio *bio) +{ + bio_endio(__bio_chain_endio(bio)); } /* @@ -1742,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio) **/ void bio_endio(struct bio *bio) { - while (bio) { - if (unlikely(!bio_remaining_done(bio))) - break; +again: + if (!bio_remaining_done(bio)) + return; - /* - * Need to have a real endio function for chained bios, - * otherwise various corner cases will break (like stacking - * block devices that save/restore bi_end_io) - however, we want - * to avoid unbounded recursion and blowing the stack. Tail call - * optimization would handle this, but compiling with frame - * pointers also disables gcc's sibling call optimization. - */ - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio); - bio = NULL; - } + /* + * Need to have a real endio function for chained bios, otherwise + * various corner cases will break (like stacking block devices that + * save/restore bi_end_io) - however, we want to avoid unbounded + * recursion and blowing the stack. Tail call optimization would + * handle this, but compiling with frame pointers also disables + * gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + bio = __bio_chain_endio(bio); + goto again; } + + if (bio->bi_end_io) + bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 1cf18784c5cf..431fdda21737 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk) blk_mq_enable_hotplug(); } +void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) +{ + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); +} + static void blk_mq_sysfs_init(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) - kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - queue_for_each_ctx(q, ctx, i) kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 56c0a726b619..050f7a13021b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - return tags->rqs[tag]; + if (tag < tags->nr_tags) + return tags->rqs[tag]; + + return NULL; } EXPORT_SYMBOL(blk_mq_tag_to_rq); @@ -1744,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q, return -1; } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - - /* - * Initialize hardware queues - */ - queue_for_each_hw_ctx(q, hctx, i) { - if (blk_mq_init_hctx(q, set, hctx, i)) - break; - } - - if (i == q->nr_hw_queues) - return 0; - - /* - * Init failed - */ - blk_mq_exit_hw_queues(q, set, i); - - return 1; -} - static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { @@ -1826,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; @@ -1974,56 +1953,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx __percpu *ctx; - unsigned int *map; - int i; - - ctx = alloc_percpu(struct blk_mq_ctx); - if (!ctx) - return ERR_PTR(-ENOMEM); - - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - set->numa_node); - - if (!hctxs) - goto err_percpu; - - map = blk_mq_make_queue_map(set); - if (!map) - goto err_map; + int i, j; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + blk_mq_sysfs_unregister(q); for (i = 0; i < set->nr_hw_queues; i++) { - int node = blk_mq_hw_queue_to_node(map, i); + int node; + if (hctxs[i]) + continue; + + node = blk_mq_hw_queue_to_node(q->mq_map, i); hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); if (!hctxs[i]) - goto err_hctxs; + break; if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) - goto err_hctxs; + node)) { + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } atomic_set(&hctxs[i]->nr_active, 0); hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; + + if (blk_mq_init_hctx(q, set, hctxs[i], i)) { + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } + blk_mq_hctx_kobj_init(hctxs[i]); } + for (j = i; j < q->nr_hw_queues; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + if (hctx->tags) { + blk_mq_free_rq_map(set, hctx->tags, j); + set->tags[j] = NULL; + } + blk_mq_exit_hctx(q, set, hctx, j); + free_cpumask_var(hctx->cpumask); + kobject_put(&hctx->kobj); + kfree(hctx->ctxs); + kfree(hctx); + hctxs[j] = NULL; + + } + } + q->nr_hw_queues = i; + blk_mq_sysfs_register(q); +} + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!q->queue_ctx) + return ERR_PTR(-ENOMEM); + + q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), + GFP_KERNEL, set->numa_node); + if (!q->queue_hw_ctx) + goto err_percpu; + + q->mq_map = blk_mq_make_queue_map(set); + if (!q->mq_map) + goto err_map; + + blk_mq_realloc_hw_ctxs(set, q); + if (!q->nr_hw_queues) + goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = set->nr_hw_queues; - q->mq_map = map; - - q->queue_ctx = ctx; - q->queue_hw_ctx = hctxs; - q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; if (!(set->flags & BLK_MQ_F_SG_MERGE)) @@ -2050,9 +2066,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); - if (blk_mq_init_hw_queues(q, set)) - goto err_hctxs; - get_online_cpus(); mutex_lock(&all_q_mutex); @@ -2066,17 +2079,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(map); - for (i = 0; i < set->nr_hw_queues; i++) { - if (!hctxs[i]) - break; - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - } + kfree(q->mq_map); err_map: - kfree(hctxs); + kfree(q->queue_hw_ctx); err_percpu: - free_percpu(ctx); + free_percpu(q->queue_ctx); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2284,9 +2291,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->nr_hw_queues = 1; set->queue_depth = min(64U, set->queue_depth); } + /* + * There is no use for more h/w queues than cpus. + */ + if (set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; - set->tags = kmalloc_node(set->nr_hw_queues * - sizeof(struct blk_mq_tags *), + set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; @@ -2309,7 +2320,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); } @@ -2330,6 +2341,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) ret = 0; queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; ret = blk_mq_tag_update_depth(hctx->tags, nr); if (ret) break; @@ -2341,6 +2354,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + struct request_queue *q; + + if (nr_hw_queues > nr_cpu_ids) + nr_hw_queues = nr_cpu_ids; + if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) + return; + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue(q); + + set->nr_hw_queues = nr_hw_queues; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_realloc_hw_ctxs(set, q); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_mq_queue_reinit(q, cpu_online_mask); + } + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index eaede8e45c9c..9087b11037b7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); */ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_rq_timed_out(struct request *req, bool reserved); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1f9093e901da..e3c591dd8f19 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) return pblkg ? blkg_to_cfqg(pblkg) : NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, + cfqg_to_blkg(ancestor)->blkcg->css.cgroup); +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) #else /* CONFIG_CFQ_GROUP_IOSCHED */ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return true; +} static inline void cfqg_get(struct cfq_group *cfqg) { } static inline void cfqg_put(struct cfq_group *cfqg) { } @@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_rb_root *st = cfqq->service_tree; struct cfq_io_cq *cic; unsigned long sl, group_idle = 0; @@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } - /* There are other queues in the group, don't do group idle */ - if (group_idle && cfqq->cfqg->nr_cfqq > 1) + /* + * There are other queues in the group or this is the only group and + * it has too big thinktime, don't do group idle. + */ + if (group_idle && + (cfqq->cfqg->nr_cfqq > 1 || + cfq_io_thinktime_big(cfqd, &st->ttime, true))) return; cfq_mark_cfqq_wait_request(cfqq); @@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; - if (new_cfqq->cfqg != cfqq->cfqg) + /* + * Treat ancestors of current cgroup the same way as current cgroup. + * For anybody else we disallow preemption to guarantee service + * fairness among cgroups. + */ + if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) return false; if (cfq_slice_used(cfqq)) return true; + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) + return true; + + WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); /* Allow preemption only if we are idling on sync-noidle tree */ if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 2 && RB_EMPTY_ROOT(&cfqq->sort_list)) return true; @@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return true; - /* An idle queue should not be idle now for some reason */ if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) return true; diff --git a/fs/block_dev.c b/fs/block_dev.c index 826b164a4b5b..3172c4e2f502 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + struct dentry *dent; + dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + if (dent) + dent->d_sb->s_iflags |= SB_I_CGROUPWB; + return dent; } static struct file_system_type bd_type = { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7fc9296b5742..15a73d49fd1d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -244,6 +244,8 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. |