From 6d46964230d182c4b6097379738849a809d791dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Nov 2018 17:02:18 +0100 Subject: block: remove the lock argument to blk_alloc_queue_node With the legacy request path gone there is no real need to override the queue_lock. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c510179a7f84..a733e4c920af 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1896,7 +1896,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; md->queue->queuedata = md; -- cgit v1.2.3 From 344e9ffcbd1898e1dc04085564a6e05c30ea8199 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Nov 2018 12:22:51 -0700 Subject: block: add queue_is_mq() helper Various spots check for q->mq_ops being non-NULL, but provide a helper to do this instead. Where the ->mq_ops != NULL check is redundant, remove it. Since mq == rq-based now that legacy is gone, get rid of the queue_is_rq_based() and just use queue_is_mq() everywhere. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++++---- block/blk-core.c | 12 ++++++------ block/blk-flush.c | 3 +-- block/blk-mq.c | 2 +- block/blk-sysfs.c | 14 +++++++------- block/blk-throttle.c | 2 +- block/blk-wbt.c | 2 +- block/blk-zoned.c | 2 +- block/bsg.c | 2 +- block/elevator.c | 11 +++++------ block/genhd.c | 8 ++++---- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 4 ++-- include/linux/blkdev.h | 6 +----- 14 files changed, 36 insertions(+), 42 deletions(-) (limited to 'drivers/md') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0f6b44614165..63d226a084cd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1324,7 +1324,7 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); pd_prealloc: if (!pd_prealloc) { @@ -1363,7 +1363,7 @@ pd_prealloc: spin_unlock_irq(&q->queue_lock); out_bypass_end: - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); @@ -1387,7 +1387,7 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); spin_lock_irq(&q->queue_lock); @@ -1405,7 +1405,7 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_unlock_irq(&q->queue_lock); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); diff --git a/block/blk-core.c b/block/blk-core.c index 92b6b200e9fb..0b684a520a11 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -232,7 +232,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); - if (q->mq_ops) { + if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; int i; @@ -281,7 +281,7 @@ void blk_set_queue_dying(struct request_queue *q) */ blk_freeze_queue_start(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. 
*/ @@ -356,7 +356,7 @@ void blk_cleanup_queue(struct request_queue *q) * blk_freeze_queue() should be enough for cases of passthrough * request. */ - if (q->mq_ops && blk_queue_init_done(q)) + if (queue_is_mq(q) && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ @@ -374,7 +374,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_free_queue(q); percpu_ref_exit(&q->q_usage_counter); @@ -982,7 +982,7 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) goto not_supported; if (should_fail_bio(bio)) @@ -1657,7 +1657,7 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); */ int blk_lld_busy(struct request_queue *q) { - if (q->mq_ops && q->mq_ops->busy) + if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; diff --git a/block/blk-flush.c b/block/blk-flush.c index fcd18b158fd6..a3fc7191c694 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -273,8 +273,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * assigned to empty flushes, and we deadlock if we are expecting * other requests to make progress. Don't defer for that case. */ - if (!list_empty(&fq->flush_data_in_flight) && - !(q->mq_ops && q->elevator) && + if (!list_empty(&fq->flush_data_in_flight) && q->elevator && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b823891b3ef..32b246ed44c0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -150,7 +150,7 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1e370207a20e..80eef48fddc8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->mq_ops) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -835,12 +835,12 @@ static void __blk_release_queue(struct work_struct *work) blk_queue_free_zone_bitmaps(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_release(q); blk_trace_shutdown(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); bioset_exit(&q->bio_split); @@ -914,7 +914,7 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) { + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); } @@ -925,7 +925,7 @@ int blk_register_queue(struct gendisk *disk) blk_throtl_register_queue(q); - if ((q->mq_ops && q->elevator)) { + if (q->elevator) { ret = elv_register_queue(q); if (ret) { mutex_unlock(&q->sysfs_lock); @@ -974,7 +974,7 @@ void blk_unregister_queue(struct gendisk *disk) * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. 
*/ - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); mutex_unlock(&q->sysfs_lock); @@ -983,7 +983,7 @@ void blk_unregister_queue(struct gendisk *disk) blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->mq_ops && q->elevator) + if (q->elevator) elv_unregister_queue(q); mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d0a23f0bb3ed..8f0a104770ee 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2456,7 +2456,7 @@ void blk_throtl_register_queue(struct request_queue *q) td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !queue_is_rq_based(q); + td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 9f142b84dc85..d051ebfb4852 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -701,7 +701,7 @@ void wbt_enable_default(struct request_queue *q) if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) return; - if (q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) + if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 13ba2011a306..e9c332b1d9da 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * BIO based queues do not use a scheduler so only q->nr_zones * needs to be updated so that the sysfs exposed value is correct. */ - if (!queue_is_rq_based(q)) { + if (!queue_is_mq(q)) { q->nr_zones = nr_zones; return 0; } diff --git a/block/bsg.c b/block/bsg.c index 9a442c23a715..44f6028b9567 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return 0; bcd = &q->bsg_dev; diff --git a/block/elevator.c b/block/elevator.c index 796436270682..f05e90d4e695 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -667,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name) /* * Special case for mq, turn off scheduling */ - if (q->mq_ops && !strncmp(name, "none", 4)) + if (!strncmp(name, "none", 4)) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); @@ -685,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name) static inline bool elv_support_iosched(struct request_queue *q) { - if (q->mq_ops && q->tag_set && (q->tag_set->flags & - BLK_MQ_F_NO_SCHED)) + if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) return false; return true; } @@ -696,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!q->mq_ops || !elv_support_iosched(q)) + if (!queue_is_mq(q) || !elv_support_iosched(q)) return count; ret = __elevator_change(q, name); @@ -713,7 +712,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return sprintf(name, "none\n"); if (!q->elevator) @@ -732,7 +731,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) } spin_unlock(&elv_list_lock); - if (q->mq_ops && q->elevator) + if (q->elevator) len += sprintf(name+len, "none"); len += sprintf(len+name, "\n"); diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..0145bcb0cc76 100644 
--- a/block/genhd.c +++ b/block/genhd.c @@ -47,7 +47,7 @@ static void disk_release_events(struct gendisk *disk); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; atomic_inc(&part->in_flight[rw]); @@ -57,7 +57,7 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; atomic_dec(&part->in_flight[rw]); @@ -68,7 +68,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) void part_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (queue_is_mq(q)) { blk_mq_in_flight(q, part, inflight); return; } @@ -85,7 +85,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + if (queue_is_mq(q)) { blk_mq_in_flight_rw(q, part, inflight); return; } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 7cd36e4d1310..1f1fe9a618ea 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return queue_is_rq_based(md->queue); + return queue_is_mq(md->queue); } void dm_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9038c302d5c2..844f7d0f2ef8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, struct request_queue *q = bdev_get_queue(dev->bdev); struct verify_rq_based_data *v = data; - if (q->mq_ops) + if (queue_is_mq(q)) v->mq_count++; else v->sq_count++; - return queue_is_rq_based(q); + return queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d185f1fc333..41aaa05e42c1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -656,11 +656,7 @@ static inline bool blk_account_rq(struct request *rq) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) -/* - * Driver can handle struct request, if it either has an old style - * request_fn defined, or is blk-mq based. - */ -static inline bool queue_is_rq_based(struct request_queue *q) +static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; } -- cgit v1.2.3 From 892ad71f622bbf39c6de321d5ca9b0fdec237c24 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:30 -0500 Subject: dm: set the static flush bio device on demand The next patch changes the macro bio_set_dev() to associate a bio with a blkg based on the device set. However, dm creates a static bio to be used as the basis for cloning empty flush bios on creation. The bio_set_dev() call in alloc_dev() will cause problems with the next patch adding association to bio_set_dev() because the call is before the bdev is associated with a gendisk (bd_disk is %NULL). To get around this, set the device on the static bio every time and use that to clone to the other bios. 
Signed-off-by: Dennis Zhou Acked-by: Mike Snitzer Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- block/bio.c | 1 + drivers/md/dm.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/block/bio.c b/block/bio.c index 452b8e79b998..41ebb3f8e2fc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2021,6 +2021,7 @@ void bio_disassociate_blkg(struct bio *bio) bio->bi_blkg = NULL; } } +EXPORT_SYMBOL_GPL(bio_disassociate_blkg); /** * __bio_associate_blkg - associate a bio with the a blkg diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a733e4c920af..ab72d79775ee 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1417,10 +1417,21 @@ static int __send_empty_flush(struct clone_info *ci) unsigned target_nr = 0; struct dm_target *ti; + /* + * Empty flush uses a statically initialized bio, &md->flush_bio, as + * the base for cloning. However, blkg association requires that a + * bdev is associated with a gendisk, which doesn't happen until the + * bdev is opened. So, blkg association is done at issue time of the + * flush rather than when the device is created in alloc_dev(). + */ + bio_set_dev(ci->bio, ci->io->md->bdev); + BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + bio_disassociate_blkg(ci->bio); + return 0; } @@ -1939,7 +1950,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); -- cgit v1.2.3 From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:35 -0500 Subject: blkcg: remove bio->bi_css and instead use bio->bi_blkg Prior patches ensured that any bio that interacts with a request_queue is properly associated with a blkg. This makes bio->bi_css unnecessary as blkg maintains a reference to blkcg already. This removes the bio field bi_css and transfers corresponding uses to access via bi_blkg. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bio.c | 59 ++++++++++------------------------------------ block/bounce.c | 2 +- drivers/block/loop.c | 5 ++-- drivers/md/raid0.c | 2 +- include/linux/bio.h | 11 ++++----- include/linux/blk-cgroup.h | 8 +++---- include/linux/blk_types.h | 7 +++--- kernel/trace/blktrace.c | 4 ++-- 8 files changed, 32 insertions(+), 66 deletions(-) (limited to 'drivers/md') diff --git a/block/bio.c b/block/bio.c index b42477b6a225..2b6bc7b805ec 100644 --- a/block/bio.c +++ b/block/bio.c @@ -610,7 +610,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -1957,34 +1957,6 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkcg - associate a bio with the specified blkcg - * @bio: target bio - * @blkcg_css: css of the blkcg to associate - * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. - * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. 
The caller must own @bio and is responsible for - * synchronizing calls to this function. If @blkcg_css is %NULL, a call to - * blkcg_get_css() finds the current css from the kthread or task. - */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) -{ - if (unlikely(bio->bi_css)) - return -EBUSY; - - if (blkcg_css) - css_get(blkcg_css); - else - blkcg_css = blkcg_get_css(); - - bio->bi_css = blkcg_css; - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_blkcg); - /** * bio_disassociate_blkg - puts back the blkg reference if associated * @bio: target bio @@ -1994,6 +1966,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkcg); void bio_disassociate_blkg(struct bio *bio) { if (bio->bi_blkg) { + /* a ref is always taken on css */ + css_put(&bio_blkcg(bio)->css); blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } @@ -2047,7 +2021,6 @@ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { css_get(css); - bio->bi_css = css; __bio_associate_blkg_from_css(bio, css); } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); @@ -2066,13 +2039,10 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; - if (unlikely(bio->bi_css)) - return; if (!page->mem_cgroup) return; css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio->bi_css = css; __bio_associate_blkg_from_css(bio, css); } #endif /* CONFIG_MEMCG */ @@ -2094,8 +2064,10 @@ void bio_associate_blkg(struct bio *bio) rcu_read_lock(); - bio_associate_blkcg(bio, NULL); - blkcg = bio_blkcg(bio); + if (bio->bi_blkg) + blkcg = bio->bi_blkg->blkcg; + else + blkcg = css_to_blkcg(blkcg_get_css()); if (!blkcg->css.parent) { __bio_associate_blkg(bio, q->root_blkg); @@ -2115,27 +2087,22 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_disassociate_task(struct bio *bio) { - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } bio_disassociate_blkg(bio); } /** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); - - if (src->bi_blkg) + if (src->bi_blkg) { + css_get(&bio_blkcg(src)->css); __bio_associate_blkg(dst, src->bi_blkg); + } } -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/bounce.c b/block/bounce.c index cfb96d5170d0..ffb9e9ecfa7e 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -277,7 +277,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); blkcg_bio_issue_init(bio); return bio; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 176ab1f28eca..0770004616de 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,6 +77,7 @@ #include #include #include +#include #include "loop.h" @@ -1820,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_css) { - cmd->css = rq->bio->bi_css; + if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { + cmd->css = &bio_blkcg(rq->bio)->css; 
css_get(cmd->css); } else #endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac1cffd2a09b..f3fb5bb8c82a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkcg_association(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/include/linux/bio.h b/include/linux/bio.h index f0438061a5a3..84e1c4dc703a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -498,7 +498,7 @@ do { \ do { \ (dst)->bi_disk = (src)->bi_disk; \ (dst)->bi_partno = (src)->bi_partno; \ - bio_clone_blkcg_association(dst, src); \ + bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ @@ -512,24 +512,21 @@ static inline void bio_associate_blkg_from_page(struct bio *bio, #endif #ifdef CONFIG_BLK_CGROUP -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); void bio_disassociate_blkg(struct bio *bio); void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkcg_association(struct bio *dst, struct bio *src); +void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkcg(struct bio *bio, - struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_blkg(struct bio *bio) { } static inline void bio_associate_blkg(struct bio *bio) { } static inline void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkcg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8b069c3775ee..f11c37f8ce09 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -309,8 +309,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) */ static inline struct blkcg *__bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return css_to_blkcg(blkcg_css()); } @@ -324,8 +324,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio) */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; return NULL; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c0ba1a038ff3..46c005d601ac 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,10 +174,11 @@ struct bio { void *bi_private; #ifdef CONFIG_BLK_CGROUP /* - * Optional css associated with this bio. Put on bio - * release. Read comment on top of bio_associate_current(). + * Represents the association of the css and request_queue for the bio. + * If a bio goes direct to device, it will not have a blkg as it will + * not have a request_queue associated with it. The reference is put + * on release of the bio. 
*/ - struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * -- cgit v1.2.3 From 80a787ba3809deb694ee632919badb73890daf05 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:16 -0500 Subject: dm: dont rewrite dm_disk(md)->part0.in_flight generic_start_io_acct and generic_end_io_acct already update the variable in_flight using atomic operations, so we don't have to overwrite them again. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab72d79775ee..33100e536c4e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,8 +663,7 @@ static void start_io_acct(struct dm_io *io) generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); + atomic_inc(&md->pending[rw]); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -693,7 +692,6 @@ static void end_io_acct(struct dm_io *io) * a flush. */ pending = atomic_dec_return(&md->pending[rw]); - atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ -- cgit v1.2.3 From dbd3bbd291a03f1383de6242e7999e8487cb869b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 6 Dec 2018 11:41:17 -0500 Subject: dm rq: leverage blk_mq_queue_busy() to check for outstanding IO Now that request-based dm-multipath only supports blk-mq, make use of the newly introduced blk_mq_queue_busy() to check for outstanding IO -- rather than (ab)using the block core's in_flight counters. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1f1fe9a618ea..d2397d8fcbd1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -130,11 +130,11 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - atomic_dec(&md->pending[rw]); - /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) - wake_up(&md->wait); + if (unlikely(waitqueue_active(&md->wait))) { + if (!blk_mq_queue_busy(md->queue)) + wake_up(&md->wait); + } /* * dm_put() must be at the end of this function. 
See the comment above @@ -436,7 +436,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, static void dm_start_request(struct mapped_device *md, struct request *orig) { blk_mq_start_request(orig); - atomic_inc(&md->pending[rq_data_dir(orig)]); if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); -- cgit v1.2.3 From 112f158f66cbe25fd561a5dfe9c3826e06abf757 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 6 Dec 2018 11:41:18 -0500 Subject: block: stop passing 'cpu' to all percpu stats methods All of part_stat_* and related methods are used with preempt disabled, so there is no need to pass cpu around to allow of them. Just call smp_processor_id() as needed. Suggested-by: Jens Axboe Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++++------- block/blk-core.c | 34 +++++++++++++++------------------- block/blk-merge.c | 5 ++--- block/genhd.c | 5 ++--- block/partition-generic.c | 5 ++--- drivers/md/md.c | 7 +++---- include/linux/genhd.h | 26 +++++++++++++------------- 7 files changed, 46 insertions(+), 52 deletions(-) (limited to 'drivers/md') diff --git a/block/bio.c b/block/bio.c index 06760543ec81..0aca870331c3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1668,11 +1668,12 @@ void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { const int sgrp = op_stat_group(op); - int cpu = part_stat_lock(); - part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, sectors[sgrp], sectors); + part_stat_lock(); + + part_round_stats(q, part); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); @@ -1684,10 +1685,11 @@ void generic_end_io_acct(struct request_queue *q, int req_op, { unsigned long duration = jiffies - start_time; const int sgrp = op_stat_group(req_op); - int cpu = part_stat_lock(); - part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_round_stats(q, cpu, part); + part_stat_lock(); + + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_round_stats(q, part); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index ad59102ee30a..734b768c9d9d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -584,14 +584,14 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, } EXPORT_SYMBOL(blk_get_request); -static void part_round_stats_single(struct request_queue *q, int cpu, +static void part_round_stats_single(struct request_queue *q, struct hd_struct *part, unsigned long now, unsigned int inflight) { if (inflight) { - __part_stat_add(cpu, part, time_in_queue, + __part_stat_add(part, time_in_queue, inflight * (now - part->stamp)); - __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); + __part_stat_add(part, io_ticks, (now - part->stamp)); } part->stamp = now; } @@ -599,7 +599,6 @@ static void part_round_stats_single(struct request_queue *q, int cpu, /** * part_round_stats() - Round off the performance stats on a struct disk_stats. * @q: target block queue - * @cpu: cpu number for stats access * @part: target partition * * The average IO queue length and utilisation statistics are maintained @@ -613,7 +612,7 @@ static void part_round_stats_single(struct request_queue *q, int cpu, * /proc/diskstats. 
This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. */ -void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) +void part_round_stats(struct request_queue *q, struct hd_struct *part) { struct hd_struct *part2 = NULL; unsigned long now = jiffies; @@ -635,9 +634,9 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) part_in_flight(q, part, inflight); if (stats & 2) - part_round_stats_single(q, cpu, part2, now, inflight[1]); + part_round_stats_single(q, part2, now, inflight[1]); if (stats & 1) - part_round_stats_single(q, cpu, part, now, inflight[0]); + part_round_stats_single(q, part, now, inflight[0]); } EXPORT_SYMBOL_GPL(part_round_stats); @@ -1362,11 +1361,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes) if (blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); + part_stat_add(part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1381,14 +1379,13 @@ void blk_account_io_done(struct request *req, u64 now) if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_round_stats(req->q, cpu, part); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + part_round_stats(req->q, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -1400,16 +1397,15 @@ void blk_account_io_start(struct request *rq, bool new_io) { struct hd_struct *part; int rw = rq_data_dir(rq); - int cpu; if (!blk_do_io_stat(rq)) return; - cpu = part_stat_lock(); + part_stat_lock(); if (!new_io) { part = rq->part; - part_stat_inc(cpu, part, merges[rw]); + part_stat_inc(part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!hd_struct_try_get(part)) { @@ -1424,7 +1420,7 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(rq->q, cpu, part); + part_round_stats(rq->q, part); part_inc_in_flight(rq->q, part, rw); rq->part = part; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 4431da69a5cf..a120d59b9705 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -685,12 +685,11 @@ static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_round_stats(req->q, cpu, part); + part_round_stats(req->q, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 0145bcb0cc76..2fe00cf32b93 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1326,7 +1326,6 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight[2]; - int cpu; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1338,8 +1337,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - cpu = part_stat_lock(); - 
part_round_stats(gp->queue, cpu, hd); + part_stat_lock(); + part_round_stats(gp->queue, hd); part_stat_unlock(); part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s " diff --git a/block/partition-generic.c b/block/partition-generic.c index 5f8db5c5140f..7e663cfb1487 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -121,10 +121,9 @@ ssize_t part_stat_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; - int cpu; - cpu = part_stat_lock(); - part_round_stats(q, cpu, p); + part_stat_lock(); + part_round_stats(q, p); part_stat_unlock(); part_in_flight(q, p, inflight); return sprintf(buf, diff --git a/drivers/md/md.c b/drivers/md/md.c index fc488cb30a94..9a0a1e0934d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; - int cpu; blk_queue_split(q, &bio); @@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); - cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); + part_stat_lock(); + part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); + part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0c5ee17b4d88..1677cd2a4c4e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -295,8 +295,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, #define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) #define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) -#define __part_stat_add(cpu, part, field, addnd) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd)) +#define __part_stat_add(part, field, addnd) \ + (per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd)) #define part_stat_read(part, field) \ ({ \ @@ -333,7 +333,7 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_lock() ({ rcu_read_lock(); 0; }) #define part_stat_unlock() rcu_read_unlock() -#define __part_stat_add(cpu, part, field, addnd) \ +#define __part_stat_add(part, field, addnd) \ ((part)->dkstats.field += addnd) #define part_stat_read(part, field) ((part)->dkstats.field) @@ -362,19 +362,19 @@ static inline void free_part_stats(struct hd_struct *part) part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) -#define part_stat_add(cpu, part, field, addnd) do { \ - __part_stat_add((cpu), (part), field, addnd); \ +#define part_stat_add(part, field, addnd) do { \ + __part_stat_add((part), field, addnd); \ if ((part)->partno) \ - __part_stat_add((cpu), &part_to_disk((part))->part0, \ + __part_stat_add(&part_to_disk((part))->part0, \ field, addnd); \ } while (0) -#define part_stat_dec(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, -1) -#define part_stat_inc(cpu, gendiskp, field) \ - part_stat_add(cpu, gendiskp, field, 1) -#define part_stat_sub(cpu, gendiskp, field, subnd) \ - part_stat_add(cpu, gendiskp, field, -subnd) +#define part_stat_dec(gendiskp, field) \ + part_stat_add(gendiskp, field, -1) +#define part_stat_inc(gendiskp, field) \ + part_stat_add(gendiskp, field, 1) +#define part_stat_sub(gendiskp, field, 
subnd) \ + part_stat_add(gendiskp, field, -subnd) void part_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); @@ -399,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part) } /* block/blk-core.c */ -extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); +extern void part_round_stats(struct request_queue *q, struct hd_struct *part); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, -- cgit v1.2.3 From 6f75723190d88e1319bea623bfe0292bf3917965 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 6 Dec 2018 11:41:22 -0500 Subject: dm: remove the pending IO accounting Remove the "pending" atomic counters, that duplicate block-core's in_flight counters, and update md_in_flight() to look at percpu in_flight counters. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 2 -- drivers/md/dm.c | 34 +++++++++++++++------------------- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 224d44503a06..6fe883fac471 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -65,7 +65,6 @@ struct mapped_device { */ struct work_struct work; wait_queue_head_t wait; - atomic_t pending[2]; spinlock_t deferred_lock; struct bio_list deferred; @@ -119,7 +118,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 33100e536c4e..70568f8b6c53 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,25 +646,30 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -int md_in_flight(struct mapped_device *md) +static bool md_in_flight(struct mapped_device *md) { - return atomic_read(&md->pending[READ]) + - atomic_read(&md->pending[WRITE]); + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + + for_each_possible_cpu(cpu) { + if (part_stat_local_read_cpu(part, in_flight[0], cpu) || + part_stat_local_read_cpu(part, in_flight[1], cpu)) + return true; + } + + return false; } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - int rw = bio_data_dir(bio); io->start_time = jiffies; generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0); - atomic_inc(&md->pending[rw]); - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), @@ -676,8 +681,6 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; - int pending; - int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, io->start_time); @@ -687,16 +690,11 @@ static void end_io_acct(struct dm_io *io) bio->bi_iter.bi_sector, bio_sectors(bio), true, duration, &io->stats_aux); - /* - * After this is decremented the bio must not be touched if it is - * a flush. 
- */ - pending = atomic_dec_return(&md->pending[rw]); - pending += atomic_read(&md->pending[rw^0x1]); - /* nudge anyone waiting on suspend queue */ - if (!pending) - wake_up(&md->wait); + if (unlikely(waitqueue_active(&md->wait))) { + if (!md_in_flight(md)) + wake_up(&md->wait); + } } /* @@ -1915,8 +1913,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->disk) goto bad; - atomic_set(&md->pending[0], 0); - atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); -- cgit v1.2.3 From b7934ba4147a883f7a1b32c6408179274a4d6ed1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Dec 2018 15:45:53 -0700 Subject: dm: fix inflight IO check After switching to percpu inflight counters, the inflight check is totally buggy. It's perfectly valid for some counters to be non-zero while having a total inflight IO count of 0, that's how these kinds of counters work (inc on one CPU, dec on another). Fix the md_in_flight() check to sum all counters before returning a false positive, potentially. While at it, remove the inflight read for IO completion. We don't need it, just wake anyone that's waiting for the IO count to drop to zero. The caller needs to re-check that value anyway when woken, which it does. Fixes: 6f75723190d8 ("dm: remove the pending IO accounting") Acked-by: Mike Snitzer Reported-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 70568f8b6c53..79ad4b3d215c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -650,14 +650,14 @@ static bool md_in_flight(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; for_each_possible_cpu(cpu) { - if (part_stat_local_read_cpu(part, in_flight[0], cpu) || - part_stat_local_read_cpu(part, in_flight[1], cpu)) - return true; + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); } - return false; + return sum != 0; } static void start_io_acct(struct dm_io *io) @@ -691,10 +691,8 @@ static void end_io_acct(struct dm_io *io) true, duration, &io->stats_aux); /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) { - if (!md_in_flight(md)) - wake_up(&md->wait); - } + if (unlikely(waitqueue_active(&md->wait))) + wake_up(&md->wait); } /* -- cgit v1.2.3 From c4576aed8d85d808cd6443bda58393d525207d01 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 11 Dec 2018 09:10:26 -0500 Subject: dm: fix request-based dm's use of dm_wait_for_completion The md->wait waitqueue is used by both bio-based and request-based DM. Commit dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") lost sight of the requirement that dm_wait_for_completion() must work with all types of DM devices. Fix md_in_flight() to call the blk-mq or bio-based method accordingly. 
Fixes: dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO") Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 6 ++---- drivers/md/dm.c | 10 +++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index d2397d8fcbd1..4e06be4f0a62 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -131,10 +131,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) { - if (!blk_mq_queue_busy(md->queue)) - wake_up(&md->wait); - } + if (unlikely(waitqueue_active(&md->wait))) + wake_up(&md->wait); /* * dm_put() must be at the end of this function. See the comment above diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 79ad4b3d215c..c414d40d645d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -646,7 +646,7 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -static bool md_in_flight(struct mapped_device *md) +static bool md_in_flight_bios(struct mapped_device *md) { int cpu; struct hd_struct *part = &dm_disk(md)->part0; @@ -660,6 +660,14 @@ static bool md_in_flight(struct mapped_device *md) return sum != 0; } +static bool md_in_flight(struct mapped_device *md) +{ + if (queue_is_mq(md->queue)) + return blk_mq_queue_busy(md->queue); + else + return md_in_flight_bios(md); +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; -- cgit v1.2.3 From d2f96f487f471b4c718324ecd1540c89d2be52ac Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:46 +0800 Subject: bcache: add comment for cache_set->fill_iter We have the following definition for the btree iterator: struct btree_iter { size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; }; We can see that the length of the data[] field is a static MAX_BSETS, which is defined as 4 currently. But a btree node on disk could have too many bsets for an iterator to fit on the stack - maybe far more than MAX_BSETS. We have to dynamically allocate space to host more btree_iter_sets. bch_cache_set_alloc() will make sure the pool cache_set->fill_iter can allocate an iterator equipped with enough room to host (sb.bucket_size / sb.block_size) btree_iter_sets, which is more than static MAX_BSETS. bch_btree_node_read_done() will use that pool to allocate one iterator, to host many bsets in one btree node. Add more comments around cache_set->fill_iter to make the code less confusing. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 6 +++++- drivers/md/bcache/btree.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b61b83bbcfff..96d2213f279e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -658,7 +658,11 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - have to dynamically allocate them + * on the stack - have to dynamically allocate them. + * bch_cache_set_alloc() will make sure the pool can allocate iterators + * equipped with enough room that can host + * (sb.bucket_size / sb.block_size) + * btree_iter_sets, which is more than static MAX_BSETS. 
*/ mempool_t fill_iter; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3f4211b5cd33..23cb1dc7296b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; + /* + * c->fill_iter can allocate an iterator with more memory space + * than static MAX_BSETS. + * See the comment around cache_set->fill_iter. + */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; -- cgit v1.2.3 From ae17102316550b4b230a283febe31b2a9ff30084 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:47 +0800 Subject: bcache: do not check if debug dentry is ERR or NULL explicitly on remove debugfs_remove and debugfs_remove_recursive will check if the dentry pointer is NULL or ERR, and will do nothing in that case. Remove the check in cache_set_free and bch_debug_init. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8f448b9c96a1..8b123be05254 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c) void bch_debug_exit(void) { - if (!IS_ERR_OR_NULL(bcache_debug)) - debugfs_remove_recursive(bcache_debug); + debugfs_remove_recursive(bcache_debug); } void __init bch_debug_init(void) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7bbd670a5a84..5b59d44656c0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl) struct cache *ca; unsigned int i; - if (!IS_ERR_OR_NULL(c->debug)) - debugfs_remove(c->debug); + debugfs_remove(c->debug); bch_open_buckets_free(c); bch_btree_cache_free(c); -- cgit v1.2.3 From 3db4d0783eaf2a2537f6b83679ad57cba0ad0481 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:48 +0800 Subject: bcache: update comment for bch_data_insert commit 220bb38c21b8 ("bcache: Break up struct search") introduced changes to struct search and s->iop. bypass/bio are fields of struct data_insert_op now. Update the comment. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3bf35914bb57..15070412a32e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -311,11 +311,11 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in s->cache_bio; bi_sector is used for the key offset, + * It inserts the data in op->bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If s->bypass is true, instead of inserting the data it invalidates the - * region of the cache represented by s->cache_bio and op->inode. + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. 
*/ void bch_data_insert(struct closure *cl) { -- cgit v1.2.3 From 4e361e020e7220fa8fc812acdca4d08814633f49 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:49 +0800 Subject: bcache: update comment in sysfs.c We have struct cached_dev allocated by kzalloc in register_bcache(), which initializes all the fields of cached_dev with 0s. And commit ce4c3e19e520 ("bcache: Replace bch_read_string_list() by __sysfs_match_string()") has removed the string "default". Update the comment. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 26f035a0c5b9..d2e5c9892d4d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,7 +16,7 @@ #include #include -/* Default is -1; we skip past it for struct cached_dev's cache mode */ +/* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", "writeback", @@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = { NULL }; -/* Default is -1; we skip past it for stop_when_cache_set_failed */ +/* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", "always", -- cgit v1.2.3 From 79b791466e525c98f6aeee9acf5726e7b27f4231 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:50 +0800 Subject: bcache: do not mark writeback_running too early A fresh backing device is not attached to any cache_set, and has no writeback kthread created until it is first attached to some cache_set. But bch_cached_dev_writeback_init() runs " dc->writeback_running = true; WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); " for any newly formatted backing devices. For a fresh standalone backing device, we can get something like the following even though no writeback kthread has been created: ------------------------ /sys/block/bcache0/bcache# cat writeback_running 1 /sys/block/bcache0/bcache# cat writeback_rate_debug rate: 512.0k/sec dirty: 0.0k target: 0.0k proportional: 0.0k integral: 0.0k change: 0.0k/sec next io: -15427384ms The non-zero fields are misleading, as there is no live writeback kthread yet. Set dc->writeback_running to false, since no writeback thread is created in bch_cached_dev_writeback_init(). The writeback thread is created and woken up in bch_cached_dev_writeback_start(). Set dc->writeback_running to true before bch_writeback_queue() is called, as the writeback thread will check whether dc->writeback_running is true before writing back dirty data, and will hang if it is false. 
After the change, we can get the following output for a fresh standalone backing device: ----------------------- /sys/block/bcache0/bcache$ cat writeback_running 0 /sys/block/bcache0/bcache# cat writeback_rate_debug rate: 0.0k/sec dirty: 0.0k target: 0.0k proportional: 0.0k integral: 0.0k change: 0.0k/sec next io: 0ms v1 -> v2: Set dc->writeback_running before bch_writeback_queue() is called. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 08c3a9f9676c..1696b212ec4e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -777,7 +777,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; - dc->writeback_running = true; + dc->writeback_running = false; dc->writeback_percent = 10; dc->writeback_delay = 30; atomic_long_set(&dc->writeback_rate.rate, 1024); @@ -805,6 +805,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) cached_dev_put(dc); return PTR_ERR(dc->writeback_thread); } + dc->writeback_running = true; WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); schedule_delayed_work(&dc->writeback_rate_update, -- cgit v1.2.3 From f383ae300c4b6466824fd5c8cbd0a6c4ed2b53d3 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 13 Dec 2018 22:53:51 +0800 Subject: bcache: cannot set writeback_running via sysfs if no writeback kthread created "echo 1 > writeback_running" marks writeback_running even if no writeback kthread has been created, as "d_strtoul(writeback_running)" will simply set dc->writeback_running without checking the existence of dc->writeback_thread. Add a check for setting writeback_running via sysfs: if no writeback kthread is available, reject setting it to 1. v2 -> v3: * Make the message on wrong assignment clearer. * Print the name of the bcache device instead of the name of the backing device. Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d2e5c9892d4d..9d5fe12f0c9c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -384,8 +384,25 @@ STORE(bch_cached_dev) mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); - if (attr == &sysfs_writeback_running) - bch_writeback_queue(dc); + if (attr == &sysfs_writeback_running) { + /* dc->writeback_running changed in __cached_dev_store() */ + if (IS_ERR_OR_NULL(dc->writeback_thread)) { + /* + * reject setting it to 1 via sysfs if writeback + * kthread is not created yet. + */ + if (dc->writeback_running) { + dc->writeback_running = false; + pr_err("%s: failed to run non-existent writeback thread", + dc->disk.disk->disk_name); + } + } else + /* + * writeback kthread will check if dc->writeback_running + * is true or false. 
+ */ + bch_writeback_queue(dc); + } if (attr == &sysfs_writeback_percent) if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) -- cgit v1.2.3 From cb07ad63682ffcce10e9beaed7828a26780e3df9 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:52 +0800 Subject: bcache: introduce force_wake_up_gc() The garbage collection thread starts to work when c->sectors_to_gc is a negative value; otherwise nothing will happen even if the gc thread is woken up by wake_up_gc(). force_wake_up_gc() sets c->sectors_to_gc to -1 before calling wake_up_gc(), so the gc thread has a chance to run, provided no one else sets c->sectors_to_gc to a positive value before gc_should_run(). This routine can be called where the gc thread needs to be woken up and forced to run. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.h | 18 ++++++++++++++++++ drivers/md/bcache/sysfs.c | 17 ++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index a68d6c55783b..d1c72ef64edf 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c) wake_up(&c->gc_wait); } +static inline void force_wake_up_gc(struct cache_set *c) +{ + /* + * Garbage collection thread only works when sectors_to_gc < 0, + * calling wake_up_gc() won't start gc thread if sectors_to_gc is + * not a negative value. + * Therefore sectors_to_gc is set to -1 here, before waking up + * gc thread by calling wake_up_gc(). Then gc_should_run() will + * give a chance to permit gc thread to run. "Give a chance" means + * before going into gc_should_run(), there is still possibility + * that c->sectors_to_gc being set to other positive value. So + * this routine won't 100% make sure gc thread will be woken up + * to run. + */ + atomic_set(&c->sectors_to_gc, -1); + wake_up_gc(c); +} + #define MAP_DONE 0 #define MAP_CONTINUE 1 diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9d5fe12f0c9c..c09748497cdc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -742,21 +742,8 @@ STORE(__bch_cache_set) bch_cache_accounting_clear(&c->accounting); } - if (attr == &sysfs_trigger_gc) { - /* - * Garbage collection thread only works when sectors_to_gc < 0, - * when users write to sysfs entry trigger_gc, most of time - * they want to forcibly triger gargage collection. Here -1 is - * set to c->sectors_to_gc, to make gc_should_run() give a - * chance to permit gc thread to run. "give a chance" means - * before going into gc_should_run(), there is still chance - * that c->sectors_to_gc being set to other positive value. So - * writing sysfs entry trigger_gc won't always make sure gc - * thread takes effect. - */ - atomic_set(&c->sectors_to_gc, -1); - wake_up_gc(c); - } + if (attr == &sysfs_trigger_gc) + force_wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit v1.2.3 From 7a671d8ef821bf5743fdff17fae0600648345b03 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:53 +0800 Subject: bcache: option to automatically run gc thread after writeback The option gc_after_writeback is disabled by default, because garbage collection will discard SSD data, which drops cached data. Echoing 1 into /sys/fs/bcache//internal/gc_after_writeback will enable this option, which wakes up the gc thread when writeback has completed and all cached data is clean. 
This option is helpful for people who care more about write performance. Under a heavy write workload, all cached data only becomes clean when the writeback thread cleans it all during I/O idle time. In that situation a subsequent gc run may help to shrink the bcache B+ tree and discard more clean data, which can be helpful for future write requests. If you are not sure whether this helps your own workload, please leave it disabled (the default). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 14 ++++++++++++++ drivers/md/bcache/sysfs.c | 9 +++++++++ drivers/md/bcache/writeback.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/writeback.h | 2 ++ 4 files changed, 52 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 96d2213f279e..fdf75352e16a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -626,6 +626,20 @@ struct cache_set { /* Where in the btree gc currently is */ struct bkey gc_done; + /* + * For automatic garbage collection after writeback completed, this + * variable is used as bit fields, + * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback + * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback + * This is an optimization for write requests that follow after + * writeback finishes, when the read hit rate has dropped because + * clean data on the cache was discarded. Unless the user explicitly + * sets it via sysfs, it won't be enabled. + */ +#define BCH_ENABLE_AUTO_GC 1 +#define BCH_DO_AUTO_GC 2 + uint8_t gc_after_writeback; + /* * The allocation code needs gc_mark in struct bucket to be correct, but * it's not while a gc is in progress. Protected by bucket_lock. diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c09748497cdc..621186b4240f 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -128,6 +128,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(gc_after_writeback); rw_attribute(size); static ssize_t bch_snprint_string_list(char *buf, @@ -693,6 +694,7 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -793,6 +795,12 @@ STORE(__bch_cache_set) sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + /* + * Writing gc_after_writeback here may overwrite an already set + * BCH_DO_AUTO_GC; it doesn't matter because this flag will be + * set again at the next chance.
+ */ + sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1); return size; } @@ -873,6 +881,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_gc_after_writeback, &sysfs_io_disable, NULL }; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 1696b212ec4e..73f0efac2b9f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -17,6 +17,15 @@ #include #include +static void update_gc_after_writeback(struct cache_set *c) +{ + if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) || + c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD) + return; + + c->gc_after_writeback |= BCH_DO_AUTO_GC; +} + /* Rate limiting */ static uint64_t __calc_target_rate(struct cached_dev *dc) { @@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work) if (!set_at_max_writeback_rate(c, dc)) { down_read(&dc->writeback_lock); __update_writeback_rate(dc); + update_gc_after_writeback(c); up_read(&dc->writeback_lock); } } @@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); break; } + + /* + * When dirty data rate is high (e.g. 50%+), there might + * be heavy buckets fragmentation after writeback + * finished, which hurts following write performance. + * If users really care about write performance they + * may set BCH_ENABLE_AUTO_GC via sysfs, then when + * BCH_DO_AUTO_GC is set, garbage collection thread + * will be wake up here. After moving gc, the shrunk + * btree and discarded free buckets SSD space may be + * helpful for following write requests. + */ + if (c->gc_after_writeback == + (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { + c->gc_after_writeback &= ~BCH_DO_AUTO_GC; + force_wake_up_gc(c); + } } up_write(&dc->writeback_lock); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index d2b9fdbc8994..ce63be98a39d 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -11,6 +11,8 @@ #define WRITEBACK_RATE_UPDATE_SECS_MAX 60 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 +#define BCH_AUTO_GC_DIRTY_THRESHOLD 50 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up -- cgit v1.2.3 From 009673d02fa92acaa7ed0b1e1389610e4390ba49 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:54 +0800 Subject: bcache: add MODULE_DESCRIPTION information This patch moves MODULE_AUTHOR and MODULE_LICENSE to end of super.c, and add MODULE_DESCRIPTION("Bcache: a Linux block layer cache"). This is preparation for adding module parameters. 
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5b59d44656c0..61d3b63fa617 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,9 +25,6 @@ #include #include -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kent Overstreet "); - static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 @@ -2469,3 +2466,7 @@ err: module_exit(bcache_exit); module_init(bcache_init); + +MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); +MODULE_AUTHOR("Kent Overstreet "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 9aaf51654672b16566c5fe787da3ca41ebf6d297 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:55 +0800 Subject: bcache: make cutoff_writeback and cutoff_writeback_sync tunable Currently the cutoff writeback and cutoff writeback sync thresholds are defined by CUTOFF_WRITEBACK (40) and CUTOFF_WRITEBACK_SYNC (70) as static values. Most of the time they work fine, but when people want to do research on bcache writeback mode performance tuning, there is no way to modify the soft and hard cutoff writeback values. This patch introduces two module parameters, bch_cutoff_writeback_sync and bch_cutoff_writeback, which permit tuning these values when loading bcache.ko. If they are not specified at module load time, the current values CUTOFF_WRITEBACK_SYNC and CUTOFF_WRITEBACK are used as defaults and nothing changes. When tuning these two values, - cutoff_writeback can be set in range [1, 70] - cutoff_writeback_sync can be set in range [1, 90] - cutoff_writeback always <= cutoff_writeback_sync The default values are strongly recommended for most users and most workloads. However, people willing to take the risk of researching new writeback cutoff tuning for their own workload can now do so.
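As a quick way to see how the three tuning rules above interact, here is a small, self-contained userspace sketch of the fixup applied at load time. It is only an illustration of the stated rules: fixup_cutoffs() and its variables are invented names and are not part of the kernel sources (the real code is the check_module_parameters() hunk in the diff below).

#include <stdio.h>

#define CUTOFF_WRITEBACK		40
#define CUTOFF_WRITEBACK_SYNC		70
#define CUTOFF_WRITEBACK_MAX		70
#define CUTOFF_WRITEBACK_SYNC_MAX	90

/* Model of the load-time fixup: 0 means "not specified, use the default". */
static void fixup_cutoffs(unsigned int *wb, unsigned int *wb_sync)
{
	if (*wb_sync == 0)
		*wb_sync = CUTOFF_WRITEBACK_SYNC;
	else if (*wb_sync > CUTOFF_WRITEBACK_SYNC_MAX)
		*wb_sync = CUTOFF_WRITEBACK_SYNC_MAX;

	if (*wb == 0)
		*wb = CUTOFF_WRITEBACK;
	else if (*wb > CUTOFF_WRITEBACK_MAX)
		*wb = CUTOFF_WRITEBACK_MAX;

	/* The soft cutoff must never exceed the hard (sync) cutoff. */
	if (*wb > *wb_sync)
		*wb = *wb_sync;
}

int main(void)
{
	unsigned int wb = 80, wb_sync = 75;	/* deliberately out of range */

	fixup_cutoffs(&wb, &wb_sync);
	printf("cutoff_writeback=%u cutoff_writeback_sync=%u\n", wb, wb_sync);
	return 0;
}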
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/sysfs.c | 7 +++++++ drivers/md/bcache/writeback.h | 10 ++++++++-- 3 files changed, 55 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 61d3b63fa617..4dee119c3664 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -25,6 +25,9 @@ #include #include +unsigned int bch_cutoff_writeback; +unsigned int bch_cutoff_writeback_sync; + static const char bcache_magic[] = { 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 @@ -2420,6 +2423,32 @@ static void bcache_exit(void) mutex_destroy(&bch_register_lock); } +/* Check and fixup module parameters */ +static void check_module_parameters(void) +{ + if (bch_cutoff_writeback_sync == 0) + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC; + else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) { + pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u", + bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX); + bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX; + } + + if (bch_cutoff_writeback == 0) + bch_cutoff_writeback = CUTOFF_WRITEBACK; + else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) { + pr_warn("set bch_cutoff_writeback (%u) to max value %u", + bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX); + bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX; + } + + if (bch_cutoff_writeback > bch_cutoff_writeback_sync) { + pr_warn("set bch_cutoff_writeback (%u) to %u", + bch_cutoff_writeback, bch_cutoff_writeback_sync); + bch_cutoff_writeback = bch_cutoff_writeback_sync; + } +} + static int __init bcache_init(void) { static const struct attribute *files[] = { @@ -2428,6 +2457,8 @@ static int __init bcache_init(void) NULL }; + check_module_parameters(); + mutex_init(&bch_register_lock); init_waitqueue_head(&unregister_wait); register_reboot_notifier(&reboot); @@ -2464,9 +2495,18 @@ err: return -ENOMEM; } +/* + * Module hooks + */ module_exit(bcache_exit); module_init(bcache_init); +module_param(bch_cutoff_writeback, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback"); + +module_param(bch_cutoff_writeback_sync, uint, 0); +MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback"); + MODULE_DESCRIPTION("Bcache: a Linux block layer cache"); MODULE_AUTHOR("Kent Overstreet "); MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 621186b4240f..482b128b3e9d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -88,6 +88,8 @@ read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); read_attribute(congested); +read_attribute(cutoff_writeback); +read_attribute(cutoff_writeback_sync); rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); @@ -686,6 +688,9 @@ SHOW(__bch_cache_set) sysfs_print(congested_write_threshold_us, c->congested_write_threshold_us); + sysfs_print(cutoff_writeback, bch_cutoff_writeback); + sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync); + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); @@ -883,6 +888,8 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_copy_gc_enabled, &sysfs_gc_after_writeback, 
&sysfs_io_disable, + &sysfs_cutoff_writeback, + &sysfs_cutoff_writeback_sync, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index ce63be98a39d..6a743d3bb338 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,6 +5,9 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define CUTOFF_WRITEBACK_MAX 70 +#define CUTOFF_WRITEBACK_SYNC_MAX 90 + #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ @@ -55,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } } +extern unsigned int bch_cutoff_writeback; +extern unsigned int bch_cutoff_writeback_sync; + static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned int cache_mode, bool would_skip) { @@ -62,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - in_use > CUTOFF_WRITEBACK_SYNC) + in_use > bch_cutoff_writeback_sync) return false; if (dc->partial_stripes_expensive && @@ -75,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return (op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) || - in_use <= CUTOFF_WRITEBACK); + in_use <= bch_cutoff_writeback); } static inline void bch_writeback_queue(struct cached_dev *dc) -- cgit v1.2.3 From cc38ca7ed54a1df04ae9f23614b348ebfc918029 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Dec 2018 22:53:56 +0800 Subject: bcache: set writeback_percent in a flexible range Because CUTOFF_WRITEBACK is defined as 40, before the change to dynamic cutoff writeback values writeback_percent was limited to [0, CUTOFF_WRITEBACK]; any value larger than CUTOFF_WRITEBACK was fixed up to 40. Now the cutoff writeback limit is a dynamic value, bch_cutoff_writeback, so writeback_percent can use the more flexible range [0, bch_cutoff_writeback]. The flexibility is that this range can be extended to be larger or smaller than [0, 40], depending on how bch_cutoff_writeback is specified. The default value is still strongly recommended for most users and most workloads. But people who want to do research on bcache writeback performance tuning now have the chance to specify a more flexible writeback_percent in the range [0, 70]. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 482b128b3e9d..557a8a3270a1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -267,7 +267,8 @@ STORE(__cached_dev) d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, + 0, bch_cutoff_writeback); if (attr == &sysfs_writeback_rate) { ssize_t ret; -- cgit v1.2.3 From e78bd0d26f7396c7bd17be60d9686be8ef8e453a Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Thu, 13 Dec 2018 22:53:57 +0800 Subject: bcache: print number of keys in trace_bcache_journal_write Sometimes journal flushes can be very frequent, so it is useful to dump the number of keys every time the journal is written.
Signed-off-by: Guoju Fang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 2 +- include/trace/events/bcache.h | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 522c7426f3a0..b2fd412715b1 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl) REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); bch_bio_map(bio, w->data); - trace_bcache_journal_write(bio); + trace_bcache_journal_write(bio, w->data->keys); bio_list_add(&list, bio); SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 2cbd6e42ad83..e4526f85c19d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -221,9 +221,30 @@ DEFINE_EVENT(cache_set, bcache_journal_entry_full, TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_journal_write, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) +TRACE_EVENT(bcache_journal_write, + TP_PROTO(struct bio *bio, u32 keys), + TP_ARGS(bio, keys), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(u32, nr_keys ) + ), + + TP_fast_assign( + __entry->dev = bio_dev(bio); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + __entry->nr_keys = keys; + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + ), + + TP_printk("%d,%d %s %llu + %u keys %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector, + __entry->nr_keys) ); /* Btree */ -- cgit v1.2.3 From 3c94d83cb352627f221d971b05f163c17527de74 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Dec 2018 21:11:17 -0700 Subject: blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight() There's a single user of this function, dm, and dm just wants to check if IO is inflight, not that it's just allocated. This fixes a hang with srp/002 in blktests with dm, where it tries to suspend but waits for inflight IO to finish first. As it checks for just allocated requests, this fails. Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++-------- drivers/md/dm.c | 2 +- include/linux/blk-mq.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/block/blk-mq.c b/block/blk-mq.c index 6847f014606b..b0888a89fa66 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -805,14 +805,14 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); -static bool blk_mq_check_busy(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) { /* - * If we find a request, we know the queue is busy. Return false - * to stop the iteration. + * If we find a request that is inflight and the queue matches, + * we know the queue is busy. Return false to stop the iteration. 
*/ - if (rq->q == hctx->queue) { + if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { bool *busy = priv; *busy = true; @@ -822,14 +822,14 @@ static bool blk_mq_check_busy(struct blk_mq_hw_ctx *hctx, struct request *rq, return true; } -bool blk_mq_queue_busy(struct request_queue *q) +bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_busy, &busy); + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } -EXPORT_SYMBOL_GPL(blk_mq_queue_busy); +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req, bool reserved) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c414d40d645d..dddbca63e140 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -663,7 +663,7 @@ static bool md_in_flight_bios(struct mapped_device *md) static bool md_in_flight(struct mapped_device *md) { if (queue_is_mq(md->queue)) - return blk_mq_queue_busy(md->queue); + return blk_mq_queue_inflight(md->queue); else return md_in_flight_bios(md); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 57eda7b20243..d3c0a0d2680b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -257,7 +257,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -bool blk_mq_queue_busy(struct request_queue *q); +bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ -- cgit v1.2.3 From dbe3ece1287dafe4113c64ada3113c39f344c64a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Dec 2018 09:13:34 -0700 Subject: dm: don't reuse bio for flushes DM currently has a statically allocated bio that it uses to issue empty flushes. It doesn't submit this bio, it just uses it for maintaining state while setting up clones. Multiple users can access this bio at the same time. This wasn't previously an issue, even if it was a bit iffy, but with the blkg associations it can become one. We set up the blkg association, then clone bios and submit them, then remove the blkg association again. But since multiple tasks can do this at the same time, against multiple blkgs, we can either lose references to a blkg or put it twice. The latter causes complaints on the percpu ref being <= 0 when released, and can cause use-after-free as well. Ming reports that xfstest generic/475 triggers this: ------------[ cut here ]------------ percpu ref (blkg_release) <= 0 (0) after switching to atomic WARNING: CPU: 13 PID: 0 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x2c9/0x4a0 Switch to just using an on-stack bio for this, and get rid of the embedded bio.
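The race described above is easier to see in a stripped-down model. The sketch below is purely a userspace illustration: struct scratch, issue_flush_shared() and issue_flush_onstack() are invented names standing in for the shared md->flush_bio and the new on-stack bio, and are not DM code.

#include <stddef.h>

/* Toy stand-in for the per-flush scratch state (the bio in DM). */
struct scratch {
	void *blkg;		/* association taken at issue time */
};

/*
 * One statically allocated scratch object shared by every caller:
 * overlapping flushes all store their association here, so one task
 * can drop or double-put another task's reference.
 */
static struct scratch shared_flush;

static void issue_flush_shared(void *blkg)
{
	shared_flush.blkg = blkg;	/* racy if two flushes overlap */
	/* ... clone and submit using &shared_flush ... */
	shared_flush.blkg = NULL;
}

/*
 * On-stack scratch: each caller owns its own copy for the duration of
 * the call, so concurrent flushes cannot interfere with each other.
 */
static void issue_flush_onstack(void *blkg)
{
	struct scratch flush = { .blkg = blkg };

	/* ... clone and submit using &flush ... */
	(void)flush;
}

int main(void)
{
	issue_flush_shared(NULL);
	issue_flush_onstack(NULL);
	return 0;
}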
Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device") Reported-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 3 --- drivers/md/dm.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 6fe883fac471..95c6d86ab5e8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -106,9 +106,6 @@ struct mapped_device { struct block_device *bdev; - /* zero-length flush that will be cloned and submitted to targets */ - struct bio flush_bio; - struct dm_stats stats; /* for blk-mq request-based DM support */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dddbca63e140..f588a6a83d80 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1420,11 +1420,11 @@ static int __send_empty_flush(struct clone_info *ci) struct dm_target *ti; /* - * Empty flush uses a statically initialized bio, &md->flush_bio, as - * the base for cloning. However, blkg association requires that a - * bdev is associated with a gendisk, which doesn't happen until the - * bdev is opened. So, blkg association is done at issue time of the - * flush rather than when the device is created in alloc_dev(). + * Empty flush uses a statically initialized bio, as the base for + * cloning. However, blkg association requires that a bdev is + * associated with a gendisk, which doesn't happen until the bdev is + * opened. So, blkg association is done at issue time of the flush + * rather than when the device is created in alloc_dev(). */ bio_set_dev(ci->bio, ci->io->md->bdev); @@ -1609,7 +1609,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1665,7 +1674,16 @@ static blk_qc_t __process_bio(struct mapped_device *md, init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.io->md->flush_bio; + struct bio flush_bio; + + /* + * Use an on-stack bio for this, it's safe since we don't + * need to reference it after submit. It's just used as + * the basis for the clone(s). + */ + bio_init(&flush_bio, NULL, 0); + flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1949,9 +1967,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ -- cgit v1.2.3