diff options
author | Jens Axboe <axboe@kernel.dk> | 2020-09-24 22:44:39 +0300 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2020-09-24 22:44:39 +0300 |
commit | ac8f7a0264404c3e982ccabdc173d46d89ee7ea1 (patch) | |
tree | 97873bf5946b541fbf203bc88c0281ace1981ef2 /block | |
parent | 805c6d3c19210c90c109107d189744e960eae025 (diff) | |
parent | f56753ac2a90810726334df04d735e9f8f5a32d9 (diff) | |
download | linux-ac8f7a0264404c3e982ccabdc173d46d89ee7ea1.tar.xz |
Merge branch 'for-5.10/block' into for-5.10/drivers
* for-5.10/block: (140 commits)
bdi: replace BDI_CAP_NO_{WRITEBACK,ACCT_DIRTY} with a single flag
bdi: invert BDI_CAP_NO_ACCT_WB
bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag
mm: use SWP_SYNCHRONOUS_IO more intelligently
bdi: remove BDI_CAP_SYNCHRONOUS_IO
bdi: remove BDI_CAP_CGROUP_WRITEBACK
block: lift setting the readahead size into the block layer
md: update the optimal I/O size on reshape
bdi: initialize ->ra_pages and ->io_pages in bdi_init
aoe: set an optimal I/O size
bcache: inherit the optimal I/O size
drbd: remove dead code in device_to_statistics
fs: remove the unused SB_I_MULTIROOT flag
block: mark blkdev_get static
PM: mm: cleanup swsusp_swap_check
mm: split swap_type_of
PM: rewrite is_hibernate_resume_dev to not require an inode
mm: cleanup claim_swapfile
ocfs2: cleanup o2hb_region_dev_store
dasd: cleanup dasd_scan_partitions
...
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 2 | ||||
-rw-r--r-- | block/bfq-iosched.c | 9 | ||||
-rw-r--r-- | block/blk-cgroup.c | 32 | ||||
-rw-r--r-- | block/blk-core.c | 239 | ||||
-rw-r--r-- | block/blk-integrity.c | 4 | ||||
-rw-r--r-- | block/blk-iocost.c | 1547 | ||||
-rw-r--r-- | block/blk-map.c | 177 | ||||
-rw-r--r-- | block/blk-merge.c | 201 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 11 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 124 | ||||
-rw-r--r-- | block/blk-mq-sched.h | 3 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 152 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 56 | ||||
-rw-r--r-- | block/blk-mq.c | 86 | ||||
-rw-r--r-- | block/blk-mq.h | 76 | ||||
-rw-r--r-- | block/blk-settings.c | 40 | ||||
-rw-r--r-- | block/blk-sysfs.c | 277 | ||||
-rw-r--r-- | block/blk-throttle.c | 59 | ||||
-rw-r--r-- | block/blk.h | 25 | ||||
-rw-r--r-- | block/bsg-lib.c | 2 | ||||
-rw-r--r-- | block/genhd.c | 156 | ||||
-rw-r--r-- | block/ioctl.c | 29 | ||||
-rw-r--r-- | block/ioprio.c | 2 | ||||
-rw-r--r-- | block/kyber-iosched.c | 6 | ||||
-rw-r--r-- | block/mq-deadline.c | 6 | ||||
-rw-r--r-- | block/partitions/core.c | 27 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 2 |
27 files changed, 2116 insertions, 1234 deletions
diff --git a/block/Kconfig b/block/Kconfig index bbad5e8bbffe..a2297edfdde8 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -161,8 +161,6 @@ config BLK_WBT_MQ depends on BLK_WBT help Enable writeback throttling by default on multiqueue devices. - Multiqueue currently doesn't have support for IO scheduling, - enabling this option is recommended. config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fa98470df3f0..9e81d1052091 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + if (!atomic_read(&hctx->elevator_queued)) + return false; + /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); } } @@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); + atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) struct blk_mq_tags *tags = hctx->sched_tags; unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c195365c9817..f9b55614d67d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work) async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; + struct blk_plug plug; + bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock_bh(&blkg->async_bio_lock); @@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work) bio_list_init(&blkg->async_bios); spin_unlock_bh(&blkg->async_bio_lock); + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { + need_plug = true; + blk_start_plug(&plug); + } while ((bio = bio_list_pop(&bios))) submit_bio(bio); + if (need_plug) + blk_finish_plug(&plug); } /** @@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; + bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { - if (atomic_read(&blkg->use_delay)) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + blkcg_scale_delay(blkg, now); - delay_nsec = max_t(u64, delay_nsec, - atomic64_read(&blkg->delay_nsec)); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } } blkg = blkg->parent; } @@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the - * delays at 1 second. If there's 10's of seconds worth of delay then - * the tasks will be delayed for 1 second for every syscall. + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. */ - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); diff --git a/block/blk-core.c b/block/blk-core.c index 10c08ac50697..1cc4fa6bc7fe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->tag = -1; - rq->internal_tag = -1; + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); @@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id) if (!q->stats) goto fail_stats; - q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; - q->backing_dev_info->io_pages = VM_READAHEAD_PAGES; - q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; q->node = node_id; + atomic_set(&q->nr_active_requests_shared_sbitmap, 0); + timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); @@ -643,162 +642,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -static void blk_account_io_merge_bio(struct request *req) -{ - if (!blk_do_io_stat(req)) - return; - - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); -} - -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_back_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_backmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_free_ctx(bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_front_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_frontmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - bio->bi_next = req->bio; - req->bio = bio; - - req->__sector = bio->bi_iter.bi_sector; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_do_front_merge(req, bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio) -{ - unsigned short segments = blk_rq_nr_discard_segments(req); - - if (segments >= queue_max_discard_segments(q)) - goto no_merge; - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req, blk_rq_pos(req))) - goto no_merge; - - rq_qos_merge(q, req, bio); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - req->nr_phys_segments = segments + 1; - - blk_account_io_merge_bio(req); - return true; -no_merge: - req_set_nomerge(q, req); - return false; -} - -/** - * blk_attempt_plug_merge - try to merge with %current's plugged list - * @q: request_queue new bio is being queued at - * @bio: new bio being queued - * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) - * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, - * otherwise %false. - * - * Plugging coalesces IOs from the same issuer for the same purpose without - * going through @q->queue_lock. As such it's more of an issuing mechanism - * than scheduling, and the request, while may have elvpriv data, is not - * added on the elevator at this point. In addition, we don't have - * reliable access to the elevator outside queue lock. Only check basic - * merging parameters without querying the elevator. - * - * Caller must ensure !blk_queue_nomerges(q) beforehand. - */ -bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) -{ - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - - plug = blk_mq_plug(q, bio); - if (!plug) - return false; - - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - bool merged = false; - - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(rq, bio, nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(rq, bio, nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - break; - } - - if (merged) - return true; - } - - return false; -} - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio); * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ -static int blk_cloned_rq_check_limits(struct request_queue *q, +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), - blk_queue_get_max_sectors(q, req_op(rq))); - return -EIO; + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; } /* @@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", __func__, rq->nr_phys_segments, queue_max_segments(q)); - return -EIO; + return BLK_STS_IOERR; } - return 0; + return BLK_STS_OK; } /** @@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - if (blk_cloned_rq_check_limits(q, rq)) - return BLK_STS_IOERR; + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) @@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) +static unsigned long __part_start_io_acct(struct hd_struct *part, + unsigned int sectors, unsigned int op) { - struct hd_struct *part = &disk->part0; const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, return now; } + +unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, + struct bio *bio) +{ + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); + + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); +} +EXPORT_SYMBOL_GPL(part_start_io_acct); + +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) +{ + return __part_start_io_acct(&disk->part0, sectors, op); +} EXPORT_SYMBOL(disk_start_io_acct); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) +static void __part_end_io_acct(struct hd_struct *part, unsigned int op, + unsigned long start_time) { - struct hd_struct *part = &disk->part0; const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; @@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_unlock(); } + +void part_end_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time) +{ + __part_end_io_acct(part, bio_op(bio), start_time); + hd_struct_put(part); +} +EXPORT_SYMBOL_GPL(part_end_io_acct); + +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time) +{ + __part_end_io_acct(&disk->part0, op, start_time); +} EXPORT_SYMBOL(disk_end_io_acct); /* diff --git a/block/blk-integrity.c b/block/blk-integrity.c index c03705cbb9c9..2b36a8f9b813 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; - disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->ksm) { @@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register); */ void blk_integrity_unregister(struct gendisk *disk) { - disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d37b55db2409..ef9476fca1d8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -68,7 +68,7 @@ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add - * upto 1 (HWEIGHT_WHOLE). + * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) @@ -179,6 +179,8 @@ #include <linux/parser.h> #include <linux/sched/signal.h> #include <linux/blk-cgroup.h> +#include <asm/local.h> +#include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" @@ -215,36 +217,21 @@ enum { MAX_PERIOD = USEC_PER_SEC, /* - * A cgroup's vtime can run 50% behind the device vtime, which + * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ - MARGIN_PCT = 50, - INUSE_MARGIN_PCT = 10, + MARGIN_MIN_PCT = 10, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, - /* Have some play in waitq timer operations */ - WAITQ_TIMER_MARGIN_PCT = 5, + INUSE_ADJ_STEP_PCT = 25, - /* - * vtime can wrap well within a reasonable uptime when vrate is - * consistently raised. Don't trust recorded cgroup vtime if the - * period counter indicates that it's older than 5mins. - */ - VTIME_VALID_DUR = 300 * USEC_PER_SEC, - - /* - * Remember the past three non-zero usages and use the max for - * surplus calculation. Three slots guarantee that we remember one - * full period usage from the last active stretch even after - * partial deactivation and re-activation periods. Don't start - * giving away weight before collecting two data points to prevent - * hweight adjustments based on one partial activation period. - */ - NR_USAGE_SLOTS = 3, - MIN_VALID_USAGES = 2, + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ - HWEIGHT_WHOLE = 1 << 16, + WEIGHT_ONE = 1 << 16, /* * As vtime is used to calculate the cost of each IO, it needs to @@ -275,16 +262,40 @@ enum { /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, - /* don't let cmds which take a very long time pin lagging for too long */ - MAX_LAGGING_PERIODS = 10, + /* + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. + */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, /* - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, - * donate the surplus. + * Halve debts if total usage keeps staying under 25% w/o any shortages + * for over 100ms. */ - SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + DEBT_BUSY_USAGE_PCT = 25, + DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, @@ -372,9 +383,15 @@ struct ioc_params { u32 too_slow_vrate_pct; }; +struct ioc_margins { + s64 min; + s64 low; + s64 target; +}; + struct ioc_missed { - u32 nr_met; - u32 nr_missed; + local_t nr_met; + local_t nr_missed; u32 last_met; u32 last_missed; }; @@ -382,7 +399,7 @@ struct ioc_missed { struct ioc_pcpu_stat { struct ioc_missed missed[2]; - u64 rq_wait_ns; + local64_t rq_wait_ns; u64 last_rq_wait_ns; }; @@ -393,8 +410,9 @@ struct ioc { bool enabled; struct ioc_params params; + struct ioc_margins margins; u32 period_us; - u32 margin_us; + u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; @@ -405,18 +423,22 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; seqcount_spinlock_t period_seqcount; - u32 period_at; /* wallclock starttime */ + u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ - u64 inuse_margin_vtime; bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ + /* the last time debt cancel condition wasn't met */ + u64 debt_busy_at; + u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; @@ -424,6 +446,17 @@ struct ioc { bool user_cost_model:1; }; +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; + u64 wait_us; + u64 indebt_us; + u64 indelay_us; +}; + /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; @@ -443,12 +476,17 @@ struct ioc_gq { * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; + u32 last_inuse; + s64 saved_margin; sector_t cursor; /* to detect randio */ @@ -461,14 +499,14 @@ struct ioc_gq { * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. - * - * `last_vtime` is used to remember `vtime` at the end of the last - * period to calculate utilization. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; - u64 last_vtime; + + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; /* * The period this iocg was last active in. Used for deactivation @@ -477,21 +515,35 @@ struct ioc_gq { atomic64_t active_period; struct list_head active_list; - /* see __propagate_active_weight() and current_hweight() for details */ + /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; + u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; - bool has_surplus; + u32 hweight_donating; + u32 hweight_after_donation; + + struct list_head walk_list; + struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; - struct hrtimer delay_timer; - /* usage is recorded as fractions of HWEIGHT_WHOLE */ - int usage_idx; - u32 usages[NR_USAGE_SLOTS]; + /* timestamp at the latest activation */ + u64 activated_at; + + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat local_stat; + struct iocg_stat desc_stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + u64 usage_delta_us; + u64 wait_since; + u64 indebt_since; + u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -506,7 +558,7 @@ struct ioc_cgrp { struct ioc_now { u64 now_ns; - u32 now; + u64 now; u64 vnow; u64 vrate; }; @@ -656,7 +708,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* @@ -664,18 +716,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } -static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) { + struct iocg_pcpu_stat *gcs; + bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = ioc->vtime_base_rate; + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; +} + /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { @@ -709,9 +799,10 @@ static void ioc_refresh_period_us(struct ioc *ioc) /* calculate dependent params */ ioc->period_us = period_us; - ioc->margin_us = period_us * MARGIN_PCT / 100; - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); } static int ioc_autop_idx(struct ioc *ioc) @@ -738,8 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc) return idx; /* step up/down based on the vrate */ - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, - VTIME_PER_USEC); + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = ktime_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { @@ -847,6 +937,43 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) return true; } +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. + */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); + + ioc->vtime_err += vcomp * pleft; + + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); +done: + /* bound how much error can accumulate */ + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -886,16 +1013,25 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level - * weight sums and propagate upwards accordingly. + * weight sums and propagate upwards accordingly. If @save, the current margin + * is saved to be used as reference for later inuse in-period adjustments. */ -static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); - inuse = min(active, inuse); + inuse = clamp_t(u32, inuse, 1, active); + + iocg->last_inuse = iocg->inuse; + if (save) + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); + + if (active == iocg->active && inuse == iocg->inuse) + return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; @@ -933,7 +1069,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse ioc->weights_updated = true; } -static void commit_active_weights(struct ioc *ioc) +static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); @@ -945,10 +1081,11 @@ static void commit_active_weights(struct ioc *ioc) } } -static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { - __propagate_active_weight(iocg, active, inuse); - commit_active_weights(iocg->ioc); + __propagate_weights(iocg, active, inuse, save, now); + commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) @@ -964,9 +1101,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep goto out; /* - * Paired with wmb in commit_active_weights(). If we saw the - * updated hweight_gen, all the weight updates from - * __propagate_active_weight() are visible too. + * Paired with wmb in commit_weights(). If we saw the updated + * hweight_gen, all the weight updates from __propagate_weights() are + * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future @@ -975,12 +1112,12 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep */ smp_rmb(); - hwa = hwi = HWEIGHT_WHOLE; + hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; - u32 active_sum = READ_ONCE(parent->child_active_sum); - u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u64 active_sum = READ_ONCE(parent->child_active_sum); + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); @@ -988,11 +1125,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep if (!active_sum || !inuse_sum) continue; - active_sum = max(active, active_sum); - hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + active_sum = max_t(u64, active, active_sum); + hwa = div64_u64((u64)hwa * active, active_sum); - inuse_sum = max(inuse, inuse_sum); - hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + inuse_sum = max_t(u64, inuse, inuse_sum); + hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); @@ -1005,7 +1142,33 @@ out: *hw_inusep = iocg->hweight_inuse; } -static void weight_updated(struct ioc_gq *iocg) +/* + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the + * other weights stay unchanged. + */ +static u32 current_hweight_max(struct ioc_gq *iocg) +{ + u32 hwm = WEIGHT_ONE; + u32 inuse = iocg->active; + u64 child_inuse_sum; + int lvl; + + lockdep_assert_held(&iocg->ioc->lock); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, + parent->child_active_sum); + } + + return max_t(u32, hwm, 1); +} + +static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); @@ -1016,16 +1179,15 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_active_weight(iocg, weight, - DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period, max_period_delta; - u64 vtime, vmargin, vmin; + u64 last_period, cur_period; + u64 vtime, vtarget; int i; /* @@ -1064,22 +1226,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) goto fail_unlock; /* - * vtime may wrap when vrate is raised substantially due to - * underestimated IO costs. Look at the period and ignore its - * vtime if the iocg has been idle for too long. Also, cap the - * budget it can start with to the margin. + * Always start with the target budget. On deactivation, we throw away + * anything above it. */ - max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); - vmargin = ioc->margin_us * now->vrate; - vmin = now->vnow - vmargin; - if (last_period + max_period_delta < cur_period || - time_before64(vtime, vmin)) { - atomic64_add(vmin - vtime, &iocg->vtime); - atomic64_add(vmin - vtime, &iocg->done_vtime); - vtime = vmin; - } + atomic64_add(vtarget - vtime, &iocg->vtime); + atomic64_add(vtarget - vtime, &iocg->done_vtime); + vtime = vtarget; /* * Activate, propagate weight and start period timer if not @@ -1088,16 +1243,18 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); - propagate_active_weight(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + + propagate_weights(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); - iocg->last_vtime = vtime; + iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; + ioc->debt_busy_at = now->now; ioc_start_period(ioc, now); } @@ -1110,6 +1267,110 @@ fail_unlock: return false; } +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 tdelta, delay, new_delay; + s64 vover, vover_pct; + u32 hwa; + + lockdep_assert_held(&iocg->waitq.lock); + + /* calculate the current delay in effect - 1/2 every second */ + tdelta = now->now - iocg->delay_at; + if (iocg->delay) + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + else + delay = 0; + + /* calculate the new delay from the debt amount */ + current_hweight(iocg, &hwa, NULL); + vover = atomic64_read(&iocg->vtime) + + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; + vover_pct = div64_s64(100 * vover, + ioc->period_us * ioc->vtime_base_rate); + + if (vover_pct <= MIN_DELAY_THR_PCT) + new_delay = 0; + else if (vover_pct >= MAX_DELAY_THR_PCT) + new_delay = MAX_DELAY; + else + new_delay = MIN_DELAY + + div_u64((MAX_DELAY - MIN_DELAY) * + (vover_pct - MIN_DELAY_THR_PCT), + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); + + /* pick the higher one and apply */ + if (new_delay > delay) { + iocg->delay = new_delay; + iocg->delay_at = now->now; + delay = new_delay; + } + + if (delay >= MIN_DELAY) { + if (!iocg->indelay_since) + iocg->indelay_since = now->now; + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); + return true; + } else { + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->indelay_since = 0; + } + iocg->delay = 0; + blkcg_clear_delay(blkg); + return false; + } +} + +static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, + struct ioc_now *now) +{ + struct iocg_pcpu_stat *gcs; + + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + WARN_ON_ONCE(list_empty(&iocg->active_list)); + + /* + * Once in debt, debt handling owns inuse. @iocg stays at the minimum + * inuse donating all of it share to others until its debt is paid off. + */ + if (!iocg->abs_vdebt && abs_cost) { + iocg->indebt_since = now->now; + propagate_weights(iocg, iocg->active, 0, false, now); + } + + iocg->abs_vdebt += abs_cost; + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, + struct ioc_now *now) +{ + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + + /* make sure that nobody messed with @iocg */ + WARN_ON_ONCE(list_empty(&iocg->active_list)); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); + + /* if debt is paid in full, restore inuse */ + if (!iocg->abs_vdebt) { + iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->indebt_since = 0; + + propagate_weights(iocg, iocg->active, iocg->last_inuse, + false, now); + } +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { @@ -1122,7 +1383,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, if (ctx->vbudget < 0) return -1; - iocg_commit_bio(ctx->iocg, wait->bio, cost); + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); /* * autoremove_wake_function() removes the wait entry only when it @@ -1136,132 +1397,106 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, return 0; } -static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +/* + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters + * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in + * addition to iocg->waitq.lock. + */ +static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, + struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 margin_ns = (u64)(ioc->period_us * - WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; - u64 vdebt, vshortage, expires, oexpires; + u64 vshortage, expires, oexpires; s64 vbudget; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - current_hweight(iocg, NULL, &hw_inuse); + current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - if (vdebt && vbudget > 0) { - u64 delta = min_t(u64, vbudget, vdebt); - u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - iocg->abs_vdebt); + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); + + lockdep_assert_held(&ioc->lock); + + atomic64_add(vpay, &iocg->vtime); + atomic64_add(vpay, &iocg->done_vtime); + iocg_pay_debt(iocg, abs_vpay, now); + vbudget -= vpay; + } + + if (iocg->abs_vdebt || iocg->delay) + iocg_kick_delay(iocg, now); - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->abs_vdebt -= abs_delta; + /* + * Debt can still be outstanding if we haven't paid all yet or the + * caller raced and called without @pay_debt. Shouldn't wake up waiters + * under debt. Make sure @vbudget reflects the outstanding amount and is + * not positive. + */ + if (iocg->abs_vdebt) { + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); + vbudget = min_t(s64, 0, vbudget - vdebt); } /* - * Wake up the ones which are due and see how much vtime we'll need - * for the next one. + * Wake up the ones which are due and see how much vtime we'll need for + * the next one. As paying off debt restores hw_inuse, it must be read + * after the above debt payment. */ - ctx.hw_inuse = hw_inuse; - ctx.vbudget = vbudget - vdebt; + ctx.vbudget = vbudget; + current_hweight(iocg, NULL, &ctx.hw_inuse); + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); - if (!waitqueue_active(&iocg->waitq)) + + if (!waitqueue_active(&iocg->waitq)) { + if (iocg->wait_since) { + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = 0; + } return; + } + + if (!iocg->wait_since) + iocg->wait_since = now->now; + if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; - /* determine next wakeup, add a quarter margin to guarantee chunking */ + /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + - DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; - expires += margin_ns / 4; + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * + NSEC_PER_USEC; + expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); - spin_lock_irqsave(&iocg->waitq.lock, flags); - iocg_kick_waitq(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; -} - -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) -{ - struct ioc *ioc = iocg->ioc; - struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 delta_ns, expires, oexpires; - u32 hw_inuse; - - lockdep_assert_held(&iocg->waitq.lock); - - /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { - blkcg_clear_delay(blkg); - return false; - } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) - return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + iocg_lock(iocg, pay_debt, &flags); + iocg_kick_waitq(iocg, pay_debt, &now); + iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } @@ -1278,8 +1513,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { - u32 this_met = READ_ONCE(stat->missed[rw].nr_met); - u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + u32 this_met = local_read(&stat->missed[rw].nr_met); + u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; @@ -1287,7 +1522,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p stat->missed[rw].last_missed = this_missed; } - this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } @@ -1322,18 +1557,426 @@ static bool iocg_is_idle(struct ioc_gq *iocg) return true; } -/* returns usage with margin added if surplus is large enough */ -static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +/* + * Call this function on the target leaf @iocg's to build pre-order traversal + * list of all the ancestors in @inner_walk. The inner nodes are linked through + * ->walk_list and the caller is responsible for dissolving the list after use. + */ +static void iocg_build_inner_walk(struct ioc_gq *iocg, + struct list_head *inner_walk) { - /* add margin */ - usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); - usage += SURPLUS_SCALE_ABS; + int lvl; - /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) - return 0; + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + + /* find the first ancestor which hasn't been visited yet */ + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) + break; + } + + /* walk down and visit the inner nodes to get pre-order traversal */ + while (++lvl <= iocg->level - 1) { + struct ioc_gq *inner = iocg->ancestors[lvl]; + + /* record traversal order */ + list_add_tail(&inner->walk_list, inner_walk); + } +} + +/* collect per-cpu counters and propagate the deltas to the parent */ +static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct iocg_stat new_stat; + u64 abs_vusage = 0; + u64 vusage_delta; + int cpu; + + lockdep_assert_held(&iocg->ioc->lock); + + /* collect per-cpu counters */ + for_each_possible_cpu(cpu) { + abs_vusage += local64_read( + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); + } + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; + iocg->last_stat_abs_vusage = abs_vusage; + + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); + iocg->local_stat.usage_us += iocg->usage_delta_us; + + /* propagate upwards */ + new_stat.usage_us = + iocg->local_stat.usage_us + iocg->desc_stat.usage_us; + new_stat.wait_us = + iocg->local_stat.wait_us + iocg->desc_stat.wait_us; + new_stat.indebt_us = + iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; + new_stat.indelay_us = + iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; + + /* propagate the deltas to the parent */ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->desc_stat; + + parent_stat->usage_us += + new_stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + new_stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + new_stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + new_stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = new_stat; +} - return usage; +/* get stat counters ready for reading on all active iocgs */ +static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) +{ + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg; + + /* flush leaves and build inner node walk list */ + list_for_each_entry(iocg, target_iocgs, active_list) { + iocg_flush_stat_one(iocg, now); + iocg_build_inner_walk(iocg, &inner_walk); + } + + /* keep flushing upwards by walking the inner list backwards */ + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { + iocg_flush_stat_one(iocg, now); + list_del_init(&iocg->walk_list); + } +} + +/* + * Determine what @iocg's hweight_inuse should be after donating unused + * capacity. @hwm is the upper bound and used to signal no donation. This + * function also throws away @iocg's excess budget. + */ +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, + u32 usage, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess, delta, target, new_hwi; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return 1; + + /* see whether minimum margin requirement is met */ + if (waitqueue_active(&iocg->waitq) || + time_after64(vtime, now->vnow - ioc->margins.min)) + return hwm; + + /* throw away excess above target */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + atomic64_add(excess, &iocg->vtime); + atomic64_add(excess, &iocg->done_vtime); + vtime += excess; + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); + } + + /* + * Let's say the distance between iocg's and device's vtimes as a + * fraction of period duration is delta. Assuming that the iocg will + * consume the usage determined above, we want to determine new_hwi so + * that delta equals MARGIN_TARGET at the end of the next period. + * + * We need to execute usage worth of IOs while spending the sum of the + * new budget (1 - MARGIN_TARGET) and the leftover from the last period + * (delta): + * + * usage = (1 - MARGIN_TARGET + delta) * new_hwi + * + * Therefore, the new_hwi is: + * + * new_hwi = usage / (1 - MARGIN_TARGET + delta) + */ + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), + now->vnow - ioc->period_at_vtime); + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); + + return clamp_t(s64, new_hwi, 1, hwm); +} + +/* + * For work-conservation, an iocg which isn't using all of its share should + * donate the leftover to other iocgs. There are two ways to achieve this - 1. + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. + * + * #1 is mathematically simpler but has the drawback of requiring synchronous + * global hweight_inuse updates when idle iocg's get activated or inuse weights + * change due to donation snapbacks as it has the possibility of grossly + * overshooting what's allowed by the model and vrate. + * + * #2 is inherently safe with local operations. The donating iocg can easily + * snap back to higher weights when needed without worrying about impacts on + * other nodes as the impacts will be inherently correct. This also makes idle + * iocg activations safe. The only effect activations have is decreasing + * hweight_inuse of others, the right solution to which is for those iocgs to + * snap back to higher weights. + * + * So, we go with #2. The challenge is calculating how each donating iocg's + * inuse should be adjusted to achieve the target donation amounts. This is done + * using Andy's method described in the following pdf. + * + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo + * + * Given the weights and target after-donation hweight_inuse values, Andy's + * method determines how the proportional distribution should look like at each + * sibling level to maintain the relative relationship between all non-donating + * pairs. To roughly summarize, it divides the tree into donating and + * non-donating parts, calculates global donation rate which is used to + * determine the target hweight_inuse for each node, and then derives per-level + * proportions. + * + * The following pdf shows that global distribution calculated this way can be + * achieved by scaling inuse weights of donating leaves and propagating the + * adjustments upwards proportionally. + * + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE + * + * Combining the above two, we can determine how each leaf iocg's inuse should + * be adjusted to achieve the target donation. + * + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN + * + * The inline comments use symbols from the last pdf. + * + * b is the sum of the absolute budgets in the subtree. 1 for the root node. + * f is the sum of the absolute budgets of non-donating nodes in the subtree. + * t is the sum of the absolute budgets of donating nodes in the subtree. + * w is the weight of the node. w = w_f + w_t + * w_f is the non-donating portion of w. w_f = w * f / b + * w_b is the donating portion of w. w_t = w * t / b + * s is the sum of all sibling weights. s = Sum(w) for siblings + * s_f and s_t are the non-donating and donating portions of s. + * + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. + * w_pt is the donating portion of the parent's weight and w'_pt the same value + * after adjustments. Subscript r denotes the root node's values. + */ +static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) +{ + LIST_HEAD(over_hwa); + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg, *root_iocg; + u32 after_sum, over_sum, over_target, gamma; + + /* + * It's pretty unlikely but possible for the total sum of + * hweight_after_donation's to be higher than WEIGHT_ONE, which will + * confuse the following calculations. If such condition is detected, + * scale down everyone over its full share equally to keep the sum below + * WEIGHT_ONE. + */ + after_sum = 0; + over_sum = 0; + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 hwa; + + current_hweight(iocg, &hwa, NULL); + after_sum += iocg->hweight_after_donation; + + if (iocg->hweight_after_donation > hwa) { + over_sum += iocg->hweight_after_donation; + list_add(&iocg->walk_list, &over_hwa); + } + } + + if (after_sum >= WEIGHT_ONE) { + /* + * The delta should be deducted from the over_sum, calculate + * target over_sum value. + */ + u32 over_delta = after_sum - (WEIGHT_ONE - 1); + WARN_ON_ONCE(over_sum <= over_delta); + over_target = over_sum - over_delta; + } else { + over_target = 0; + } + + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { + if (over_target) + iocg->hweight_after_donation = + div_u64((u64)iocg->hweight_after_donation * + over_target, over_sum); + list_del_init(&iocg->walk_list); + } + + /* + * Build pre-order inner node walk list and prepare for donation + * adjustment calculations. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + iocg_build_inner_walk(iocg, &inner_walk); + } + + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); + WARN_ON_ONCE(root_iocg->level > 0); + + list_for_each_entry(iocg, &inner_walk, walk_list) { + iocg->child_adjusted_sum = 0; + iocg->hweight_donating = 0; + iocg->hweight_after_donation = 0; + } + + /* + * Propagate the donating budget (b_t) and after donation budget (b'_t) + * up the hierarchy. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { + if (iocg->level > 0) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + } + + /* + * Calculate inner hwa's (b) and make sure the donation values are + * within the accepted ranges as we're doing low res calculations with + * roundups. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + if (iocg->level) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + iocg->hweight_active = DIV64_U64_ROUND_UP( + (u64)parent->hweight_active * iocg->active, + parent->child_active_sum); + + } + + iocg->hweight_donating = min(iocg->hweight_donating, + iocg->hweight_active); + iocg->hweight_after_donation = min(iocg->hweight_after_donation, + iocg->hweight_donating - 1); + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || + iocg->hweight_donating <= 1 || + iocg->hweight_after_donation == 0)) { + pr_warn("iocg: invalid donation weights in "); + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); + pr_cont(": active=%u donating=%u after=%u\n", + iocg->hweight_active, iocg->hweight_donating, + iocg->hweight_after_donation); + } + } + + /* + * Calculate the global donation rate (gamma) - the rate to adjust + * non-donating budgets by. + * + * No need to use 64bit multiplication here as the first operand is + * guaranteed to be smaller than WEIGHT_ONE (1<<16). + * + * We know that there are beneficiary nodes and the sum of the donating + * hweights can't be whole; however, due to the round-ups during hweight + * calculations, root_iocg->hweight_donating might still end up equal to + * or greater than whole. Limit the range when calculating the divider. + * + * gamma = (1 - t_r') / (1 - t_r) + */ + gamma = DIV_ROUND_UP( + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, + WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); + + /* + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner + * nodes. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + struct ioc_gq *parent; + u32 inuse, wpt, wptp; + u64 st, sf; + + if (iocg->level == 0) { + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), + WEIGHT_ONE - iocg->hweight_after_donation); + continue; + } + + parent = iocg->ancestors[iocg->level - 1]; + + /* b' = gamma * b_f + b_t' */ + iocg->hweight_inuse = DIV64_U64_ROUND_UP( + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), + WEIGHT_ONE) + iocg->hweight_after_donation; + + /* w' = s' * b' / b'_p */ + inuse = DIV64_U64_ROUND_UP( + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, + parent->hweight_inuse); + + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ + st = DIV64_U64_ROUND_UP( + iocg->child_active_sum * iocg->hweight_donating, + iocg->hweight_active); + sf = iocg->child_active_sum - st; + wpt = DIV64_U64_ROUND_UP( + (u64)iocg->active * iocg->hweight_donating, + iocg->hweight_active); + wptp = DIV64_U64_ROUND_UP( + (u64)inuse * iocg->hweight_after_donation, + iocg->hweight_inuse); + + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); + } + + /* + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and + * we can finally determine leaf adjustments. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + u32 inuse; + + /* + * In-debt iocgs participated in the donation calculation with + * the minimum target hweight_inuse. Configuring inuse + * accordingly would work fine but debt handling expects + * @iocg->inuse stay at the minimum and we don't wanna + * interfere. + */ + if (iocg->abs_vdebt) { + WARN_ON_ONCE(iocg->inuse > 1); + continue; + } + + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ + inuse = DIV64_U64_ROUND_UP( + parent->child_adjusted_sum * iocg->hweight_after_donation, + parent->hweight_inuse); + + TRACE_IOCG_PATH(inuse_transfer, iocg, now, + iocg->inuse, inuse, + iocg->hweight_inuse, + iocg->hweight_after_donation); + + __propagate_weights(iocg, iocg->active, inuse, true, now); + } + + /* walk list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) + list_del_init(&iocg->walk_list); } static void ioc_timer_fn(struct timer_list *timer) @@ -1341,12 +1984,14 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; - int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; + LIST_HEAD(surpluses); + int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; - int prev_busy_level, i; + int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); @@ -1370,30 +2015,71 @@ static void ioc_timer_fn(struct timer_list *timer) */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && - !iocg_is_idle(iocg)) + !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->local_stat.wait_us += now.now - iocg->wait_since; + iocg->wait_since = now.now; + } + if (iocg->indebt_since) { + iocg->local_stat.indebt_us += + now.now - iocg->indebt_since; + iocg->indebt_since = now.now; + } + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += + now.now - iocg->indelay_since; + iocg->indelay_since = now.now; + } + + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now); + iocg_kick_waitq(iocg, true, &now); + if (iocg->abs_vdebt) + nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ - iocg->last_inuse = iocg->inuse; - __propagate_active_weight(iocg, 0, 0); + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now.vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + + __propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } - commit_active_weights(ioc); + commit_weights(ioc); + + /* + * Wait and indebt stat are flushed above and the donation calculation + * below needs updated usage stat. Let's bring stat up-to-date. + */ + iocg_flush_stat(&ioc->active_iocgs, &now); - /* calc usages and see whether some weights need to be moved around */ + /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmargin, vmin; - u32 hw_active, hw_inuse, usage; + u64 vdone, vtime, usage_us, usage_dur; + u32 usage, hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1417,116 +2103,105 @@ static void ioc_timer_fn(struct timer_list *timer) time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; - if (waitqueue_active(&iocg->waitq)) - vusage = now.vnow - iocg->last_vtime; - else if (time_before64(iocg->last_vtime, vtime)) - vusage = vtime - iocg->last_vtime; - else - vusage = 0; - - iocg->last_vtime += vusage; /* - * Factor in in-flight vtime into vusage to avoid - * high-latency completions appearing as idle. This should - * be done after the above ->last_time adjustment. + * Determine absolute usage factoring in in-flight IOs to avoid + * high-latency completions appearing as idle. */ - vusage = max(vusage, vtime - vdone); - - /* calculate hweight based usage ratio and record */ - if (vusage) { - usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, - period_vtime); - iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; - iocg->usages[iocg->usage_idx] = usage; - } else { - usage = 0; + usage_us = iocg->usage_delta_us; + usage_us_sum += usage_us; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + usage_us = max(usage_us, inflight_us); } + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); + /* see whether there's surplus vtime */ - vmargin = ioc->margin_us * now.vrate; - vmin = now.vnow - vmargin; - - iocg->has_surplus = false; - - if (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, vmin)) { - u64 delta = vmin - vtime; - - /* throw away surplus vtime */ - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->last_vtime += delta; - /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { - iocg->has_surplus = true; - nr_surpluses++; - } - } else if (hw_inuse < hw_active) { - u32 new_hwi, new_inuse; + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + if (hw_inuse < hw_active || + (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, now.vnow - ioc->margins.low))) { + u32 hwa, old_hwi, hwm, new_hwi; - /* was donating but might need to take back some */ - if (waitqueue_active(&iocg->waitq)) { - new_hwi = hw_active; + /* + * Already donating or accumulated enough to start. + * Determine the donation amount. + */ + current_hweight(iocg, &hwa, &old_hwi); + hwm = current_hweight_max(iocg); + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, + usage, &now); + if (new_hwi < hwm) { + iocg->hweight_donating = hwa; + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); } else { - new_hwi = max(hw_inuse, - usage * SURPLUS_SCALE_PCT / 100 + - SURPLUS_SCALE_ABS); - } + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); - new_inuse = div64_u64((u64)iocg->inuse * new_hwi, - hw_inuse); - new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); - - if (new_inuse > iocg->inuse) { - TRACE_IOCG_PATH(inuse_takeback, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, - new_inuse); + __propagate_weights(iocg, iocg->active, + iocg->active, true, &now); + nr_shortages++; } } else { - /* genuninely out of vtime */ + /* genuinely short on vtime */ nr_shortages++; } } - if (!nr_shortages || !nr_surpluses) - goto skip_surplus_transfers; + if (!list_empty(&surpluses) && nr_shortages) + transfer_surpluses(&surpluses, &now); - /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; - int nr_valid = 0; + commit_weights(ioc); - if (!iocg->has_surplus) - continue; + /* surplus list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) + list_del_init(&iocg->surplus_list); - /* base the decision on max historical usage */ - for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { - if (iocg->usages[i]) { - usage = max(usage, iocg->usages[i]); - nr_valid++; + /* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot + * of memory paired with a slow IO device, the debt can span multiple + * seconds or more. If there are no other subsequent IO issuers, the + * in-debt iocg may end up blocked paying its debt while the IO device + * is idle. + * + * The following protects against such pathological cases. If the device + * has been sufficiently idle for a substantial amount of time, the + * debts are halved. The criteria are on the conservative side as we + * want to resolve the rare extreme cases without impacting regular + * operation by forgiving debts too readily. + */ + if (nr_shortages || + div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >= + DEBT_BUSY_USAGE_PCT) + ioc->debt_busy_at = now.now; + + if (nr_debtors && + now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + if (iocg->abs_vdebt) { + spin_lock(&iocg->waitq.lock); + iocg->abs_vdebt /= 2; + iocg_kick_waitq(iocg, true, &now); + spin_unlock(&iocg->waitq.lock); } } - if (nr_valid < MIN_VALID_USAGES) - continue; - - current_hweight(iocg, &hw_active, &hw_inuse); - new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); - if (!new_hwi) - continue; - - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - hw_inuse); - if (new_inuse < iocg->inuse) { - TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, new_inuse); - } + ioc->debt_busy_at = now.now; } -skip_surplus_transfers: - commit_active_weights(ioc); /* * If q is getting clogged or we're missing too much, we're issuing @@ -1554,11 +2229,9 @@ skip_surplus_transfers: /* * If there are IOs spanning multiple periods, wait - * them out before pushing the device harder. If - * there are surpluses, let redistribution work it - * out first. + * them out before pushing the device harder. */ - if (!nr_lagging && !nr_surpluses) + if (!nr_lagging) ioc->busy_level--; } else { /* @@ -1577,7 +2250,7 @@ skip_surplus_transfers: ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; /* rq_wait signal is always reliable, ignore user vrate_min */ @@ -1612,16 +2285,14 @@ skip_surplus_transfers: } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages, - nr_surpluses); + nr_lagging, nr_shortages); - atomic64_set(&ioc->vtime_rate, vrate); - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages, nr_surpluses); + nr_shortages); } ioc_refresh_params(ioc, false); @@ -1637,11 +2308,74 @@ skip_surplus_transfers: ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; + ioc->vtime_err = 0; ioc->running = IOC_IDLE; } + + ioc_refresh_vrate(ioc, &now); + } + + spin_unlock_irq(&ioc->lock); +} + +static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, + u64 abs_cost, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct ioc_margins *margins = &ioc->margins; + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; + u32 hwi, adj_step; + s64 margin; + u64 cost, new_inuse; + + current_hweight(iocg, NULL, &hwi); + old_hwi = hwi; + cost = abs_cost_to_cost(abs_cost, hwi); + margin = now->vnow - vtime - cost; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return cost; + + /* + * We only increase inuse during period and do so iff the margin has + * deteriorated since the previous adjustment. + */ + if (margin >= iocg->saved_margin || margin >= margins->low || + iocg->inuse == iocg->active) + return cost; + + spin_lock_irq(&ioc->lock); + + /* we own inuse only when @iocg is in the normal active state */ + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { + spin_unlock_irq(&ioc->lock); + return cost; } + /* + * Bump up inuse till @abs_cost fits in the existing budget. + * adj_step must be determined after acquiring ioc->lock - we might + * have raced and lost to another thread for activation and could + * be reading 0 iocg->active before ioc->lock which will lead to + * infinite loop. + */ + new_inuse = iocg->inuse; + adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + do { + new_inuse = new_inuse + adj_step; + propagate_weights(iocg, iocg->active, new_inuse, true, now); + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + } while (time_after64(vtime + cost, now->vnow) && + iocg->inuse != iocg->active); + spin_unlock_irq(&ioc->lock); + + TRACE_IOCG_PATH(inuse_adjust, iocg, now, + old_inuse, iocg->inuse, old_hwi, hwi); + + return cost; } static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, @@ -1725,38 +2459,25 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; - u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; + bool use_debt, ioc_locked; + unsigned long flags; /* bypass IOs if disabled or for root cgroup */ if (!ioc->enabled || !iocg->level) return; - /* always activate so that even 0 cost IOs get protected to some level */ - if (!iocg_activate(iocg, &now)) - return; - /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; - iocg->cursor = bio_end_sector(bio); + if (!iocg_activate(iocg, &now)) + return; + iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); - current_hweight(iocg, &hw_active, &hw_inuse); - - if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { - TRACE_IOCG_PATH(inuse_reset, iocg, &now, - iocg->inuse, iocg->weight, hw_inuse, hw_active); - spin_lock_irq(&ioc->lock); - propagate_active_weight(iocg, iocg->weight, iocg->weight); - spin_unlock_irq(&ioc->lock); - current_hweight(iocg, &hw_active, &hw_inuse); - } - - cost = abs_cost_to_cost(abs_cost, hw_inuse); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The @@ -1765,21 +2486,32 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as long - * as we're waiting or has debt, so we're good if we're activated - * here. In the unlikely case that we aren't, just issue the IO. + * We're over budget. This can be handled in two ways. IOs which may + * cause priority inversions are punted to @ioc->aux_iocg and charged as + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine + * whether debt handling is needed and acquire locks accordingly. */ - spin_lock_irq(&iocg->waitq.lock); + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); +retry_lock: + iocg_lock(iocg, ioc_locked, &flags); + /* + * @iocg must stay activated for debt and waitq handling. Deactivation + * is synchronized against both ioc->lock and waitq.lock and we won't + * get deactivated as long as we're waiting or has debt, so we're good + * if we're activated here. In the unlikely cases that we aren't, just + * issue the IO. + */ if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); - iocg_commit_bio(iocg, bio, cost); + iocg_unlock(iocg, ioc_locked, &flags); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1800,15 +2532,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ - if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { - iocg->abs_vdebt += abs_cost; + if (use_debt) { + iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); return; } + /* guarantee that iocgs w/ waiters have maximum inuse */ + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { + if (!ioc_locked) { + iocg_unlock(iocg, false, &flags); + ioc_locked = true; + goto retry_lock; + } + propagate_weights(iocg, iocg->active, iocg->active, true, + &now); + } + /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the @@ -1829,9 +2572,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, ioc_locked, &now); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -1851,8 +2594,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct ioc *ioc = iocg->ioc; sector_t bio_end = bio_end_sector(bio); struct ioc_now now; - u32 hw_inuse; - u64 abs_cost, cost; + u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled or for root cgroup */ @@ -1864,8 +2606,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, return; ioc_now(ioc, &now); - current_hweight(iocg, NULL, &hw_inuse); - cost = abs_cost_to_cost(abs_cost, hw_inuse); + + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && @@ -1878,7 +2621,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1887,14 +2630,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ - spin_lock_irqsave(&iocg->waitq.lock, flags); + spin_lock_irqsave(&ioc->lock, flags); + spin_lock(&iocg->waitq.lock); + if (likely(!list_empty(&iocg->active_list))) { - iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now); + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); } - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) @@ -1908,6 +2657,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; @@ -1931,13 +2681,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); + ccs = get_cpu_ptr(ioc->pcpu_stat); + if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + local_inc(&ccs->missed[rw].nr_met); else - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + local_inc(&ccs->missed[rw].nr_missed); + + local64_add(rq_wait_ns, &ccs->rq_wait_ns); - this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); + put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) @@ -1977,7 +2731,7 @@ static int blk_iocost_init(struct request_queue *q) { struct ioc *ioc; struct rq_qos *rqos; - int ret; + int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) @@ -1989,6 +2743,16 @@ static int blk_iocost_init(struct request_queue *q) return -ENOMEM; } + for_each_possible_cpu(cpu) { + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); + + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { + local_set(&ccs->missed[i].nr_met, 0); + local_set(&ccs->missed[i].nr_missed, 0); + } + local64_set(&ccs->rq_wait_ns, 0); + } + rqos = &ioc->rqos; rqos->id = RQ_QOS_COST; rqos->ops = &ioc_rqos_ops; @@ -1999,6 +2763,7 @@ static int blk_iocost_init(struct request_queue *q) INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; + ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(ktime_get()); @@ -2029,7 +2794,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) if (!iocc) return NULL; - iocc->dfl_weight = CGROUP_WEIGHT_DFL; + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } @@ -2048,6 +2813,12 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, if (!iocg) return NULL; + iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); + if (!iocg->pcpu_stat) { + kfree(iocg); + return NULL; + } + return &iocg->pd; } @@ -2067,14 +2838,14 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); - iocg->hweight_active = HWEIGHT_WHOLE; - iocg->hweight_inuse = HWEIGHT_WHOLE; + INIT_LIST_HEAD(&iocg->walk_list); + INIT_LIST_HEAD(&iocg->surplus_list); + iocg->hweight_active = WEIGHT_ONE; + iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; - hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->delay_timer.function = iocg_delay_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; @@ -2084,7 +2855,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) } spin_lock_irqsave(&ioc->lock, flags); - weight_updated(iocg); + weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } @@ -2096,18 +2867,56 @@ static void ioc_pd_free(struct blkg_policy_data *pd) if (ioc) { spin_lock_irqsave(&ioc->lock, flags); + if (!list_empty(&iocg->active_list)) { - propagate_active_weight(iocg, 0, 0); + struct ioc_now now; + + ioc_now(ioc, &now); + propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); - hrtimer_cancel(&iocg->delay_timer); } + free_percpu(iocg->pcpu_stat); kfree(iocg); } +static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + size_t pos = 0; + + if (!ioc->enabled) + return 0; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( + ioc->vtime_base_rate * 10000, + VTIME_PER_USEC); + pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", + vp10k / 100, vp10k % 100); + } + + pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", + iocg->last_stat.usage_us); + + if (blkcg_debug_stats) + pos += scnprintf(buf + pos, size - pos, + " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + + return pos; +} + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -2115,7 +2924,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) - seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } @@ -2125,7 +2934,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); - seq_printf(sf, "default %u\n", iocc->dfl_weight); + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; @@ -2137,6 +2946,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; + struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; @@ -2151,13 +2961,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return -EINVAL; spin_lock(&blkcg->lock); - iocc->dfl_weight = v; + iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { spin_lock_irq(&iocg->ioc->lock); - weight_updated(iocg); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock_irq(&iocg->ioc->lock); } } @@ -2182,8 +2993,9 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, } spin_lock(&iocg->ioc->lock); - iocg->cfg_weight = v; - weight_updated(iocg); + iocg->cfg_weight = v * WEIGHT_ONE; + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_finish(&ctx); @@ -2521,6 +3333,7 @@ static struct blkcg_policy blkcg_policy_iocost = { .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, + .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) diff --git a/block/blk-map.c b/block/blk-map.c index 6e804892d5ec..21630dccac62 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,7 +12,8 @@ #include "blk.h" struct bio_map_data { - int is_our_pages; + bool is_our_pages : 1; + bool is_null_mapped : 1; struct iov_iter iter; struct iovec iov[]; }; @@ -108,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret = 0; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + if (!bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free @@ -126,24 +127,12 @@ static int bio_uncopy_user(struct bio *bio) return ret; } -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -static struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask) +static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, + struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; - struct bio *bio; + struct bio *bio, *bounce_bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; @@ -151,14 +140,15 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bmd = bio_alloc_map_data(iter, gfp_mask); if (!bmd) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* * We need to do a deep copy of the iov_iter including the iovecs. * The caller provided iov might point to an on-stack or otherwise * shortlived one. */ - bmd->is_our_pages = map_data ? 0 : 1; + bmd->is_our_pages = !map_data; + bmd->is_null_mapped = (map_data && map_data->null_mapped); nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (nr_pages > BIO_MAX_PAGES) @@ -168,8 +158,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; - - ret = 0; + bio->bi_opf |= req_op(rq); if (map_data) { nr_pages = 1 << map_data->page_order; @@ -186,7 +175,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, if (map_data) { if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; - break; + goto cleanup; } page = map_data->pages[i / nr_pages]; @@ -194,14 +183,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, i++; } else { - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(rq->q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; - break; + goto cleanup; } } - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -211,9 +200,6 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, offset = 0; } - if (ret) - goto cleanup; - if (map_data) map_data->offset += bio->bi_iter.bi_size; @@ -233,41 +219,42 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, } bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; + + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto cleanup; + + /* + * We link the bounce buffer in and could have to traverse it later, so + * we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; cleanup: if (!map_data) bio_free_pages(bio); bio_put(bio); out_bmd: kfree(bmd); - return ERR_PTR(ret); + return ret; } -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -static struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, gfp_t gfp_mask) +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) { - unsigned int max_sectors = queue_max_hw_sectors(q); - int j; - struct bio *bio; + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + struct bio *bio, *bounce_bio; int ret; + int j; if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); + return -EINVAL; bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); if (!bio) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + bio->bi_opf |= req_op(rq); while (iov_iter_count(iter)) { struct page **pages; @@ -283,7 +270,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & queue_dma_alignment(q))) { + if (unlikely(offs & queue_dma_alignment(rq->q))) { ret = -EINVAL; j = 0; } else { @@ -295,7 +282,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, if (n > bytes) n = bytes; - if (!bio_add_hw_page(q, bio, page, n, offs, + if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) { if (same_page) put_page(page); @@ -319,21 +306,31 @@ static struct bio *bio_map_user_iov(struct request_queue *q, break; } - bio_set_flag(bio, BIO_USER_MAPPED); - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it + * Subtle: if we end up needing to bounce a bio, it would normally + * disappear when its bi_end_io is run. However, we need the original + * bio for the unmap, so grab an extra reference to it */ bio_get(bio); - return bio; + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto out_put_orig; + + /* + * We link the bounce buffer in and could have to traverse it + * later, so we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; + + out_put_orig: + bio_put(bio); out_unmap: bio_release_pages(bio, false); bio_put(bio); - return ERR_PTR(ret); + return ret; } /** @@ -557,55 +554,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) } EXPORT_SYMBOL(blk_rq_append_bio); -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - -static int __blk_rq_map_user_iov(struct request *rq, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask, bool copy) -{ - struct request_queue *q = rq->q; - struct bio *bio, *orig_bio; - int ret; - - if (copy) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); - - orig_bio = bio; - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - ret = blk_rq_append_bio(rq, &bio); - if (ret) { - __blk_rq_unmap_user(orig_bio); - return ret; - } - bio_get(bio); - - return 0; -} - /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted @@ -649,7 +597,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, i = *iter; do { - ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (copy) + ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); + else + ret = bio_map_user_iov(rq, &i, gfp_mask); if (ret) goto unmap_rq; if (!bio) @@ -700,9 +651,13 @@ int blk_rq_unmap_user(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_BOUNCED))) mapped_bio = bio->bi_private; - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; + if (bio->bi_private) { + ret2 = bio_uncopy_user(mapped_bio); + if (ret2 && !ret) + ret = ret2; + } else { + bio_unmap_user(mapped_bio); + } mapped_bio = bio; bio = bio->bi_next; diff --git a/block/blk-merge.c b/block/blk-merge.c index f685d633bcc9..6ed715835d45 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -11,6 +11,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-rq-qos.h" static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) @@ -895,3 +896,203 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) return ELEVATOR_FRONT_MERGE; return ELEVATOR_NO_MERGE; } + +static void blk_account_io_merge_bio(struct request *req) +{ + if (!blk_do_io_stat(req)) + return; + + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); +} + +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_back_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_backmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_free_ctx(bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_front_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_frontmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + bio->bi_next = req->bio; + req->bio = bio; + + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_do_front_merge(req, bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, + struct bio *bio) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + rq_qos_merge(q, req, bio); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->nr_phys_segments = segments + 1; + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +no_merge: + req_set_nomerge(q, req); + return BIO_MERGE_FAILED; +} + +static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, + struct request *rq, + struct bio *bio, + unsigned int nr_segs, + bool sched_allow_merge) +{ + if (!blk_rq_merge_ok(rq, bio)) + return BIO_MERGE_NONE; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_back_merge(rq, bio, nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_front_merge(rq, bio, nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); + default: + return BIO_MERGE_NONE; + } + + return BIO_MERGE_FAILED; +} + +/** + * blk_attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @nr_segs: number of segments in @bio + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. + */ +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **same_queue_rq) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + + plug = blk_mq_plug(q, bio); + if (!plug) + return false; + + plug_list = &plug->mq_list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { + if (rq->q == q && same_queue_rq) { + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + *same_queue_rq = rq; + } + + if (rq->q != q) + continue; + + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + } + + return false; +} + +/* + * Iterate list of requests and see if we can merge this bio with any + * of them. + */ +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, list, queuelist) { + if (!checked--) + break; + + switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { + case BIO_MERGE_NONE: + continue; + case BIO_MERGE_OK: + return true; + case BIO_MERGE_FAILED: + return false; + } + + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_bio_list_merge); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3f09bcb8a6fd..3094542e12ae 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -116,6 +116,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), + QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), @@ -240,7 +241,7 @@ static const char *const alloc_policy_name[] = { #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(SHOULD_MERGE), - HCTX_FLAG_NAME(TAG_SHARED), + HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(STACKING), @@ -452,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, atomic_read(&tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); - sbitmap_queue_show(&tags->bitmap_tags, m); + sbitmap_queue_show(tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); - sbitmap_queue_show(&tags->breserved_tags, m); + sbitmap_queue_show(tags->breserved_tags, m); } } @@ -487,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->tags) - sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); + sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); mutex_unlock(&q->sysfs_lock); out: @@ -521,7 +522,7 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->sched_tags) - sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); mutex_unlock(&q->sysfs_lock); out: diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d2790e5b06d1..3e9596738852 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,21 +18,6 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_free_hctx_data(struct request_queue *q, - void (*exit)(struct blk_mq_hw_ctx *)) -{ - struct blk_mq_hw_ctx *hctx; - int i; - - queue_for_each_hw_ctx(q, hctx, i) { - if (exit && hctx->sched_data) - exit(hctx); - kfree(hctx->sched_data); - hctx->sched_data = NULL; - } -} -EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); - void blk_mq_sched_assign_ioc(struct request *rq) { struct request_queue *q = rq->q; @@ -368,7 +353,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_BACK_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_back_merge(rq, bio, nr_segs)) + if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_back_merge(q, rq); if (!*merged_request) @@ -377,86 +362,20 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, case ELEVATOR_FRONT_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; - if (!bio_attempt_front_merge(rq, bio, nr_segs)) + if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK) return false; *merged_request = attempt_front_merge(q, rq); if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; case ELEVATOR_DISCARD_MERGE: - return bio_attempt_discard_merge(q, rq, bio); + return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; default: return false; } } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); -/* - * Iterate list of requests and see if we can merge this bio with any - * of them. - */ -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs) -{ - struct request *rq; - int checked = 8; - - list_for_each_entry_reverse(rq, list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } - - return merged; - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); - -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. - */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct bio *bio, - unsigned int nr_segs) -{ - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { - ctx->rq_merged++; - return true; - } - - return false; -} - bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -470,14 +389,24 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return e->type->ops.bio_merge(hctx, bio, nr_segs); type = hctx->type; - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_lists[type])) { - /* default per sw-queue merge */ - spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); - spin_unlock(&ctx->lock); + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || + list_empty_careful(&ctx->rq_lists[type])) + return false; + + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + /* + * Reverse check our software queue for entries that we could + * potentially merge with. Currently includes a hand-wavy stop + * count of 8, to not spend too much time checking for merges. + */ + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { + ctx->rq_merged++; + ret = true; } + spin_unlock(&ctx->lock); + return ret; } @@ -531,7 +460,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, goto run; } - WARN_ON(e && (rq->tag != -1)); + WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { /* @@ -616,9 +545,11 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + if (hctx->sched_tags) { blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags); + blk_mq_free_rq_map(hctx->sched_tags, flags); hctx->sched_tags = NULL; } } @@ -628,10 +559,12 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; + /* Clear HCTX_SHARED so tags are init'ed */ + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags); + set->reserved_tags, flags); if (!hctx->sched_tags) return -ENOMEM; @@ -649,8 +582,11 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { + /* Clear HCTX_SHARED so tags are freed */ + unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags); + blk_mq_free_rq_map(hctx->sched_tags, flags); hctx->sched_tags = NULL; } } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index e81ca1bf6e10..0476360f05f1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,9 +5,6 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -void blk_mq_sched_free_hctx_data(struct request_queue *q, - void (*exit)(struct blk_mq_hw_ctx *)); - void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 32d82e23b095..aacf10decdbd 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -23,9 +23,18 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && + !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + atomic_inc(&set->active_queues_shared_sbitmap); + } else { + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + } return true; } @@ -35,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - sbitmap_queue_wake_all(&tags->bitmap_tags); + sbitmap_queue_wake_all(tags->bitmap_tags); if (include_reserve) - sbitmap_queue_wake_all(&tags->breserved_tags); + sbitmap_queue_wake_all(tags->breserved_tags); } /* @@ -47,11 +56,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; - - if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return; - - atomic_dec(&tags->active_queues); + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, + &q->queue_flags)) + return; + atomic_dec(&set->active_queues_shared_sbitmap); + } else { + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + atomic_dec(&tags->active_queues); + } blk_mq_tag_wakeup_all(tags, false); } @@ -59,7 +76,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { - if (!data->q->elevator && !hctx_may_queue(data->hctx, bt)) + if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && + !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) @@ -82,10 +100,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } - bt = &tags->breserved_tags; + bt = tags->breserved_tags; tag_offset = 0; } else { - bt = &tags->bitmap_tags; + bt = tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } @@ -131,9 +149,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) - bt = &tags->breserved_tags; + bt = tags->breserved_tags; else - bt = &tags->bitmap_tags; + bt = tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on @@ -167,10 +185,10 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); + sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); + sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); } } @@ -197,7 +215,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (rq && rq->q == hctx->queue) + if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx) return iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -298,9 +316,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, + bt_tags_for_each(tags, tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); - bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); + bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); } /** @@ -416,8 +434,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(hctx, tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); } blk_queue_exit(q); } @@ -429,30 +447,64 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) +static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) - goto free_tags; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, - node)) + if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, + round_robin, node)) goto free_bitmap_tags; - return tags; + tags->bitmap_tags = &tags->__bitmap_tags; + tags->breserved_tags = &tags->__breserved_tags; + + return 0; free_bitmap_tags: - sbitmap_queue_free(&tags->bitmap_tags); -free_tags: - kfree(tags); - return NULL; + sbitmap_queue_free(&tags->__bitmap_tags); + return -ENOMEM; +} + +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +{ + unsigned int depth = set->queue_depth - set->reserved_tags; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + int i, node = set->numa_node; + + if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(&set->__breserved_tags, set->reserved_tags, + round_robin, node)) + goto free_bitmap_tags; + + for (i = 0; i < set->nr_hw_queues; i++) { + struct blk_mq_tags *tags = set->tags[i]; + + tags->bitmap_tags = &set->__bitmap_tags; + tags->breserved_tags = &set->__breserved_tags; + } + + return 0; +free_bitmap_tags: + sbitmap_queue_free(&set->__bitmap_tags); + return -ENOMEM; +} + +void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) +{ + sbitmap_queue_free(&set->__bitmap_tags); + sbitmap_queue_free(&set->__breserved_tags); } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, - int node, int alloc_policy) + int node, unsigned int flags) { + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -467,13 +519,22 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - return blk_mq_init_bitmap_tags(tags, node, alloc_policy); + if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + return tags; + + if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { + kfree(tags); + return NULL; + } + return tags; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { - sbitmap_queue_free(&tags->bitmap_tags); - sbitmap_queue_free(&tags->breserved_tags); + if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + sbitmap_queue_free(tags->bitmap_tags); + sbitmap_queue_free(tags->breserved_tags); + } kfree(tags); } @@ -492,6 +553,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; + /* Only sched tags can grow, so clear HCTX_SHARED flag */ + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; @@ -506,30 +569,35 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags); + tags->nr_reserved_tags, flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new); + blk_mq_free_rq_map(new, flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr); + blk_mq_free_rq_map(*tagsptr, flags); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(&tags->bitmap_tags, + sbitmap_queue_resize(tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } +void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) +{ + sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); +} + /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index b1acac518c4e..7d3e6b333a4a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -2,8 +2,6 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -#include "blk-mq.h" - /* * Tag address space map. */ @@ -13,17 +11,25 @@ struct blk_mq_tags { atomic_t active_queues; - struct sbitmap_queue bitmap_tags; - struct sbitmap_queue breserved_tags; + struct sbitmap_queue *bitmap_tags; + struct sbitmap_queue *breserved_tags; + + struct sbitmap_queue __bitmap_tags; + struct sbitmap_queue __breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; }; +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, + unsigned int reserved_tags, + int node, unsigned int flags); +extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); -extern void blk_mq_free_tags(struct blk_mq_tags *tags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, + unsigned int flags); +extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, @@ -31,6 +37,9 @@ extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); +extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, + unsigned int size); + extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); @@ -56,7 +65,7 @@ extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return false; return __blk_mq_tag_busy(hctx); @@ -64,43 +73,12 @@ static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return; __blk_mq_tag_idle(hctx); } -/* - * For shared tag users, we track the number of currently active users - * and attempt to provide a fair share of the tag depth for each of them. - */ -static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct sbitmap_queue *bt) -{ - unsigned int depth, users; - - if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return true; - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return true; - - /* - * Don't try dividing an ant - */ - if (bt->sb.depth == 1) - return true; - - users = atomic_read(&hctx->tags->active_queues); - if (!users) - return true; - - /* - * Allow at least some tags - */ - depth = max((bt->sb.depth + users - 1) / users, 4U); - return atomic_read(&hctx->nr_active) < depth; -} - static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index b3d2785eefe9..e04b759add75 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -519,7 +519,7 @@ void blk_mq_free_request(struct request *rq) ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) - atomic_dec(&hctx->nr_active); + __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); @@ -1096,19 +1096,20 @@ static inline unsigned int queued_to_index(unsigned int queued) static bool __blk_mq_get_driver_tag(struct request *rq) { - struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; + struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = &rq->mq_hctx->tags->breserved_tags; + bt = rq->mq_hctx->tags->breserved_tags; tag_offset = 0; + } else { + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; } - if (!hctx_may_queue(rq->mq_hctx, bt)) - return false; tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; @@ -1124,10 +1125,10 @@ static bool blk_mq_get_driver_tag(struct request *rq) if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) return false; - if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && + if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(rq->rq_flags & RQF_MQ_INFLIGHT)) { rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&hctx->nr_active); + __blk_mq_inc_active_requests(hctx); } hctx->tags->rqs[rq->tag] = rq; return true; @@ -1145,7 +1146,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, struct sbitmap_queue *sbq; list_del_init(&wait->entry); - sbq = &hctx->tags->bitmap_tags; + sbq = hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); @@ -1163,12 +1164,12 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { blk_mq_sched_mark_restart_hctx(hctx); /* @@ -1420,7 +1421,7 @@ out: bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && - (hctx->flags & BLK_MQ_F_TAG_SHARED); + (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; blk_mq_release_budgets(q, nr_budgets); @@ -2296,20 +2297,21 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); } struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, - unsigned int reserved_tags) + unsigned int reserved_tags, + unsigned int flags) { struct blk_mq_tags *tags; int node; @@ -2318,8 +2320,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); if (!tags) return NULL; @@ -2327,7 +2328,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); return NULL; } @@ -2336,7 +2337,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags); + blk_mq_free_tags(tags, flags); return NULL; } @@ -2660,6 +2661,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); + atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; @@ -2668,7 +2670,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; - hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); @@ -2745,10 +2747,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, int hctx_idx) { + unsigned int flags = set->flags; int ret = 0; set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags); + set->queue_depth, set->reserved_tags, flags); if (!set->tags[hctx_idx]) return false; @@ -2757,7 +2760,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, if (!ret) return true; - blk_mq_free_rq_map(set->tags[hctx_idx]); + blk_mq_free_rq_map(set->tags[hctx_idx], flags); set->tags[hctx_idx] = NULL; return false; } @@ -2765,9 +2768,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { + unsigned int flags = set->flags; + if (set->tags && set->tags[hctx_idx]) { blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx]); + blk_mq_free_rq_map(set->tags[hctx_idx], flags); set->tags[hctx_idx] = NULL; } } @@ -2885,14 +2890,14 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) queue_for_each_hw_ctx(q, hctx, i) { if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; + hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, - bool shared) +static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, + bool shared) { struct request_queue *q; @@ -2913,9 +2918,9 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ - set->flags &= ~BLK_MQ_F_TAG_SHARED; + set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, false); + blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); @@ -2930,12 +2935,12 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && - !(set->flags & BLK_MQ_F_TAG_SHARED)) { - set->flags |= BLK_MQ_F_TAG_SHARED; + !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, true); + blk_mq_update_tag_set_shared(set, true); } - if (set->flags & BLK_MQ_F_TAG_SHARED) + if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); @@ -3438,11 +3443,23 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map; + if (blk_mq_is_sbitmap_shared(set->flags)) { + atomic_set(&set->active_queues_shared_sbitmap, 0); + + if (blk_mq_init_shared_sbitmap(set, set->flags)) { + ret = -ENOMEM; + goto out_free_mq_rq_maps; + } + } + mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; +out_free_mq_rq_maps: + for (i = 0; i < set->nr_hw_queues; i++) + blk_mq_free_map_and_requests(set, i); out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3461,6 +3478,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); + if (blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_exit_shared_sbitmap(set); + for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; @@ -3497,6 +3517,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); + if (!ret && blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_tag_resize_shared_sbitmap(set, nr); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); diff --git a/block/blk-mq.h b/block/blk-mq.h index 863a2f3346d4..a52703c98b77 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -53,11 +53,12 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags); +void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, - unsigned int reserved_tags); + unsigned int reserved_tags, + unsigned int flags); int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth); @@ -158,6 +159,11 @@ struct blk_mq_alloc_data { struct blk_mq_hw_ctx *hctx; }; +static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) +{ + return flags & BLK_MQ_F_TAG_HCTX_SHARED; +} + static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->q->elevator) @@ -193,6 +199,28 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) return true; } +static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); + else + atomic_inc(&hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap); + else + atomic_dec(&hctx->nr_active); +} + +static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); + return atomic_read(&hctx->nr_active); +} static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { @@ -201,7 +229,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, if (rq->rq_flags & RQF_MQ_INFLIGHT) { rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); + __blk_mq_dec_active_requests(hctx); } } @@ -253,4 +281,46 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct sbitmap_queue *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->sb.depth == 1) + return true; + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags)) + return true; + users = atomic_read(&set->active_queues_shared_sbitmap); + } else { + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + users = atomic_read(&hctx->tags->active_queues); + } + + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->sb.depth + users - 1) / users, 4U); + return __blk_mq_active_requests(hctx) < depth; +} + + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 76a7e03bcd6c..4f6eb4bb1723 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -172,15 +172,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); * * Description: * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the chunk size - * must currently be a power-of-2 in sectors. Also note that the block - * layer must accept a page worth of data at any offset. So if the - * crossing of chunks is a hard limitation in the driver, it must still be - * prepared to split single page bios. + * this limit and prevent merging across chunks. Note that the block layer + * must accept a page worth of data at any offset. So if the crossing of + * chunks is a hard limitation in the driver, it must still be prepared + * to split single page bios. **/ void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { - BUG_ON(!is_power_of_2(chunk_sectors)); q->limits.chunk_sectors = chunk_sectors; } EXPORT_SYMBOL(blk_queue_chunk_sectors); @@ -374,6 +372,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); +void blk_queue_update_readahead(struct request_queue *q) +{ + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + q->backing_dev_info->io_pages = + queue_max_sectors(q) >> (PAGE_SHIFT - 9); +} +EXPORT_SYMBOL_GPL(blk_queue_update_readahead); + /** * blk_limits_io_min - set minimum request size for a device * @limits: the queue limits @@ -452,6 +463,8 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); @@ -534,6 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); + t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors); /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { @@ -556,6 +570,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + /* chunk_sectors a multiple of the physical block size? */ + if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + t->chunk_sectors = 0; + t->misaligned = 1; + ret = -1; + } + t->raid_partial_stripes_expensive = max(t->raid_partial_stripes_expensive, b->raid_partial_stripes_expensive); @@ -594,10 +615,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } - if (b->chunk_sectors) - t->chunk_sectors = min_not_zero(t->chunk_sectors, - b->chunk_sectors); - t->zoned = max(t->zoned, b->zoned); return ret; } @@ -629,8 +646,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, top, bottom); } - t->backing_dev_info->io_pages = - t->limits.max_sectors >> (PAGE_SHIFT - 9); + blk_queue_update_readahead(disk->queue); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7dda709f3ccb..76b54c7750b0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -260,14 +260,14 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ -queue_show_##name(struct request_queue *q, char *page) \ +queue_##name##_show(struct request_queue *q, char *page) \ { \ int bit; \ bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ return queue_var_show(neg ? !bit : bit, page); \ } \ static ssize_t \ -queue_store_##name(struct request_queue *q, const char *page, size_t count) \ +queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ { \ unsigned long val; \ ssize_t ret; \ @@ -287,6 +287,7 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); +QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); #undef QUEUE_SYSFS_BIT_FNS static ssize_t queue_zoned_show(struct request_queue *q, char *page) @@ -547,218 +548,73 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } -static struct queue_sysfs_entry queue_requests_entry = { - .attr = {.name = "nr_requests", .mode = 0644 }, - .show = queue_requests_show, - .store = queue_requests_store, -}; - -static struct queue_sysfs_entry queue_ra_entry = { - .attr = {.name = "read_ahead_kb", .mode = 0644 }, - .show = queue_ra_show, - .store = queue_ra_store, -}; - -static struct queue_sysfs_entry queue_max_sectors_entry = { - .attr = {.name = "max_sectors_kb", .mode = 0644 }, - .show = queue_max_sectors_show, - .store = queue_max_sectors_store, -}; - -static struct queue_sysfs_entry queue_max_hw_sectors_entry = { - .attr = {.name = "max_hw_sectors_kb", .mode = 0444 }, - .show = queue_max_hw_sectors_show, -}; - -static struct queue_sysfs_entry queue_max_segments_entry = { - .attr = {.name = "max_segments", .mode = 0444 }, - .show = queue_max_segments_show, -}; - -static struct queue_sysfs_entry queue_max_discard_segments_entry = { - .attr = {.name = "max_discard_segments", .mode = 0444 }, - .show = queue_max_discard_segments_show, -}; - -static struct queue_sysfs_entry queue_max_integrity_segments_entry = { - .attr = {.name = "max_integrity_segments", .mode = 0444 }, - .show = queue_max_integrity_segments_show, -}; - -static struct queue_sysfs_entry queue_max_segment_size_entry = { - .attr = {.name = "max_segment_size", .mode = 0444 }, - .show = queue_max_segment_size_show, -}; +#define QUEUE_RO_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show = _prefix##_show, \ +}; + +#define QUEUE_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store = _prefix##_store, \ +}; + +QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); +QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); + +QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); + +QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); +QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); + +QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); +QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); + +QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); +QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); + +QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); +QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); +QUEUE_RW_ENTRY(queue_poll, "io_poll"); +QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); +QUEUE_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_RO_ENTRY(queue_fua, "fua"); +QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); -static struct queue_sysfs_entry queue_iosched_entry = { - .attr = {.name = "scheduler", .mode = 0644 }, - .show = elv_iosched_show, - .store = elv_iosched_store, -}; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); +#endif +/* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show = queue_logical_block_size_show, }; -static struct queue_sysfs_entry queue_logical_block_size_entry = { - .attr = {.name = "logical_block_size", .mode = 0444 }, - .show = queue_logical_block_size_show, -}; - -static struct queue_sysfs_entry queue_physical_block_size_entry = { - .attr = {.name = "physical_block_size", .mode = 0444 }, - .show = queue_physical_block_size_show, -}; - -static struct queue_sysfs_entry queue_chunk_sectors_entry = { - .attr = {.name = "chunk_sectors", .mode = 0444 }, - .show = queue_chunk_sectors_show, -}; - -static struct queue_sysfs_entry queue_io_min_entry = { - .attr = {.name = "minimum_io_size", .mode = 0444 }, - .show = queue_io_min_show, -}; - -static struct queue_sysfs_entry queue_io_opt_entry = { - .attr = {.name = "optimal_io_size", .mode = 0444 }, - .show = queue_io_opt_show, -}; - -static struct queue_sysfs_entry queue_discard_granularity_entry = { - .attr = {.name = "discard_granularity", .mode = 0444 }, - .show = queue_discard_granularity_show, -}; - -static struct queue_sysfs_entry queue_discard_max_hw_entry = { - .attr = {.name = "discard_max_hw_bytes", .mode = 0444 }, - .show = queue_discard_max_hw_show, -}; - -static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = 0644 }, - .show = queue_discard_max_show, - .store = queue_discard_max_store, -}; - -static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { - .attr = {.name = "discard_zeroes_data", .mode = 0444 }, - .show = queue_discard_zeroes_data_show, -}; - -static struct queue_sysfs_entry queue_write_same_max_entry = { - .attr = {.name = "write_same_max_bytes", .mode = 0444 }, - .show = queue_write_same_max_show, -}; - -static struct queue_sysfs_entry queue_write_zeroes_max_entry = { - .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 }, - .show = queue_write_zeroes_max_show, -}; - -static struct queue_sysfs_entry queue_zone_append_max_entry = { - .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, - .show = queue_zone_append_max_show, -}; - -static struct queue_sysfs_entry queue_nonrot_entry = { - .attr = {.name = "rotational", .mode = 0644 }, - .show = queue_show_nonrot, - .store = queue_store_nonrot, -}; - -static struct queue_sysfs_entry queue_zoned_entry = { - .attr = {.name = "zoned", .mode = 0444 }, - .show = queue_zoned_show, -}; - -static struct queue_sysfs_entry queue_nr_zones_entry = { - .attr = {.name = "nr_zones", .mode = 0444 }, - .show = queue_nr_zones_show, -}; - -static struct queue_sysfs_entry queue_max_open_zones_entry = { - .attr = {.name = "max_open_zones", .mode = 0444 }, - .show = queue_max_open_zones_show, -}; - -static struct queue_sysfs_entry queue_max_active_zones_entry = { - .attr = {.name = "max_active_zones", .mode = 0444 }, - .show = queue_max_active_zones_show, -}; - -static struct queue_sysfs_entry queue_nomerges_entry = { - .attr = {.name = "nomerges", .mode = 0644 }, - .show = queue_nomerges_show, - .store = queue_nomerges_store, -}; - -static struct queue_sysfs_entry queue_rq_affinity_entry = { - .attr = {.name = "rq_affinity", .mode = 0644 }, - .show = queue_rq_affinity_show, - .store = queue_rq_affinity_store, -}; - -static struct queue_sysfs_entry queue_iostats_entry = { - .attr = {.name = "iostats", .mode = 0644 }, - .show = queue_show_iostats, - .store = queue_store_iostats, -}; - -static struct queue_sysfs_entry queue_random_entry = { - .attr = {.name = "add_random", .mode = 0644 }, - .show = queue_show_random, - .store = queue_store_random, -}; - -static struct queue_sysfs_entry queue_poll_entry = { - .attr = {.name = "io_poll", .mode = 0644 }, - .show = queue_poll_show, - .store = queue_poll_store, -}; - -static struct queue_sysfs_entry queue_poll_delay_entry = { - .attr = {.name = "io_poll_delay", .mode = 0644 }, - .show = queue_poll_delay_show, - .store = queue_poll_delay_store, -}; - -static struct queue_sysfs_entry queue_wc_entry = { - .attr = {.name = "write_cache", .mode = 0644 }, - .show = queue_wc_show, - .store = queue_wc_store, -}; - -static struct queue_sysfs_entry queue_fua_entry = { - .attr = {.name = "fua", .mode = 0444 }, - .show = queue_fua_show, -}; - -static struct queue_sysfs_entry queue_dax_entry = { - .attr = {.name = "dax", .mode = 0444 }, - .show = queue_dax_show, -}; - -static struct queue_sysfs_entry queue_io_timeout_entry = { - .attr = {.name = "io_timeout", .mode = 0644 }, - .show = queue_io_timeout_show, - .store = queue_io_timeout_store, -}; - -static struct queue_sysfs_entry queue_wb_lat_entry = { - .attr = {.name = "wbt_lat_usec", .mode = 0644 }, - .show = queue_wb_lat_show, - .store = queue_wb_lat_store, -}; - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static struct queue_sysfs_entry throtl_sample_time_entry = { - .attr = {.name = "throttle_sample_time", .mode = 0644 }, - .show = blk_throtl_sample_time_show, - .store = blk_throtl_sample_time_store, -}; -#endif +QUEUE_RW_ENTRY(queue_nonrot, "rotational"); +QUEUE_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_RW_ENTRY(queue_random, "add_random"); +QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); static struct attribute *queue_attrs[] = { &queue_requests_entry.attr, @@ -769,7 +625,7 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, - &queue_iosched_entry.attr, + &elv_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -791,6 +647,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, + &queue_stable_writes_entry.attr, &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, @@ -800,7 +657,7 @@ static struct attribute *queue_attrs[] = { &queue_poll_delay_entry.attr, &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &throtl_sample_time_entry.attr, + &blk_throtl_sample_time_entry.attr, #endif NULL, }; @@ -1000,6 +857,8 @@ int blk_register_queue(struct gendisk *disk) percpu_ref_switch_to_percpu(&q->q_usage_counter); } + blk_queue_update_readahead(q); + ret = blk_trace_init_sysfs(dev); if (ret) return ret; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index fee3325edf27..36ba61c5cdbd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -15,10 +15,10 @@ #include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ -static int throtl_grp_quantum = 8; +#define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ -static int throtl_quantum = 32; +#define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) @@ -150,7 +150,7 @@ struct throtl_grp { /* user configured IOPS limits */ unsigned int iops_conf[2][LIMIT_CNT]; - /* Number of bytes disptached in current slice */ + /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; @@ -852,7 +852,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high - * slice_end, but later limit was bumped up and bio was dispached + * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ @@ -894,13 +894,19 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) } static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) + u32 iops_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ @@ -913,7 +919,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, * have been trimmed. */ - tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; + tmp = (u64)iops_limit * jiffy_elapsed_rnd; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -936,13 +942,19 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, } static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) + u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); + if (bps_limit == U64_MAX) { + if (wait) + *wait = 0; + return true; + } + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ @@ -951,7 +963,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; + tmp = bps_limit * jiffy_elapsed_rnd; do_div(tmp, HZ); bytes_allowed = tmp; @@ -963,7 +975,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); + jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; @@ -987,6 +999,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio @@ -998,8 +1012,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (tg_bps_limit(tg, rw) == U64_MAX && - tg_iops_limit(tg, rw) == UINT_MAX) { + if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { if (wait) *wait = 0; return true; @@ -1021,8 +1034,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (tg_with_in_bps_limit(tg, bio, &bps_wait) && - tg_with_in_iops_limit(tg, bio, &iops_wait)) { + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && + tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) *wait = 0; return true; @@ -1082,7 +1095,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically - * cleaered on the next tg_update_disptime(). + * cleared on the next tg_update_disptime(). */ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; @@ -1175,8 +1188,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; - unsigned int max_nr_reads = throtl_grp_quantum*3/4; - unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; + unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; + unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ @@ -1226,7 +1239,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) if (sq->nr_queued[0] || sq->nr_queued[1]) tg_update_disptime(tg); - if (nr_disp >= throtl_quantum) + if (nr_disp >= THROTL_QUANTUM) break; } @@ -1303,7 +1316,7 @@ again: } } } else { - /* reached the top-level, queue issueing */ + /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: @@ -1314,8 +1327,8 @@ out_unlock: * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * - * This function is queued for execution when bio's reach the bio_lists[] - * of throtl_data->service_queue. Those bio's are ready and issued by this + * This function is queued for execution when bios reach the bio_lists[] + * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) @@ -1428,8 +1441,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ - throtl_start_new_slice(tg, 0); - throtl_start_new_slice(tg, 1); + throtl_start_new_slice(tg, READ); + throtl_start_new_slice(tg, WRITE); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -2230,7 +2243,7 @@ again: /* * @bio passed through this layer without being throttled. - * Climb up the ladder. If we''re already at the top, it + * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; diff --git a/block/blk.h b/block/blk.h index 49e2928a1632..c08762e10b04 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,12 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; }; +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -169,14 +175,19 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio); +enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs); +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, + unsigned int nr_segs); +enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, + struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **same_queue_rq); +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs); void blk_account_io_start(struct request *req); void blk_account_io_done(struct request *req, u64 now); @@ -350,7 +361,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct gendisk *disk, struct hd_struct *part); +void delete_partition(struct hd_struct *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index d185396d88bb..330fede77271 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -207,7 +207,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) BUG_ON(!req->nr_phys_segments); - buf->sg_list = kzalloc(sz, GFP_KERNEL); + buf->sg_list = kmalloc(sz, GFP_KERNEL); if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); diff --git a/block/genhd.c b/block/genhd.c index 99c64641c314..05fb27cbb667 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -50,14 +50,13 @@ static void disk_release_events(struct gendisk *disk); * zero and will not be set to zero */ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool revalidate) + bool update_bdev) { sector_t capacity = get_capacity(disk); set_capacity(disk, size); - - if (revalidate) - revalidate_disk(disk); + if (update_bdev) + revalidate_disk_size(disk, true); if (capacity != size && capacity != 0 && size != 0) { char *envp[] = { "RESIZE=1", NULL }; @@ -110,8 +109,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static unsigned int part_in_flight(struct hd_struct *part) { unsigned int inflight = 0; int cpu; @@ -126,8 +124,7 @@ static unsigned int part_in_flight(struct request_queue *q, return inflight; } -static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) { int cpu; @@ -676,11 +673,23 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; struct disk_part_iter piter; struct hd_struct *part; int err; @@ -722,25 +731,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, return; } - /* No minors to use for partitions */ - if (!disk_part_scan_enabled(disk)) - goto exit; + disk_scan_partitions(disk); - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ, NULL); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: /* announce disk after possible partitions are created */ dev_set_uevent_suppress(ddev, 0); kobject_uevent(&ddev->kobj, KOBJ_ADD); @@ -913,7 +905,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - delete_partition(disk, part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -1301,7 +1293,7 @@ ssize_t part_stat_show(struct device *dev, if (queue_is_mq(q)) inflight = blk_mq_in_flight(q, p); else - inflight = part_in_flight(q, p); + inflight = part_in_flight(p); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1343,7 +1335,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, if (queue_is_mq(q)) blk_mq_in_flight_rw(q, p, inflight); else - part_in_flight_rw(q, p, inflight); + part_in_flight_rw(p, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1623,7 +1615,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(gp->queue, hd); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1729,45 +1721,48 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); - if (disk) { - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) { - kfree(disk); - return NULL; - } - init_rwsem(&disk->lookup_sem); - disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - kfree(disk); - return NULL; - } - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); + if (!disk) + return NULL; - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. - */ - hd_sects_seq_init(&disk->part0); - if (hd_ref_init(&disk->part0)) { - hd_free_part(&disk->part0); - kfree(disk); - return NULL; - } + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) + goto out_free_disk; - disk->minors = minors; - rand_initialize_disk(disk); - disk_to_dev(disk)->class = &block_class; - disk_to_dev(disk)->type = &disk_type; - device_initialize(disk_to_dev(disk)); + init_rwsem(&disk->lookup_sem); + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_percpu(disk->part0.dkstats); + goto out_free_disk; } + + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); + + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + hd_sects_seq_init(&disk->part0); + if (hd_ref_init(&disk->part0)) + goto out_free_part0; + + disk->minors = minors; + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); return disk; + +out_free_part0: + hd_free_part(&disk->part0); +out_free_disk: + kfree(disk); + return NULL; } EXPORT_SYMBOL(__alloc_disk_node); @@ -2052,7 +2047,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) * CONTEXT: * Might sleep. */ -unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; unsigned int pending; @@ -2090,6 +2085,33 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) return pending; } +/** + * bdev_check_media_change - check if a removable media has been changed + * @bdev: block device to check + * + * Check whether a removable media has been changed, and attempt to free all + * dentries and inodes and invalidates all block device page cache entries in + * that case. + * + * Returns %true if the block device changed, or %false if not. + */ +bool bdev_check_media_change(struct block_device *bdev) +{ + unsigned int events; + + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + bdev->bd_disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return true; +} +EXPORT_SYMBOL(bdev_check_media_change); + /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. diff --git a/block/ioctl.c b/block/ioctl.c index bdb3bbb253d9..06262c28f0c6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -112,8 +112,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, uint64_t range[2]; uint64_t start, len; struct request_queue *q = bdev_get_queue(bdev); - struct address_space *mapping = bdev->bd_inode->i_mapping; - + int err; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -134,7 +133,11 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - truncate_inode_pages_range(mapping, start, start + len - 1); + + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (err) + return err; + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, flags); } @@ -143,8 +146,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - struct address_space *mapping; uint64_t start, end, len; + int err; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -166,8 +169,9 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL; /* Invalidate the page cache, including dirty pages */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + err = truncate_bdev_range(bdev, mode, start, end); + if (err) + return err; return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); @@ -474,15 +478,14 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (get_user(n, argp)) return -EFAULT; - if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; - } + if (mode & FMODE_EXCL) + return set_blocksize(bdev, n); + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev))) + return -EBUSY; ret = set_blocksize(bdev, n); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode | FMODE_EXCL); + return ret; } diff --git a/block/ioprio.c b/block/ioprio.c index 04ebd37966f1..364d2294ba90 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; /* rt has prio field too */ diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a38c5ab103d1..dc89199bc8c6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -359,7 +359,7 @@ static unsigned int kyber_sched_tags_shift(struct request_queue *q) * All of the hardware queues have the same depth, so we can just grab * the shift of the first one. */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; } static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) @@ -502,7 +502,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, + sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, kqd->async_depth); return 0; @@ -573,7 +573,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, bool merged; spin_lock(&kcq->lock); - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); spin_unlock(&kcq->lock); return merged; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b57470e154c8..800ac902809b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -386,6 +386,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); + if (rq) + atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } @@ -533,6 +535,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } @@ -579,6 +582,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + if (!atomic_read(&hctx->elevator_queued)) + return false; + return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); diff --git a/block/partitions/core.c b/block/partitions/core.c index 722406b841df..5cacbac30107 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,14 +199,20 @@ static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); + + return sprintf(buf, "%u\n", + queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, + p->start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); + + return sprintf(buf, "%u\n", + queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, + p->start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -318,8 +324,9 @@ int hd_ref_init(struct hd_struct *part) * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct gendisk *disk, struct hd_struct *part) +void delete_partition(struct hd_struct *part) { + struct gendisk *disk = part_to_disk(part); struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); @@ -327,7 +334,7 @@ void delete_partition(struct gendisk *disk, struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(part_to_disk(part))); + get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -405,10 +412,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); @@ -554,7 +557,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) sync_blockdev(bdevp); invalidate_bdev(bdevp); - delete_partition(bdev->bd_disk, part); + delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); @@ -592,8 +595,8 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, (sector_t)length); - i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT); + part_nr_sects_write(part, length); + bd_set_nr_sectors(bdevp, length); ret = 0; out_unlock: @@ -634,7 +637,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(bdev->bd_disk, part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index ef722f04f88a..4421e61c1af1 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -37,8 +37,6 @@ const unsigned char scsi_command_size_tbl[8] = }; EXPORT_SYMBOL(scsi_command_size_tbl); -#include <scsi/sg.h> - static int sg_get_version(int __user *p) { static const int sg_version_num = 30527; |