diff options
Diffstat (limited to 'block')
36 files changed, 1300 insertions, 962 deletions
diff --git a/block/Kconfig b/block/Kconfig index 41c0917ce622..c23094a14a2b 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -32,6 +32,9 @@ config BLK_RQ_ALLOC_TIME config BLK_SCSI_REQUEST bool +config BLK_CGROUP_RWSTAT + bool + config BLK_DEV_BSG bool "Block layer SG support v4" default y @@ -86,6 +89,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y + select BLK_CGROUP_RWSTAT ---help--- Block layer bio throttling support. It can be used to limit the IO rate to a device. IO rate policies are per cgroup and diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b89310a022ad..7df14133adc8 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -31,6 +31,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + select BLK_CGROUP_RWSTAT ---help--- Enable hierarchical scheduling in BFQ, using the blkio diff --git a/block/Makefile b/block/Makefile index 9ef57ace90d4..205a5f2fef17 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 86a607cf19a1..cea0ae12f937 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -347,6 +347,14 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg) bfqg_put(bfqg); } +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) +{ + struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); + blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); +} + /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { @@ -431,6 +439,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) static void bfqg_stats_exit(struct bfqg_stats *stats) { + blkg_rwstat_exit(&stats->bytes); + blkg_rwstat_exit(&stats->ios); #ifdef CONFIG_BFQ_CGROUP_DEBUG blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); @@ -448,6 +458,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { + if (blkg_rwstat_init(&stats->bytes, gfp) || + blkg_rwstat_init(&stats->ios, gfp)) + return -ENOMEM; + #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || @@ -1057,18 +1071,35 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_device_weight(of, buf, nbytes, off); } -#ifdef CONFIG_BFQ_CGROUP_DEBUG -static int bfqg_print_stat(struct seq_file *sf, void *v) +static int bfqg_print_rwstat(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_bfq, seq_cft(sf)->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); return 0; } -static int bfqg_print_rwstat(struct seq_file *sf, void *v) +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_bfq, seq_cft(sf)->private, true); + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); return 0; } @@ -1097,15 +1128,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, sum); } -static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample sum; - - blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), @@ -1114,18 +1136,11 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) return 0; } -static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, - seq_cft(sf)->private, true); - return 0; -} - static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); + u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); return __blkg_prfill_u64(sf, pd, sum >> 9); } @@ -1142,8 +1157,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, { struct blkg_rwstat_sample tmp; - blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &tmp); + blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, + offsetof(struct bfq_group, stats.bytes), &tmp); return __blkg_prfill_u64(sf, pd, (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); @@ -1226,13 +1241,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { .name = "bfq.io_service_bytes", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat, }, { .name = "bfq.io_serviced", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { @@ -1269,13 +1284,13 @@ struct cftype bfq_blkcg_legacy_files[] = { /* the same statistics which cover the bfqg and its descendants */ { .name = "bfq.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat_recursive, }, { .name = "bfq.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat_recursive, }, #ifdef CONFIG_BFQ_CGROUP_DEBUG { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0319d6339822..ad4af4aaf2ce 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2713,6 +2713,28 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } + +static +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * To prevent bfqq's service guarantees from being violated, + * bfqq may be left busy, i.e., queued for service, even if + * empty (see comments in __bfq_bfqq_expire() for + * details). But, if no process will send requests to bfqq any + * longer, then there is no point in keeping bfqq queued for + * service. In addition, keeping bfqq queued for service, but + * with no process ref any longer, may have caused bfqq to be + * freed when dequeued from service. But this is assumed to + * never happen. + */ + if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, false); + + bfq_put_queue(bfqq); +} + static void bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) @@ -2783,8 +2805,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; - /* release process reference to bfqq */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -4899,7 +4920,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); /* release process reference */ + bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) @@ -5001,8 +5022,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { - /* release process reference on this queue */ - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqd, bfqq); bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); bic_set_bfqq(bic, bfqq, false); } @@ -5464,6 +5484,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool idle_timer_disabled = false; unsigned int cmd_flags; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) + bfqg_stats_update_legacy_io(q, rq); +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { spin_unlock_irq(&bfqd->lock); @@ -5963,7 +5987,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bfq_put_cooperator(bfqq); - bfq_put_queue(bfqq); + bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 5d1a519640f6..8526f20c53bc 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -10,6 +10,8 @@ #include <linux/hrtimer.h> #include <linux/blk-cgroup.h> +#include "blk-cgroup-rwstat.h" + #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) @@ -809,6 +811,9 @@ struct bfq_stat { }; struct bfqg_stats { + /* basic stats */ + struct blkg_rwstat bytes; + struct blkg_rwstat ios; #ifdef CONFIG_BFQ_CGROUP_DEBUG /* number of ios merged */ struct blkg_rwstat merged; @@ -956,6 +961,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); @@ -1062,6 +1068,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ @@ -1078,6 +1086,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ diff --git a/block/bio.c b/block/bio.c index 8f0ed6228fc5..b1170ec18464 100644 --- a/block/bio.c +++ b/block/bio.c @@ -751,7 +751,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return false; - if (bio->bi_vcnt > 0) { + if (bio->bi_vcnt > 0 && !bio_full(bio, len)) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (page_is_mergeable(bv, page, len, off, same_page)) { diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c new file mode 100644 index 000000000000..85d5790ac49b --- /dev/null +++ b/block/blk-cgroup-rwstat.c @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#include "blk-cgroup-rwstat.h" + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_init); + +void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_exit); + +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; + + if (!dname) + return 0; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + rwstat->cnt[i]); + + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); + +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. + */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat_sample rwstat = { }; + + blkg_rwstat_read((void *)pd + off, &rwstat); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat_sample structure containing the results + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. + */ +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + unsigned int i; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; + + if (!pos_blkg->online) + continue; + + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h new file mode 100644 index 000000000000..ee746919c41f --- /dev/null +++ b/block/blk-cgroup-rwstat.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#ifndef _BLK_CGROUP_RWSTAT_H +#define _BLK_CGROUP_RWSTAT_H + +#include <linux/blk-cgroup.h> + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ +struct blkg_rwstat { + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; +}; + +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); +void blkg_rwstat_exit(struct blkg_rwstat *rwstat); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum); + + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @op: REQ_OP and flags + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + unsigned int op, uint64_t val) +{ + struct percpu_counter *cnt; + + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it in the aux counts. + */ +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat_sample *result) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat_sample tmp = { }; + + blkg_rwstat_read(rwstat, &tmp); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } +} + +/** + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + u64 sum[BLKG_RWSTAT_NR]; + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} +#endif /* _BLK_CGROUP_RWSTAT_H */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b6f20be0fc78..708dea92dac8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blkg_rwstat_exit(&blkg->stat_ios); - blkg_rwstat_exit(&blkg->stat_bytes); + free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } @@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; - int i; + int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); @@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto err_free; - if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || - blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); + if (!blkg->iostat_cpu) goto err_free; blkg->q = q; @@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) + u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; - struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(&blkg->q->queue_lock); @@ -410,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) pol->pd_offline_fn(blkg->pd[i]); } - if (parent) { - blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); - blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); - } - blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -464,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i; + int i, cpu; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -475,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - blkg_rwstat_reset(&blkg->stat_bytes); - blkg_rwstat_reset(&blkg->stat_ios); + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *bis = + per_cpu_ptr(blkg->iostat_cpu, cpu); + memset(bis, 0, sizeof(*bis)); + } + memset(&blkg->iostat, 0, sizeof(blkg->iostat)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -560,186 +561,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/** - * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @rwstat: rwstat to print - * - * Print @rwstat to @sf for the device assocaited with @pd. - */ -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat) -{ - static const char *rwstr[] = { - [BLKG_RWSTAT_READ] = "Read", - [BLKG_RWSTAT_WRITE] = "Write", - [BLKG_RWSTAT_SYNC] = "Sync", - [BLKG_RWSTAT_ASYNC] = "Async", - [BLKG_RWSTAT_DISCARD] = "Discard", - }; - const char *dname = blkg_dev_name(pd->blkg); - u64 v; - int i; - - if (!dname) - return 0; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - rwstat->cnt[i]); - - v = rwstat->cnt[BLKG_RWSTAT_READ] + - rwstat->cnt[BLKG_RWSTAT_WRITE] + - rwstat->cnt[BLKG_RWSTAT_DISCARD]; - seq_printf(sf, "%s Total %llu\n", dname, v); - return v; -} -EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); - -/** - * blkg_prfill_rwstat - prfill callback for blkg_rwstat - * @sf: seq_file to print to - * @pd: policy private data of interest - * @off: offset to the blkg_rwstat in @pd - * - * prfill callback for printing a blkg_rwstat. - */ -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} -EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); - -static u64 blkg_prfill_rwstat_field(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat_sample rwstat = { }; - - blkg_rwstat_read((void *)pd->blkg + off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_bytes. - * cftype->private must be set to the blkcg_policy. - */ -int blkg_print_stat_bytes(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); - -/** - * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios - * @sf: seq_file to print to - * @v: unused - * - * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private - * must be set to the blkcg_policy. - */ -int blkg_print_stat_ios(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios); - -static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat_sample rwstat; - - blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat); - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -/** - * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_bytes), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); - -/** - * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios - * @sf: seq_file to print to - * @v: unused - */ -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - blkg_prfill_rwstat_field_recursive, - (void *)seq_cft(sf)->private, - offsetof(struct blkcg_gq, stat_ios), true); - return 0; -} -EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); - -/** - * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @blkg: blkg of interest - * @pol: blkcg_policy which contains the blkg_rwstat - * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg - * @sum: blkg_rwstat_sample structure containing the results - * - * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its - * online descendants and their aux counts. The caller must be holding the - * queue lock for online tests. - * - * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it - * is at @off bytes into @blkg's blkg_policy_data of the policy. - */ -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum) -{ - struct blkcg_gq *pos_blkg; - struct cgroup_subsys_state *pos_css; - unsigned int i; - - lockdep_assert_held(&blkg->q->queue_lock); - - rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat; - - if (!pos_blkg->online) - continue; - - if (pol) - rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; - else - rwstat = (void *)pos_blkg + off; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); - /* Performs queue bypass and policy enabled checks then looks up blkg. */ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, const struct blkcg_policy *pol, @@ -923,20 +744,27 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; + cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkg_iostat_set *bis = &blkg->iostat; const char *dname; char *buf; - struct blkg_rwstat_sample rwstat; u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; bool has_stats = false; + unsigned seq; + + spin_lock_irq(&blkg->q->queue_lock); + + if (!blkg->online) + goto skip; dname = blkg_dev_name(blkg); if (!dname) - continue; + goto skip; /* * Hooray string manipulation, count is the size written NOT @@ -946,21 +774,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - spin_lock_irq(&blkg->q->queue_lock); - - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes), &rwstat); - rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; - wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD]; + do { + seq = u64_stats_fetch_begin(&bis->sync); - blkg_rwstat_recursive_sum(blkg, NULL, - offsetof(struct blkcg_gq, stat_ios), &rwstat); - rios = rwstat.cnt[BLKG_RWSTAT_READ]; - wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; - dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; - - spin_unlock_irq(&blkg->q->queue_lock); + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -999,6 +822,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) seq_commit(sf, -1); } } + skip: + spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); @@ -1294,6 +1119,77 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1326,6 +1222,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, @@ -1362,7 +1259,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { struct blkg_policy_data *pd_prealloc = NULL; - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; if (blkcg_policy_enabled(q, pol)) @@ -1370,49 +1267,82 @@ int blkcg_activate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q); -pd_prealloc: - if (!pd_prealloc) { - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root); - if (!pd_prealloc) { - ret = -ENOMEM; - goto out_bypass_end; - } - } - +retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to init parents first */ + /* blkg_list is pushed at the head, reverse walk to allocate parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root); - if (!pd) - swap(pd, pd_prealloc); + /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ + if (blkg == pinned_blkg) { + pd = pd_prealloc; + pd_prealloc = NULL; + } else { + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, + blkg->blkcg); + } + if (!pd) { + /* + * GFP_NOWAIT failed. Free the existing one and + * prealloc for @blkg w/ GFP_KERNEL. + */ + if (pinned_blkg) + blkg_put(pinned_blkg); + blkg_get(blkg); + pinned_blkg = blkg; + spin_unlock_irq(&q->queue_lock); - goto pd_prealloc; + + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, + blkg->blkcg); + if (pd_prealloc) + goto retry; + else + goto enomem; } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - if (pol->pd_init_fn) - pol->pd_init_fn(pd); } + /* all allocated, init in the same order */ + if (pol->pd_init_fn) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_init_fn(blkg->pd[pol->plid]); + __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); -out_bypass_end: +out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); + if (pinned_blkg) + blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; + +enomem: + /* alloc failed, nothing's initialized yet, free everything */ + spin_lock_irq(&q->queue_lock); + list_for_each_entry(blkg, &q->blkg_list, q_node) { + if (blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } + } + spin_unlock_irq(&q->queue_lock); + ret = -ENOMEM; + goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); diff --git a/block/blk-core.c b/block/blk-core.c index d5e668ec751b..a1e228752083 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -132,6 +132,9 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), + REQ_OP_NAME(ZONE_OPEN), + REQ_OP_NAME(ZONE_CLOSE), + REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -336,14 +339,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); */ void blk_cleanup_queue(struct request_queue *q) { + WARN_ON_ONCE(blk_queue_registered(q)); + /* mark @q DYING, no new request or merges will be allowed afterwards */ - mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); blk_queue_flag_set(QUEUE_FLAG_DYING, q); - mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to @@ -848,11 +851,7 @@ static inline int blk_partition_remap(struct bio *bio) if (unlikely(bio_check_ro(bio, p))) goto out; - /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. - */ - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { + if (bio_sectors(bio)) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; @@ -936,6 +935,9 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; diff --git a/block/blk-exec.c b/block/blk-exec.c index 1db44ca0f4a6..e20a852ae432 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,6 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + blk_account_io_start(rq, true); + /* * don't check dying flag for MQ because the request won't * be reused after dying flag is set diff --git a/block/blk-flush.c b/block/blk-flush.c index 1eec9cbe5a0a..1777346baf06 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -136,6 +136,17 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) blk_mq_add_to_requeue_list(rq, add_front, true); } +static void blk_account_io_flush(struct request *rq) +{ + struct hd_struct *part = &rq->rq_disk->part0; + + part_stat_lock(); + part_stat_inc(part, ios[STAT_FLUSH]); + part_stat_add(part, nsecs[STAT_FLUSH], + ktime_get_ns() - rq->start_time_ns); + part_stat_unlock(); +} + /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced @@ -185,7 +196,7 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DONE: /* - * @rq was previously adjusted by blk_flush_issue() for + * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. @@ -212,6 +223,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); struct blk_mq_hw_ctx *hctx; + blk_account_io_flush(flush_rq); + /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 2a3db80c1dce..e01267f99183 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1057,9 +1057,12 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ - for (i = iocg->level; i > 0; i--) - if (!list_empty(&iocg->active_list)) + if (!list_empty(&iocg->active_list)) + goto succeed_unlock; + for (i = iocg->level - 1; i > 0; i--) + if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; + if (iocg->child_active_sum) goto fail_unlock; @@ -1101,6 +1104,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) ioc_start_period(ioc, now); } +succeed_unlock: spin_unlock_irq(&ioc->lock); return true; @@ -2110,10 +2114,10 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, goto einval; } - spin_lock_irq(&iocg->ioc->lock); + spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v; weight_updated(iocg); - spin_unlock_irq(&iocg->ioc->lock); + spin_unlock(&iocg->ioc->lock); blkg_conf_finish(&ctx); return nbytes; diff --git a/block/blk-merge.c b/block/blk-merge.c index 48e6725b32ee..d783bdc4559b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -293,7 +293,7 @@ split: void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs) { - struct bio *split; + struct bio *split = NULL; switch (bio_op(*bio)) { case REQ_OP_DISCARD: @@ -309,6 +309,21 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, nr_segs); break; default: + /* + * All drivers must accept single-segments bios that are <= + * PAGE_SIZE. This is a quick and dirty check that relies on + * the fact that bi_io_vec[0] is always valid if a bio has data. + * The check might lead to occasional false negatives when bios + * are cloned, but compared to the performance impact of cloned + * bios themselves the loop below doesn't matter anyway. + */ + if (!q->limits.chunk_sectors && + (*bio)->bi_vcnt == 1 && + ((*bio)->bi_io_vec[0].bv_len + + (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { + *nr_segs = 1; + break; + } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index a0d3ce30fa08..062229395a50 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -74,10 +74,8 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(ctx, page); + res = entry->show(ctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -97,10 +95,8 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(ctx, page, length); + res = entry->store(ctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -120,10 +116,8 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->show(hctx, page); + res = entry->show(hctx, page); mutex_unlock(&q->sysfs_lock); return res; } @@ -144,10 +138,8 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, if (!entry->store) return -EIO; - res = -ENOENT; mutex_lock(&q->sysfs_lock); - if (!blk_queue_dying(q)) - res = entry->store(hctx, page, length); + res = entry->store(hctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } @@ -166,20 +158,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { + const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; - ssize_t ret = 0; + int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) - ret += sprintf(ret + page, "%u", i); + ret = snprintf(pos + page, size - pos, "%u", i); else - ret += sprintf(ret + page, ", %u", i); + ret = snprintf(pos + page, size - pos, ", %u", i); + + if (ret >= size - pos) + break; first = 0; + pos += ret; } - ret += sprintf(ret + page, "\n"); - return ret; + ret = snprintf(pos + page, size + 1 - pos, "\n"); + return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 008388e82b5c..fbacde454718 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -15,14 +15,6 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) -{ - if (!tags) - return true; - - return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); -} - /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..15bc74acb57e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -28,7 +28,6 @@ extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); -extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); diff --git a/block/blk-mq.c b/block/blk-mq.c index ec791156e9cc..323c9cb28066 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -93,7 +93,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct mq_inflight { struct hd_struct *part; - unsigned int *inflight; + unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, @@ -102,45 +102,29 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - /* - * index[0] counts the specific partition that was asked for. - */ if (rq->part == mi->part) - mi->inflight[0]++; + mi->inflight[rq_data_dir(rq)]++; return true; } unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { - unsigned inflight[2]; - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - return inflight[0]; -} - -static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) -{ - struct mq_inflight *mi = priv; - - if (rq->part == mi->part) - mi->inflight[rq_data_dir(rq)]++; - - return true; + return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - struct mq_inflight mi = { .part = part, .inflight = inflight, }; + struct mq_inflight mi = { .part = part }; - inflight[0] = inflight[1] = 0; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) @@ -276,12 +260,6 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -{ - return blk_mq_has_free_tags(hctx->tags); -} -EXPORT_SYMBOL(blk_mq_can_queue); - /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. @@ -663,18 +641,6 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -int blk_mq_request_started(struct request *rq) -{ - return blk_mq_rq_state(rq) != MQ_RQ_IDLE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_started); - -int blk_mq_request_completed(struct request *rq) -{ - return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; -} -EXPORT_SYMBOL_GPL(blk_mq_request_completed); - void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -1064,7 +1030,7 @@ bool blk_mq_get_driver_tag(struct request *rq) bool shared; if (rq->tag != -1) - goto done; + return true; if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; @@ -1079,7 +1045,6 @@ bool blk_mq_get_driver_tag(struct request *rq) data.hctx->tags->rqs[rq->tag] = rq; } -done: return rq->tag != -1; } @@ -1486,7 +1451,7 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { int srcu_idx; bool need_run; @@ -1504,12 +1469,8 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) blk_mq_hctx_has_pending(hctx); hctx_unlock(hctx, srcu_idx); - if (need_run) { + if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); - return true; - } - - return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -2789,6 +2750,23 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + q->nr_hw_queues = set->nr_hw_queues; + kfree(hctxs); + hctxs = new_hctxs; + } + /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { @@ -2844,19 +2822,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -/* - * Maximum number of hardware queues we support. For single sets, we'll never - * have more than the CPUs (software queues). For multiple sets, the tag_set - * user may have set ->nr_hw_queues larger. - */ -static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) -{ - if (set->nr_maps == 1) - return nr_cpu_ids; - - return max(set->nr_hw_queues, nr_cpu_ids); -} - struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init) @@ -2876,12 +2841,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->nr_queues = nr_hw_queues(set); - q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), - GFP_KERNEL, set->numa_node); - if (!q->queue_hw_ctx) - goto err_sys_init; - INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); @@ -2929,7 +2888,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); q->nr_hw_queues = 0; -err_sys_init: blk_mq_sysfs_deinit(q); err_poll: blk_stat_free_callback(q->poll_cb); @@ -3030,6 +2988,29 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) } } +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3083,9 +3064,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), - GFP_KERNEL, set->numa_node); - if (!set->tags) + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; @@ -3126,7 +3105,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; - for (i = 0; i < nr_hw_queues(set); i++) + for (i = 0; i < set->nr_hw_queues; i++) blk_mq_free_map_and_requests(set, i); for (j = 0; j < set->nr_maps; j++) { @@ -3271,10 +3250,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); /* - * Sync with blk_mq_queue_tag_busy_iter. - */ - synchronize_rcu(); - /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. @@ -3288,6 +3263,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister(q); } + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -3304,6 +3283,7 @@ fallback: blk_mq_map_swqueue(q); } +reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register(q); blk_mq_debugfs_register_hctxs(q); diff --git a/block/blk-mq.h b/block/blk-mq.h index 32c62c64e6c2..eaaca8fc1c28 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,15 +128,6 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_release(struct request_queue *q); -/** - * blk_mq_rq_state() - read the current MQ_RQ_* state of a request - * @rq: target request. - */ -static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) -{ - return READ_ONCE(rq->state); -} - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 61b635bc2a31..656460636ad3 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -160,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd) return ret; } -void rq_depth_scale_up(struct rq_depth *rqd) +/* Returns true on success and false if scaling up wasn't possible */ +bool rq_depth_scale_up(struct rq_depth *rqd) { /* * Hit max in previous round, stop here */ if (rqd->scaled_max) - return; + return false; rqd->scale_step--; rqd->scaled_max = rq_depth_calc_max_depth(rqd); + return true; } /* * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. + * had a latency violation. Returns true on success and returns false if + * scaling down wasn't possible. */ -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents @@ -185,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) * keep up. */ if (rqd->max_depth == 1) - return; + return false; if (rqd->scale_step < 0 && hard_throttle) rqd->scale_step = 0; @@ -194,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); + return true; } struct rq_qos_wait_data { diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 08a09dbe0f4b..2bc43e94f4c4 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -108,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { - struct rq_qos *cur, *prev = NULL; - for (cur = q->rq_qos; cur; cur = cur->next) { - if (cur == rqos) { - if (prev) - prev->next = rqos->next; - else - q->rq_qos = cur; + struct rq_qos **cur; + + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; break; } - prev = cur; } blk_mq_debugfs_unregister_rqos(rqos); @@ -130,8 +127,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); -void rq_depth_scale_up(struct rq_depth *rqd); -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_scale_up(struct rq_depth *rqd); +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 457d9ba3eb20..6e7ec87d49fa 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -42,17 +42,13 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) static void trigger_softirq(void *data) { struct request *rq = data; - unsigned long flags; struct list_head *list; - local_irq_save(flags); list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); - - local_irq_restore(flags); } /* diff --git a/block/blk-stat.c b/block/blk-stat.c index 940f15d600f8..7da302ff88d0 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -53,7 +53,7 @@ void blk_stat_add(struct request *rq, u64 now) struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; - int bucket; + int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; @@ -61,6 +61,7 @@ void blk_stat_add(struct request *rq, u64 now) blk_throtl_stat_add(rq, value); rcu_read_lock(); + cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; @@ -69,10 +70,10 @@ void blk_stat_add(struct request *rq, u64 now) if (bucket < 0) continue; - stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; + stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); - put_cpu_ptr(cb->cpu_stat); } + put_cpu(); rcu_read_unlock(); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 46f5198be017..fca9b158f4a0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -801,10 +801,6 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->show(q, page); mutex_unlock(&q->sysfs_lock); return res; @@ -823,10 +819,6 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (blk_queue_dying(q)) { - mutex_unlock(&q->sysfs_lock); - return -ENOENT; - } res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 18f773e52dfb..98233c9c65a8 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,6 +12,7 @@ #include <linux/blktrace_api.h> #include <linux/blk-cgroup.h> #include "blk.h" +#include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ static int throtl_grp_quantum = 8; @@ -176,6 +177,9 @@ struct throtl_grp { unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; }; /* We measure latency for request size from <= 4k to >= 1M */ @@ -489,6 +493,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, if (!tg) return NULL; + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; + throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { @@ -513,6 +523,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -611,6 +627,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } @@ -1464,6 +1482,32 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", @@ -1491,23 +1535,23 @@ static struct cftype throtl_legacy_files[] = { }, { .name = "throttle.io_service_bytes", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; @@ -2127,7 +2171,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED)) + goto out; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) goto out; spin_lock_irq(&q->queue_lock); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8af553a0ba00..8641ba9793c5 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -308,7 +308,8 @@ static void calc_wb_limits(struct rq_wb *rwb) static void scale_up(struct rq_wb *rwb) { - rq_depth_scale_up(&rwb->rq_depth); + if (!rq_depth_scale_up(&rwb->rq_depth)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); @@ -317,7 +318,8 @@ static void scale_up(struct rq_wb *rwb) static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - rq_depth_scale_down(&rwb->rq_depth, hard_throttle); + if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, "scale down"); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 4bc5f260248a..6fad6f3f6980 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -93,172 +93,85 @@ unsigned int blkdev_nr_zones(struct block_device *bdev) if (!blk_queue_is_zoned(q)) return 0; - return __blkdev_nr_zones(q, bdev->bd_part->nr_sects); + return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk)); } EXPORT_SYMBOL_GPL(blkdev_nr_zones); -/* - * Check that a zone report belongs to this partition, and if yes, fix its start - * sector and write pointer and return true. Return false otherwise. - */ -static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep) -{ - sector_t offset = get_start_sect(bdev); - - if (rep->start < offset) - return false; - - rep->start -= offset; - if (rep->start + rep->len > bdev->bd_part->nr_sects) - return false; - - if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) - rep->wp = rep->start + rep->len; - else - rep->wp -= offset; - return true; -} - -static int blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) -{ - struct request_queue *q = disk->queue; - unsigned int z = 0, n, nrz = *nr_zones; - sector_t capacity = get_capacity(disk); - int ret; - - while (z < nrz && sector < capacity) { - n = nrz - z; - ret = disk->fops->report_zones(disk, sector, &zones[z], &n); - if (ret) - return ret; - if (!n) - break; - sector += blk_queue_zone_sectors(q) * n; - z += n; - } - - WARN_ON(z > *nr_zones); - *nr_zones = z; - - return 0; -} - /** * blkdev_report_zones - Get zones information * @bdev: Target block device * @sector: Sector from which to report zones - * @zones: Array of zone structures where to return the zones information - * @nr_zones: Number of zone structures in the zone array + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback * * Description: - * Get zone information starting from the zone containing @sector. - * The number of zone information reported may be less than the number - * requested by @nr_zones. The number of zones actually reported is - * returned in @nr_zones. - * The caller must use memalloc_noXX_save/restore() calls to control - * memory allocations done within this function (zone array and command - * buffer allocation by the device driver). + * Get zone information starting from the zone containing @sector for at most + * @nr_zones, and call @cb for each zone reported by the device. + * To report all zones in a device starting from @sector, the BLK_ALL_ZONES + * constant can be passed to @nr_zones. + * Returns the number of zones reported by the device, or a negative errno + * value in case of failure. + * + * Note: The caller must use memalloc_noXX_save/restore() calls to control + * memory allocations done within this function. */ int blkdev_report_zones(struct block_device *bdev, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) + unsigned int nr_zones, report_zones_cb cb, void *data) { - struct request_queue *q = bdev_get_queue(bdev); - unsigned int i, nrz; - int ret; - - if (!blk_queue_is_zoned(q)) - return -EOPNOTSUPP; + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); - /* - * A block device that advertized itself as zoned must have a - * report_zones method. If it does not have one defined, the device - * driver has a bug. So warn about that. - */ - if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones)) + if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || + WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; - if (!*nr_zones || sector >= bdev->bd_part->nr_sects) { - *nr_zones = 0; + if (!nr_zones || sector >= capacity) return 0; - } - - nrz = min(*nr_zones, - __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); - ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, - zones, &nrz); - if (ret) - return ret; - - for (i = 0; i < nrz; i++) { - if (!blkdev_report_zone(bdev, zones)) - break; - zones++; - } - *nr_zones = i; - - return 0; + return disk->fops->report_zones(disk, sector, nr_zones, cb, data); } EXPORT_SYMBOL_GPL(blkdev_report_zones); -/* - * Special case of zone reset operation to reset all zones in one command, - * useful for applications like mkfs. - */ -static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask) -{ - struct bio *bio = bio_alloc(gfp_mask, 0); - int ret; - - /* across the zones operations, don't need any sectors */ - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0); - - ret = submit_bio_wait(bio); - bio_put(bio); - - return ret; -} - static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors) { if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) return false; - if (nr_sectors != part_nr_sects_read(bdev->bd_part)) - return false; /* - * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is - * the entire disk, that is, if the blocks device start offset is 0 and - * its capacity is the same as the entire disk. + * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors + * of the applicable zone range is the entire disk. */ - return get_start_sect(bdev) == 0 && - part_nr_sects_read(bdev->bd_part) == get_capacity(bdev->bd_disk); + return !sector && nr_sectors == get_capacity(bdev->bd_disk); } /** - * blkdev_reset_zones - Reset zones write pointer + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device - * @sector: Start sector of the first zone to reset - * @nr_sectors: Number of sectors, at least the length of one zone + * @op: Operation to be performed on the zones + * @sector: Start sector of the first zone to operate on + * @nr_sectors: Number of sectors, should be at least the length of one zone and + * must be zone size aligned. * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: - * Reset the write pointer of the zones contained in the range + * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. + * The operation to execute on each zone can be a zone reset, open, close + * or finish request. */ -int blkdev_reset_zones(struct block_device *bdev, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors; + sector_t zone_sectors = blk_queue_zone_sectors(q); + sector_t capacity = get_capacity(bdev->bd_disk); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - struct blk_plug plug; int ret; if (!blk_queue_is_zoned(q)) @@ -267,45 +180,62 @@ int blkdev_reset_zones(struct block_device *bdev, if (bdev_read_only(bdev)) return -EPERM; - if (!nr_sectors || end_sector > bdev->bd_part->nr_sects) + if (!op_is_zone_mgmt(op)) + return -EOPNOTSUPP; + + if (!nr_sectors || end_sector > capacity) /* Out of range */ return -EINVAL; - if (blkdev_allow_reset_all_zones(bdev, nr_sectors)) - return __blkdev_reset_all_zones(bdev, gfp_mask); - /* Check alignment (handle eventual smaller last zone) */ - zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) return -EINVAL; - if ((nr_sectors & (zone_sectors - 1)) && - end_sector != bdev->bd_part->nr_sects) + if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) return -EINVAL; - blk_start_plug(&plug); while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + /* + * Special case for the zone reset operation that reset all + * zones, this is useful for applications like mkfs. + */ + if (op == REQ_OP_ZONE_RESET && + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + break; + } + + bio->bi_opf = op; + bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); - } ret = submit_bio_wait(bio); bio_put(bio); - blk_finish_plug(&plug); - return ret; } -EXPORT_SYMBOL_GPL(blkdev_reset_zones); +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); + +struct zone_report_args { + struct blk_zone __user *zones; +}; + +static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zone_report_args *args = data; + + if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) + return -EFAULT; + return 0; +} /* * BLKREPORTZONE ioctl processing. @@ -315,9 +245,9 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; + struct zone_report_args args; struct request_queue *q; struct blk_zone_report rep; - struct blk_zone *zones; int ret; if (!argp) @@ -339,44 +269,29 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!rep.nr_zones) return -EINVAL; - rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones); - - zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); - if (!zones) - return -ENOMEM; - - ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones); - if (ret) - goto out; - - if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { - ret = -EFAULT; - goto out; - } - - if (rep.nr_zones) { - if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, - sizeof(struct blk_zone) * rep.nr_zones)) - ret = -EFAULT; - } - - out: - kvfree(zones); + args.zones = argp + sizeof(struct blk_zone_report); + ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, + blkdev_copy_zone_to_user, &args); + if (ret < 0) + return ret; - return ret; + rep.nr_zones = ret; + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) + return -EFAULT; + return 0; } /* - * BLKRESETZONE ioctl processing. + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct request_queue *q; struct blk_zone_range zrange; + enum req_opf op; if (!argp) return -EINVAL; @@ -397,8 +312,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; - return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + switch (cmd) { + case BLKRESETZONE: + op = REQ_OP_ZONE_RESET; + break; + case BLKOPENZONE: + op = REQ_OP_ZONE_OPEN; + break; + case BLKCLOSEZONE: + op = REQ_OP_ZONE_CLOSE; + break; + case BLKFINISHZONE: + op = REQ_OP_ZONE_FINISH; + break; + default: + return -ENOTTY; + } + + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); } static inline unsigned long *blk_alloc_zone_bitmap(int node, @@ -408,37 +340,99 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, GFP_NOIO, node); } +void blk_queue_free_zone_bitmaps(struct request_queue *q) +{ + kfree(q->seq_zones_bitmap); + q->seq_zones_bitmap = NULL; + kfree(q->seq_zones_wlock); + q->seq_zones_wlock = NULL; +} + +struct blk_revalidate_zone_args { + struct gendisk *disk; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + sector_t sector; +}; + /* - * Allocate an array of struct blk_zone to get nr_zones zone information. - * The allocated array may be smaller than nr_zones. + * Helper function to check the validity of zones of a zoned block device. */ -static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - struct blk_zone *zones; - size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); + struct blk_revalidate_zone_args *args = data; + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + sector_t zone_sectors = blk_queue_zone_sectors(q); + sector_t capacity = get_capacity(disk); /* - * GFP_KERNEL here is meaningless as the caller task context has - * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() - * with memalloc_noio_save(). + * All zones must have the same size, with the exception on an eventual + * smaller last zone. */ - zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); - if (!zones) { - *nr_zones = 0; - return NULL; + if (zone->start + zone_sectors < capacity && + zone->len != zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return false; } - *nr_zones = nrz; + if (zone->start + zone->len >= capacity && + zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } + + /* Check for holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } - return zones; + /* Check zone type */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + break; + default: + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", + disk->disk_name, (int)zone->type, zone->start); + return -ENODEV; + } + + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) + set_bit(idx, args->seq_zones_bitmap); + + args->sector += zone->len; + return 0; } -void blk_queue_free_zone_bitmaps(struct request_queue *q) +static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones, + struct blk_revalidate_zone_args *args) { - kfree(q->seq_zones_bitmap); - q->seq_zones_bitmap = NULL; - kfree(q->seq_zones_wlock); - q->seq_zones_wlock = NULL; + /* + * Ensure that all memory allocations in this context are done as + * if GFP_NOIO was specified. + */ + unsigned int noio_flag = memalloc_noio_save(); + struct request_queue *q = disk->queue; + int ret; + + args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); + if (!args->seq_zones_wlock) + return -ENOMEM; + args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); + if (!args->seq_zones_bitmap) + return -ENOMEM; + + ret = disk->fops->report_zones(disk, 0, nr_zones, + blk_revalidate_zone_cb, args); + memalloc_noio_restore(noio_flag); + return ret; } /** @@ -454,13 +448,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); - unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; - unsigned int i, rep_nr_zones = 0, z = 0, nrz; - struct blk_zone *zones = NULL; - unsigned int noio_flag; - sector_t sector = 0; + struct blk_revalidate_zone_args args = { .disk = disk }; int ret = 0; + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return -EIO; + /* * BIO based queues do not use a scheduler so only q->nr_zones * needs to be updated so that the sysfs exposed value is correct. @@ -470,78 +463,28 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return 0; } - /* - * Ensure that all memory allocations in this context are done as - * if GFP_NOIO was specified. - */ - noio_flag = memalloc_noio_save(); - - if (!blk_queue_is_zoned(q) || !nr_zones) { - nr_zones = 0; - goto update; - } - - /* Allocate bitmaps */ - ret = -ENOMEM; - seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!seq_zones_wlock) - goto out; - seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); - if (!seq_zones_bitmap) - goto out; - - /* Get zone information and initialize seq_zones_bitmap */ - rep_nr_zones = nr_zones; - zones = blk_alloc_zones(&rep_nr_zones); - if (!zones) - goto out; - - while (z < nr_zones) { - nrz = min(nr_zones - z, rep_nr_zones); - ret = blk_report_zones(disk, sector, zones, &nrz); - if (ret) - goto out; - if (!nrz) - break; - for (i = 0; i < nrz; i++) { - if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(z, seq_zones_bitmap); - z++; - } - sector += nrz * blk_queue_zone_sectors(q); - } - - if (WARN_ON(z != nr_zones)) { - ret = -EIO; - goto out; - } + if (nr_zones) + ret = blk_update_zone_info(disk, nr_zones, &args); -update: /* * Install the new bitmaps, making sure the queue is stopped and * all I/Os are completed (i.e. a scheduler is not referencing the * bitmaps). */ blk_mq_freeze_queue(q); - q->nr_zones = nr_zones; - swap(q->seq_zones_wlock, seq_zones_wlock); - swap(q->seq_zones_bitmap, seq_zones_bitmap); - blk_mq_unfreeze_queue(q); - -out: - memalloc_noio_restore(noio_flag); - - kvfree(zones); - kfree(seq_zones_wlock); - kfree(seq_zones_bitmap); - - if (ret) { + if (ret >= 0) { + q->nr_zones = nr_zones; + swap(q->seq_zones_wlock, args.seq_zones_wlock); + swap(q->seq_zones_bitmap, args.seq_zones_bitmap); + ret = 0; + } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - blk_mq_freeze_queue(q); blk_queue_free_zone_bitmaps(q); - blk_mq_unfreeze_queue(q); } + blk_mq_unfreeze_queue(q); + kfree(args.seq_zones_wlock); + kfree(args.seq_zones_bitmap); return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); diff --git a/block/blk.h b/block/blk.h index 47fba9362e60..2bea40180b6f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -242,14 +242,11 @@ int blk_dev_init(void); * Contribute to IO statistics IFF: * * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request + * b) the queue had IO stats enabled when this request was started */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && - (rq->rq_flags & RQF_IO_STAT) && - !blk_rq_is_passthrough(rq); + return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); } static inline void req_set_nomerge(struct request_queue *q, struct request *req) diff --git a/block/elevator.c b/block/elevator.c index 5437059c9261..4eab3d70e880 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -616,7 +616,8 @@ out: static inline bool elv_support_iosched(struct request_queue *q) { - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) + if (!q->mq_ops || + (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) return false; return true; } @@ -831,3 +832,12 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +static int __init elevator_setup(char *str) +{ + pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" + "Please use sysfs to set IO scheduler for individual devices.\n"); + return 1; +} + +__setup("elevator=", elevator_setup); diff --git a/block/genhd.c b/block/genhd.c index 26b31fcae217..ff6268970ddc 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1385,7 +1385,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " - "%lu %lu %lu %u\n", + "%lu %lu %lu %u " + "%lu %u" + "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[STAT_READ]), @@ -1402,7 +1404,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), + part_stat_read(hd, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) ); } disk_part_iter_exit(&piter); diff --git a/block/ioctl.c b/block/ioctl.c index 15a0eb80ada9..7ac8a66c9787 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -155,48 +155,21 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -/* - * This is an exported API for the block driver, and will not - * acquire bd_mutex. This API should be used in case that - * caller has held bd_mutex already. - */ -int __blkdev_reread_part(struct block_device *bdev) +static int blkdev_reread_part(struct block_device *bdev) { - struct gendisk *disk = bdev->bd_disk; + int ret; - if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) + if (!disk_part_scan_enabled(bdev->bd_disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - lockdep_assert_held(&bdev->bd_mutex); - - return rescan_partitions(disk, bdev); -} -EXPORT_SYMBOL(__blkdev_reread_part); - -/* - * This is an exported API for the block driver, and will - * try to acquire bd_mutex. If bd_mutex has been held already - * in current context, please call __blkdev_reread_part(). - * - * Make sure the held locks in current context aren't required - * in open()/close() handler and I/O path for avoiding ABBA deadlock: - * - bd_mutex is held before calling block driver's open/close - * handler - * - reading partition table may submit I/O to the block device - */ -int blkdev_reread_part(struct block_device *bdev) -{ - int res; - mutex_lock(&bdev->bd_mutex); - res = __blkdev_reread_part(bdev); + ret = bdev_disk_changed(bdev, false); mutex_unlock(&bdev->bd_mutex); - return res; + return ret; } -EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) @@ -532,7 +505,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: - return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: + return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(arg, bdev_zone_sectors(bdev)); case BLKGETNRZONES: diff --git a/block/opal_proto.h b/block/opal_proto.h index 5532412d567c..325cbba2465f 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -76,7 +76,6 @@ enum opal_response_token { * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Section: 6.3 Assigned UIDs */ -#define OPAL_UID_LENGTH 8 #define OPAL_METHOD_LENGTH 8 #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 @@ -108,6 +107,7 @@ enum opal_uid { OPAL_C_PIN_TABLE, OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + OPAL_DATASTORE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, @@ -205,6 +205,10 @@ enum opal_lockingstate { OPAL_LOCKING_LOCKED = 0x03, }; +enum opal_parameter { + OPAL_SUM_SET_LIST = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/partition-generic.c b/block/partition-generic.c index aee643ce13d1..1d20c9cf213f 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -127,7 +127,8 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " - "%8lu %8lu %8llu %8u" + "%8lu %8lu %8llu %8u " + "%8lu %8u" "\n", part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), @@ -143,7 +144,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, @@ -439,12 +442,14 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -static int drop_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) { struct disk_part_iter piter; struct hd_struct *part; int res; + if (!disk_part_scan_enabled(disk)) + return 0; if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); @@ -459,204 +464,124 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -static bool part_zone_aligned(struct gendisk *disk, - struct block_device *bdev, - sector_t from, sector_t size) +static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, + struct parsed_partitions *state, int p) { - unsigned int zone_sectors = bdev_zone_sectors(bdev); + sector_t size = state->parts[p].size; + sector_t from = state->parts[p].from; + struct hd_struct *part; - /* - * If this function is called, then the disk is a zoned block device - * (host-aware or host-managed). This can be detected even if the - * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not - * set). In this case, however, only host-aware devices will be seen - * as a block device is not created for host-managed devices. Without - * zoned block device support, host-aware drives can still be used as - * regular block devices (no zone operation) and their zone size will - * be reported as 0. Allow this case. - */ - if (!zone_sectors) + if (!size) return true; - /* - * Check partition start and size alignement. If the drive has a - * smaller last runt zone, ignore it and allow the partition to - * use it. Check the zone size too: it should be a power of 2 number - * of sectors. - */ - if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) { - u32 rem; - - div_u64_rem(from, zone_sectors, &rem); - if (rem) + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) return false; - if ((from + size) < get_capacity(disk)) { - div_u64_rem(size, zone_sectors, &rem); - if (rem) - return false; - } + return true; + } - } else { + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); - if (from & (zone_sectors - 1)) - return false; - if ((from + size) < get_capacity(disk) && - (size & (zone_sectors - 1))) + if (disk_unlock_native_capacity(disk)) return false; + /* + * We can not ignore partitions of broken tables created by for + * example camera firmware, but we limit them to the end of the + * disk to avoid creating invalid block devices. + */ + size = get_capacity(disk) - from; + } + + part = add_partition(disk, p, from, size, state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part)) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + return true; } +#ifdef CONFIG_BLK_DEV_MD + if (state->parts[p].flags & ADDPART_FLAG_RAID) + md_autodetect_dev(part_to_dev(part)->devt); +#endif return true; } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; - struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - free_partitions(state); - state = NULL; - } + struct parsed_partitions *state; + int ret = -EAGAIN, p, highest; - res = drop_partitions(disk, bdev); - if (res) - return res; + if (!disk_part_scan_enabled(disk)) + return 0; - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); - check_disk_size_change(disk, bdev, true); - bdev->bd_invalidated = 0; - if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) + state = check_partition(disk, bdev); + if (!state) return 0; if (IS_ERR(state)) { /* - * I/O error reading the partition table. If any - * partition code tried to read beyond EOD, retry - * after unlocking native capacity. + * I/O error reading the partition table. If we tried to read + * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) - goto rescan; + return -EAGAIN; } return -EIO; } + /* - * If any partition code tried to read beyond EOD, try - * unlocking native capacity even if partition table is - * successfully read as we could be missing some partitions. + * Partitions are not supported on zoned block devices. + */ + if (bdev_is_zoned(bdev)) { + pr_warn("%s: ignoring partition table on zoned block device\n", + disk->disk_name); + ret = 0; + goto out_free_state; + } + + /* + * If we read beyond EOD, try unlocking native capacity even if the + * partition table was successfully read as we could be missing some + * partitions. */ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) - goto rescan; + goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* Detect the highest partition number and preallocate - * disk->part_tbl. This is an optimization and not strictly - * necessary. + /* + * Detect the highest partition number and preallocate disk->part_tbl. + * This is an optimization and not strictly necessary. */ for (p = 1, highest = 0; p < state->limit; p++) if (state->parts[p].size) highest = p; - disk_expand_part_tbl(disk, highest); - /* add partitions */ - for (p = 1; p < state->limit; p++) { - sector_t size, from; - - size = state->parts[p].size; - if (!size) - continue; - - from = state->parts[p].from; - if (from >= get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d start %llu is beyond EOD, ", - disk->disk_name, p, (unsigned long long) from); - if (disk_unlock_native_capacity(disk)) - goto rescan; - continue; - } - - if (from + size > get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d size %llu extends beyond EOD, ", - disk->disk_name, p, (unsigned long long) size); - - if (disk_unlock_native_capacity(disk)) { - /* free state and restart */ - goto rescan; - } else { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but - * we limit them to the end of the disk to avoid - * creating invalid block devices - */ - size = get_capacity(disk) - from; - } - } - - /* - * On a zoned block device, partitions should be aligned on the - * device zone size (i.e. zone boundary crossing not allowed). - * Otherwise, resetting the write pointer of the last zone of - * one partition may impact the following partition. - */ - if (bdev_is_zoned(bdev) && - !part_zone_aligned(disk, bdev, from, size)) { - printk(KERN_WARNING - "%s: p%d start %llu+%llu is not zone aligned\n", - disk->disk_name, p, (unsigned long long) from, - (unsigned long long) size); - continue; - } + for (p = 1; p < state->limit; p++) + if (!blk_add_partition(disk, bdev, state, p)) + goto out_free_state; - part = add_partition(disk, p, from, size, - state->parts[p].flags, - &state->parts[p].info); - if (IS_ERR(part)) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); - continue; - } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) - md_autodetect_dev(part_to_dev(part)->devt); -#endif - } + ret = 0; +out_free_state: free_partitions(state); - return 0; -} - -int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) -{ - int res; - - if (!bdev->bd_invalidated) - return 0; - - res = drop_partitions(disk, bdev); - if (res) - return res; - - set_capacity(disk, 0); - check_disk_size_change(disk, bdev, false); - bdev->bd_invalidated = 0; - /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - - return 0; + return ret; } unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) diff --git a/block/sed-opal.c b/block/sed-opal.c index b4c761973ac1..880cc57a5f6b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -149,6 +149,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_DATASTORE] = + { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ [OPAL_C_PIN_MSID] = @@ -1139,11 +1141,11 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, * * the result is provided in dev->resp->tok[4] */ -static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, +static int generic_get_table_info(struct opal_dev *dev, const u8 *table_uid, u64 column) { u8 uid[OPAL_UID_LENGTH]; - const unsigned int half = OPAL_UID_LENGTH/2; + const unsigned int half = OPAL_UID_LENGTH_HALF; /* sed-opal UIDs can be split in two halves: * first: actual table index @@ -1152,7 +1154,7 @@ static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table, * first part of the target table as relative index into that table */ memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); - memcpy(uid+half, opaluid[table], half); + memcpy(uid + half, table_uid, half); return generic_get_column(dev, uid, column); } @@ -1221,6 +1223,75 @@ static int get_active_key(struct opal_dev *dev, void *data) return get_active_key_cont(dev); } +static int generic_table_write_data(struct opal_dev *dev, const u64 data, + u64 offset, u64 size, const u8 *uid) +{ + const u8 __user *src = (u8 __user *)(uintptr_t)data; + u8 *dst; + u64 len; + size_t off = 0; + int err; + + /* do we fit in the available space? */ + err = generic_get_table_info(dev, uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (size > len || offset > len - size) { + pr_debug("Does not fit in the table (%llu vs. %llu)\n", + offset + size, len); + return -ENOSPC; + } + + /* do the actual transmission(s) */ + while (off < size) { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(size - off)); + pr_debug("Write bytes %zu+%llu/%llu\n", off, len, size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + + if (copy_from_user(dst, src + off, len)) { + err = -EFAULT; + break; + } + + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + + return err; +} + static int generic_lr_enable_disable(struct opal_dev *dev, u8 *uid, bool rle, bool wle, bool rl, bool wl) @@ -1583,68 +1654,9 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data) static int write_shadow_mbr(struct opal_dev *dev, void *data) { struct opal_shadow_mbr *shadow = data; - const u8 __user *src; - u8 *dst; - size_t off = 0; - u64 len; - int err = 0; - - /* do we fit in the available shadow mbr space? */ - err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS); - if (err) { - pr_debug("MBR: could not get shadow size\n"); - return err; - } - - len = response_get_u64(&dev->parsed, 4); - if (shadow->size > len || shadow->offset > len - shadow->size) { - pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n", - shadow->offset + shadow->size, len); - return -ENOSPC; - } - - /* do the actual transmission(s) */ - src = (u8 __user *)(uintptr_t)shadow->data; - while (off < shadow->size) { - err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]); - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WHERE); - add_token_u64(&err, dev, shadow->offset + off); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - - /* - * The bytestring header is either 1 or 2 bytes, so assume 2. - * There also needs to be enough space to accommodate the - * trailing OPAL_ENDNAME (1 byte) and tokens added by - * cmd_finalize. - */ - len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), - (size_t)(shadow->size - off)); - pr_debug("MBR: write bytes %zu+%llu/%llu\n", - off, len, shadow->size); - - dst = add_bytestring_header(&err, dev, len); - if (!dst) - break; - if (copy_from_user(dst, src + off, len)) - err = -EFAULT; - dev->pos += len; - - add_token_u8(&err, dev, OPAL_ENDNAME); - if (err) - break; - - err = finalize_and_send(dev, parse_and_check_status); - if (err) - break; - - off += len; - } - return err; + return generic_table_write_data(dev, shadow->data, shadow->offset, + shadow->size, opaluid[OPAL_MBR]); } static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, @@ -1874,7 +1886,6 @@ static int activate_lsp(struct opal_dev *dev, void *data) { struct opal_lr_act *opal_act = data; u8 user_lr[OPAL_UID_LENGTH]; - u8 uint_3 = 0x83; int err, i; err = cmd_start(dev, opaluid[OPAL_LOCKINGSP_UID], @@ -1887,10 +1898,7 @@ static int activate_lsp(struct opal_dev *dev, void *data) return err; add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, uint_3); - add_token_u8(&err, dev, 6); - add_token_u8(&err, dev, 0); - add_token_u8(&err, dev, 0); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); add_token_u8(&err, dev, OPAL_STARTLIST); add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); @@ -1957,6 +1965,113 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data) return 0; } +static int write_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *write_tbl = data; + + return generic_table_write_data(dev, write_tbl->data, write_tbl->offset, + write_tbl->size, write_tbl->table_uid); +} + +static int read_table_data_cont(struct opal_dev *dev) +{ + int err; + const char *data_read; + + err = parse_and_check_status(dev); + if (err) + return err; + + dev->prev_d_len = response_get_string(&dev->parsed, 1, &data_read); + dev->prev_data = (void *)data_read; + if (!dev->prev_data) { + pr_debug("%s: Couldn't read data from the table.\n", __func__); + return OPAL_INVAL_PARAM; + } + + return 0; +} + +/* + * IO_BUFFER_LENGTH = 2048 + * sizeof(header) = 56 + * No. of Token Bytes in the Response = 11 + * MAX size of data that can be carried in response buffer + * at a time is : 2048 - (56 + 11) = 1981 = 0x7BD. + */ +#define OPAL_MAX_READ_TABLE (0x7BD) + +static int read_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *read_tbl = data; + int err; + size_t off = 0, max_read_size = OPAL_MAX_READ_TABLE; + u64 table_len, len; + u64 offset = read_tbl->offset, read_size = read_tbl->size - 1; + u8 __user *dst; + + err = generic_get_table_info(dev, read_tbl->table_uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + table_len = response_get_u64(&dev->parsed, 4); + + /* Check if the user is trying to read from the table limits */ + if (read_size > table_len || offset > table_len - read_size) { + pr_debug("Read size exceeds the Table size limits (%llu vs. %llu)\n", + offset + read_size, table_len); + return -EINVAL; + } + + while (off < read_size) { + err = cmd_start(dev, read_tbl->table_uid, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTROW); + add_token_u64(&err, dev, offset + off); /* start row value */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDROW); + + len = min(max_read_size, (size_t)(read_size - off)); + add_token_u64(&err, dev, offset + off + len); /* end row value + */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_debug("Error building read table data command.\n"); + break; + } + + err = finalize_and_send(dev, read_table_data_cont); + if (err) + break; + + /* len+1: This includes the NULL terminator at the end*/ + if (dev->prev_d_len > len + 1) { + err = -EOVERFLOW; + break; + } + + dst = (u8 __user *)(uintptr_t)read_tbl->data; + if (copy_to_user(dst + off, dev->prev_data, dev->prev_d_len)) { + pr_debug("Error copying data to userspace\n"); + err = -EFAULT; + break; + } + dev->prev_data = NULL; + + off += len; + } + + return err; +} + static int end_opal_session(struct opal_dev *dev, void *data) { int err = 0; @@ -2443,6 +2558,68 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } EXPORT_SYMBOL(opal_unlock_from_suspend); +static int opal_read_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step read_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { read_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, read_table_steps, + ARRAY_SIZE(read_table_steps)); +} + +static int opal_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step write_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { write_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, write_table_steps, + ARRAY_SIZE(write_table_steps)); +} + +static int opal_generic_read_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + int ret, bit_set; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + bit_set = fls64(rw_tbl->flags) - 1; + switch (bit_set) { + case OPAL_READ_TABLE: + ret = opal_read_table(dev, rw_tbl); + break; + case OPAL_WRITE_TABLE: + ret = opal_write_table(dev, rw_tbl); + break; + default: + pr_debug("Invalid bit set in the flag (%016llx).\n", + rw_tbl->flags); + ret = -EINVAL; + break; + } + + mutex_unlock(&dev->dev_lock); + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2505,6 +2682,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_PSID_REVERT_TPR: ret = opal_reverttper(dev, p, true); break; + case IOC_OPAL_GENERIC_TABLE_RW: + ret = opal_generic_read_write_table(dev, p); + break; default: break; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 9803c7e0376e..f4907d941f03 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -235,16 +235,12 @@ static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_prepare(struct request *rq) { } -/** - * Type 3 does not have a reference tag so no remapping is required. - */ +/* Type 3 does not have a reference tag so no remapping is required. */ static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) { } |