diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 5 | ||||
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/bfq-iosched.c | 182 | ||||
-rw-r--r-- | block/bfq-iosched.h | 25 | ||||
-rw-r--r-- | block/bio-integrity.c | 26 | ||||
-rw-r--r-- | block/bio.c | 32 | ||||
-rw-r--r-- | block/blk-cgroup.c | 8 | ||||
-rw-r--r-- | block/blk-core.c | 153 | ||||
-rw-r--r-- | block/blk-flush.c | 26 | ||||
-rw-r--r-- | block/blk-lib.c | 8 | ||||
-rw-r--r-- | block/blk-merge.c | 6 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 4 | ||||
-rw-r--r-- | block/blk-mq-rdma.c | 52 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 23 | ||||
-rw-r--r-- | block/blk-mq.c | 54 | ||||
-rw-r--r-- | block/blk-mq.h | 3 | ||||
-rw-r--r-- | block/blk-settings.c | 1 | ||||
-rw-r--r-- | block/blk-softirq.c | 2 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/blk-tag.c | 1 | ||||
-rw-r--r-- | block/blk-throttle.c | 13 | ||||
-rw-r--r-- | block/blk-zoned.c | 4 | ||||
-rw-r--r-- | block/blk.h | 3 | ||||
-rw-r--r-- | block/bsg.c | 7 | ||||
-rw-r--r-- | block/cfq-iosched.c | 31 | ||||
-rw-r--r-- | block/compat_ioctl.c | 2 | ||||
-rw-r--r-- | block/deadline-iosched.c | 9 | ||||
-rw-r--r-- | block/elevator.c | 4 | ||||
-rw-r--r-- | block/genhd.c | 109 | ||||
-rw-r--r-- | block/mq-deadline.c | 10 | ||||
-rw-r--r-- | block/partition-generic.c | 23 |
31 files changed, 518 insertions, 311 deletions
diff --git a/block/Kconfig b/block/Kconfig index 89cd28f8d051..3ab42bbb06d5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -206,4 +206,9 @@ config BLK_MQ_VIRTIO depends on BLOCK && VIRTIO default y +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2b281cf258a0..9396ebc85d24 100644 --- a/block/Makefile +++ b/block/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 436b6ca6b175..6a7a26b6cec1 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -128,7 +128,7 @@ BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); @@ -731,10 +731,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, unsigned int old_wr_coeff = bfqq->wr_coeff; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - if (bic->saved_idle_window) - bfq_mark_bfqq_idle_window(bfqq); + if (bic->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); @@ -2012,7 +2012,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) return; bic->saved_ttime = bfqq->ttime; - bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); @@ -3038,8 +3038,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, short_ttime %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq)); /* * Increase, decrease or leave budget unchanged according to @@ -3114,7 +3114,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, + bool rot_without_queueing = + !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + bfqq_sequential_and_IO_bound, + idling_boosts_thr, idling_boosts_thr_without_issues, idling_needed_for_service_guarantees, asymmetric_scenario; @@ -3122,27 +3125,45 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) return true; /* + * Idling is performed only if slice_idle > 0. In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && + bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); + + /* * The next variable takes into account the cases where idling * boosts the throughput. * * The value of the variable is computed considering, first, that * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. + * (a) the device is not NCQ-capable and rotational, or + * (b) regardless of the presence of NCQ, the device is rotational and + * the request pattern for bfqq is I/O-bound and sequential, or + * (c) regardless of whether it is rotational, the device is + * not NCQ-capable and the request pattern for bfqq is + * I/O-bound and sequential. * * Secondly, and in contrast to the above item (b), idling an * NCQ-capable flash-based device would not boost the * throughput even with sequential I/O; rather it would lower * the throughput in proportion to how fast the device * is. Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. + * above conditions (a), (b) or (c) is true, and, in + * particular, happens to be false if bfqd is an NCQ-capable + * flash-based device. */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)); + idling_boosts_thr = rot_without_queueing || + ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + bfqq_sequential_and_IO_bound); /* * The value of the next variable, @@ -3313,16 +3334,13 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. + * We have now all the components we need to compute the + * return value of the function, which is true only if idling + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. */ - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); + return idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees; } /* @@ -3338,10 +3356,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; - - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); } /* @@ -3783,7 +3798,6 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; bfqq->new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); break; } @@ -3843,8 +3857,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_set_next_ioprio_data(bfqq, bic); if (is_sync) { + /* + * No need to mark as has_short_ttime if in + * idle_class, because no device idling is performed + * for queues in idle class + */ if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); + /* tentatively mark as has_short_ttime */ + bfq_mark_bfqq_has_short_ttime(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); } else @@ -3985,18 +4005,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); } -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. - */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) +static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { - int enable_idle; + bool has_short_ttime = true; - /* Don't idle for async or idle io prio class. */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + /* + * No need to update has_short_ttime if bfqq is async or in + * idle io prio class, or if bfq_slice_idle is zero, because + * no device idling is performed for bfqq in this case. + */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || + bfqd->bfq_slice_idle == 0) return; /* Idle window just restored, statistics are meaningless. */ @@ -4004,27 +4025,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, bfqd->bfq_wr_min_idle_time)) return; - enable_idle = bfq_bfqq_idle_window(bfqq); - + /* Think time is infinite if no process is linked to + * bfqq. Otherwise check average think time to + * decide whether to mark as has_short_ttime + */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->wr_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) { - if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle && - bfqq->wr_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); + (bfq_sample_valid(bfqq->ttime.ttime_samples) && + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); + bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", + has_short_ttime); + + if (has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); } /* @@ -4040,14 +4056,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->meta_pending++; bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_has_short_ttime(bfqd, bfqq, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + "rq_enqueued: has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); @@ -4787,16 +4801,13 @@ static ssize_t bfq_var_show(unsigned int var, char *page) return sprintf(page, "%u\n", var); } -static ssize_t bfq_var_store(unsigned long *var, const char *page, - size_t count) +static void bfq_var_store(unsigned long *var, const char *page) { unsigned long new_val; int ret = kstrtoul(page, 10, &new_val); if (ret == 0) *var = new_val; - - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -4838,7 +4849,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct bfq_data *bfqd = e->elevator_data; \ unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ + bfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4849,7 +4860,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, INT_MAX, 2); @@ -4866,13 +4877,13 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ { \ struct bfq_data *bfqd = e->elevator_data; \ unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ + bfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ + return count; \ } USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, UINT_MAX); @@ -4883,7 +4894,8 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data == 0) bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); @@ -4895,7 +4907,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, bfqd->bfq_user_max_budget = __data; - return ret; + return count; } /* @@ -4907,7 +4919,8 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data < 1) __data = 1; @@ -4918,7 +4931,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, if (bfqd->bfq_user_max_budget == 0) bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - return ret; + return count; } static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, @@ -4926,7 +4939,8 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data > 1) __data = 1; @@ -4936,7 +4950,7 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, bfqd->strict_guarantees = __data; - return ret; + return count; } static ssize_t bfq_low_latency_store(struct elevator_queue *e, @@ -4944,7 +4958,8 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data > 1) __data = 1; @@ -4952,7 +4967,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, bfq_end_wr(bfqd); bfqd->low_latency = __data; - return ret; + return count; } #define BFQ_ATTR(name) \ @@ -4998,6 +5013,7 @@ static struct elevator_type iosched_bfq_mq = { .elevator_name = "bfq", .elevator_owner = THIS_MODULE, }; +MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { @@ -5048,10 +5064,12 @@ static int __init bfq_init(void) ret = elv_register(&iosched_bfq_mq); if (ret) - goto err_pol_unreg; + goto slab_kill; return 0; +slab_kill: + bfq_slab_kill(); err_pol_unreg: #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_policy_unregister(&blkcg_policy_bfq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 859f0a8c97c8..cc4ea8574483 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -360,11 +360,11 @@ struct bfq_io_cq { uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif /* - * Snapshot of the idle window before merging; taken to - * remember this value while the queue is merged, so as to be - * able to restore it in case of split. + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + * be able to restore it in case of split. */ - bool saved_idle_window; + bool saved_has_short_ttime; /* * Same purpose as the previous two fields for the I/O bound * classification of a queue. @@ -638,7 +638,7 @@ enum bfqq_state_flags { * without idling the device */ BFQQF_fifo_expire, /* FIFO checked in this slice */ - BFQQF_idle_window, /* slice idling enabled */ + BFQQF_has_short_ttime, /* queue has a short think time */ BFQQF_sync, /* synchronous queue */ BFQQF_IO_bound, /* * bfqq has timed-out at least once @@ -667,7 +667,7 @@ BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); @@ -929,13 +929,16 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - bfqq_group(bfqq)->blkg_path, ##args); \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ +} while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9b1ea478577b..5df32907ff3b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -146,7 +146,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + bvec_gap_to_prev(bio->bi_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -190,7 +190,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -199,7 +199,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.disk_name = bio->bi_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -236,8 +236,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct request_queue *q = bio->bi_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -245,8 +245,9 @@ bool bio_integrity_prep(struct bio *bio) unsigned int intervals; blk_status_t status; - bi = bdev_get_integrity(bio->bi_bdev); - q = bdev_get_queue(bio->bi_bdev); + if (!bi) + return true; + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; @@ -257,9 +258,6 @@ bool bio_integrity_prep(struct bio *bio) if (bio_integrity(bio)) return true; - if (bi == NULL) - return true; - if (bio_data_dir(bio) == READ) { if (!bi->profile->verify_fn || !(bi->flags & BLK_INTEGRITY_VERIFY)) @@ -354,7 +352,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct bvec_iter iter = bio->bi_iter; /* @@ -387,7 +385,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -413,7 +411,7 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -430,7 +428,7 @@ EXPORT_SYMBOL(bio_integrity_advance); void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } diff --git a/block/bio.c b/block/bio.c index 9a63597aaacc..b38e962fa83e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -593,10 +593,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_bdev with a new target, + * most users will be overriding ->bi_disk with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; @@ -681,7 +681,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; @@ -936,6 +936,10 @@ static void submit_bio_wait_endio(struct bio *bio) * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. + * + * WARNING: Unlike to how submit_bio() is usually used, this function does not + * result in bio reference to be consumed. The caller must drop the reference + * on his own. */ int submit_bio_wait(struct bio *bio) { @@ -1732,29 +1736,29 @@ void bio_check_pages_dirty(struct bio *bio) } } -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part) +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part) { int cpu = part_stat_lock(); - part_round_stats(cpu, part); + part_round_stats(q, cpu, part); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(part, rw); + part_inc_in_flight(q, part, rw); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time) +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; int cpu = part_stat_lock(); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(q, cpu, part); + part_dec_in_flight(q, part, rw); part_stat_unlock(); } @@ -1826,8 +1830,8 @@ again: goto again; } - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, + if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_disk->queue, bio, blk_status_to_errno(bio->bi_status)); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -2085,7 +2089,7 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src) if (src->bi_css) WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } - +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0480892e97e5..d3f56baee936 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1067,7 +1067,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) { ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + goto unlock; } } @@ -1111,8 +1111,10 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); -free_blkcg: - kfree(blkcg); + + if (blkcg != &blkcg_root) + kfree(blkcg); +unlock: mutex_unlock(&blkcg_pol_mutex); return ret; } diff --git a/block/blk-core.c b/block/blk-core.c index dbecbf4a64e0..d709c0e3a2ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL(blk_start_queue_async); void blk_start_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); - WARN_ON(!irqs_disabled()); + WARN_ON(!in_interrupt() && !irqs_disabled()); WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); @@ -1469,15 +1469,10 @@ static void add_acct_request(struct request_queue *q, struct request *rq, __elv_add_request(q, rq, where); } -static void part_round_stats_single(int cpu, struct hd_struct *part, - unsigned long now) +static void part_round_stats_single(struct request_queue *q, int cpu, + struct hd_struct *part, unsigned long now, + unsigned int inflight) { - int inflight; - - if (now == part->stamp) - return; - - inflight = part_in_flight(part); if (inflight) { __part_stat_add(cpu, part, time_in_queue, inflight * (now - part->stamp)); @@ -1488,6 +1483,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, /** * part_round_stats() - Round off the performance stats on a struct disk_stats. + * @q: target block queue * @cpu: cpu number for stats access * @part: target partition * @@ -1502,13 +1498,31 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. */ -void part_round_stats(int cpu, struct hd_struct *part) +void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { + struct hd_struct *part2 = NULL; unsigned long now = jiffies; + unsigned int inflight[2]; + int stats = 0; + + if (part->stamp != now) + stats |= 1; + + if (part->partno) { + part2 = &part_to_disk(part)->part0; + if (part2->stamp != now) + stats |= 2; + } - if (part->partno) - part_round_stats_single(cpu, &part_to_disk(part)->part0, now); - part_round_stats_single(cpu, part, now); + if (!stats) + return; + + part_in_flight(q, part, inflight); + + if (stats & 2) + part_round_stats_single(q, cpu, part2, now, inflight[1]); + if (stats & 1) + part_round_stats_single(q, cpu, part, now, inflight[0]); } EXPORT_SYMBOL_GPL(part_round_stats); @@ -1896,40 +1910,15 @@ out_unlock: return BLK_QC_T_NONE; } -/* - * If bio->bi_dev is a partition, remap the location - */ -static inline void blk_partition_remap(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - - /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. - */ - if (bdev != bdev->bd_contains && - (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { - struct hd_struct *p = bdev->bd_part; - - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_bdev = bdev->bd_contains; - - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_iter.bi_sector - p->start_sect); - } -} - static void handle_bad_sector(struct bio *bio) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", - bdevname(bio->bi_bdev, b), - bio->bi_opf, + bio_devname(bio, b), bio->bi_opf, (unsigned long long)bio_end_sector(bio), - (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); + (long long)get_capacity(bio->bi_disk)); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1968,6 +1957,38 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* + * Remap block n of partition p to block n+start(p) of the disk. + */ +static inline int blk_partition_remap(struct bio *bio) +{ + struct hd_struct *p; + int ret = 0; + + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. + * Include a test for the reset op code and perform the remap if needed. + */ + if (!bio->bi_partno || + (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) + return 0; + + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); + } else { + printk("%s: fail for partition %d\n", __func__, bio->bi_partno); + ret = -EIO; + } + rcu_read_unlock(); + + return ret; +} + +/* * Check whether this bio extends beyond the end of the device. */ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) @@ -1978,7 +1999,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (maxsector) { sector_t sector = bio->bi_iter.bi_sector; @@ -2003,20 +2024,18 @@ generic_make_request_checks(struct bio *bio) int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; - struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - q = bdev_get_queue(bio->bi_bdev); + q = bio->bi_disk->queue; if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_iter.bi_sector); + bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); goto end_io; } @@ -2028,17 +2047,11 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_iter.bi_size) || - should_fail_request(&part_to_disk(part)->part0, - bio->bi_iter.bi_size)) + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. - */ - blk_partition_remap(bio); + if (blk_partition_remap(bio)) + goto end_io; if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2067,16 +2080,16 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_WRITE_SAME: - if (!bdev_write_same(bio->bi_bdev)) + if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: - if (!bdev_is_zoned(bio->bi_bdev)) + if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: - if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: @@ -2183,7 +2196,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct request_queue *q = bio->bi_disk->queue; if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; @@ -2201,7 +2214,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bdev_get_queue(bio->bi_bdev)) + if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -2244,7 +2257,7 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = bdev_logical_block_size(bio->bi_bdev) >> 9; + count = queue_logical_block_size(bio->bi_disk->queue); else count = bio_sectors(bio); @@ -2261,8 +2274,7 @@ blk_qc_t submit_bio(struct bio *bio) current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b), - count); + bio_devname(bio, b), count); } } @@ -2431,8 +2443,8 @@ void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rw); hd_struct_put(part); part_stat_unlock(); @@ -2489,8 +2501,8 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); + part_round_stats(rq->q, cpu, part); + part_inc_in_flight(rq->q, part, rw); rq->part = part; } @@ -2603,7 +2615,7 @@ struct request *blk_peek_request(struct request_queue *q) } EXPORT_SYMBOL(blk_peek_request); -void blk_dequeue_request(struct request *rq) +static void blk_dequeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -2630,9 +2642,6 @@ void blk_dequeue_request(struct request *rq) * Description: * Dequeue @req and start timeout timer on it. This hands off the * request to the driver. - * - * Block internal functions which don't want to start timer should - * call blk_dequeue_request(). */ void blk_start_request(struct request *req) { @@ -3035,8 +3044,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE diff --git a/block/blk-flush.c b/block/blk-flush.c index ed5fe322abba..4938bec8cfef 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,12 +1,12 @@ /* - * Functions to sequence FLUSH and FUA writes. + * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo <tj@kernel.org> * * This file is released under the GPLv2. * - * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * @@ -16,9 +16,9 @@ * REQ_FUA means that the data must be on non-volatile media on request * completion. * - * If the device doesn't have writeback cache, FLUSH and FUA don't make any - * difference. The requests are either completed immediately if there's no - * data or executed as normal requests otherwise. + * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no data + * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. @@ -31,7 +31,7 @@ * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next - * step. This allows arbitrary merging of different types of FLUSH/FUA + * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue @@ -47,19 +47,19 @@ * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of - * FUA (without FLUSH) requests. + * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * - * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * - * The above peculiarity requires that each FLUSH/FUA request has only one + * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ @@ -76,7 +76,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" -/* FLUSH/FUA sequences */ +/* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ @@ -148,7 +148,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) /** * blk_flush_complete_seq - complete flush sequence - * @rq: FLUSH/FUA request being sequenced + * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred @@ -406,7 +406,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) } /** - * blk_insert_flush - insert a new FLUSH/FUA request + * blk_insert_flush - insert a new PREFLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. @@ -525,7 +525,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); diff --git a/block/blk-lib.c b/block/blk-lib.c index 3fe0aec90597..e01adb5145b3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -77,7 +77,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; @@ -168,7 +168,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, while (nr_sects) { bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; @@ -241,7 +241,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, while (nr_sects) { bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -323,7 +323,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 99038830fb42..aa524cad5bea 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -633,8 +633,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part; - part_round_stats(cpu, part); - part_dec_in_flight(part, rq_data_dir(req)); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); @@ -786,7 +786,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) + if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4f927a58dff8..980e73095643 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -48,8 +48,6 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags, static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), - QUEUE_FLAG_NAME(SYNCFULL), - QUEUE_FLAG_NAME(ASYNCFULL), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), @@ -744,7 +742,7 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -const struct file_operations blk_mq_debugfs_fops = { +static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 000000000000..996167f1de18 --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/blk-mq.h> +#include <linux/blk-mq-rdma.h> +#include <rdma/ib_verbs.h> + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d0be72ccb091..6714507aa6c7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -214,7 +214,11 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) bitnr += tags->nr_reserved_tags; rq = tags->rqs[bitnr]; - if (rq->q == hctx->queue) + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assining ->rqs[]. + */ + if (rq && rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -248,9 +252,15 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!reserved) bitnr += tags->nr_reserved_tags; + + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assining ->rqs[]. + */ rq = tags->rqs[bitnr]; + if (rq) + iter_data->fn(rq, iter_data->data, reserved); - iter_data->fn(rq, iter_data->data, reserved); return true; } @@ -288,11 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) { int i, j, ret = 0; - if (!set->ops->reinit_request) + if (WARN_ON_ONCE(!reinit_request)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -305,8 +316,8 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) if (!tags->static_rqs[j]) continue; - ret = set->ops->reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = reinit_request(set->driver_data, + tags->static_rqs[j]); if (ret) goto out; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 4603b115e234..3f18cff80050 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -83,6 +83,41 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } +struct mq_inflight { + struct hd_struct *part; + unsigned int *inflight; +}; + +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && + !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * index[0] counts the specific partition that was asked + * for. index[1] counts the ones that are active on the + * whole device, so increment that if mi->part is indeed + * a partition, and not a whole device. + */ + if (rq->part == mi->part) + mi->inflight[0]++; + if (mi->part->partno) + mi->inflight[1]++; + } +} + +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = inflight[1] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -624,11 +659,10 @@ static void blk_mq_requeue_work(struct work_struct *work) container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; - unsigned long flags; - spin_lock_irqsave(&q->requeue_lock, flags); + spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); + spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { if (!(rq->rq_flags & RQF_SOFTBARRIER)) @@ -1102,9 +1136,19 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + /* + * We should be running this queue from one of the CPUs that + * are mapped to it. + */ WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && cpu_online(hctx->next_cpu)); + /* + * We can't run the queue inline with ints disabled. Ensure that + * we catch bad users of this early. + */ + WARN_ON_ONCE(in_interrupt()); + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); blk_mq_sched_dispatch_requests(hctx); @@ -1218,7 +1262,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and - * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use @@ -1235,7 +1279,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and - * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use diff --git a/block/blk-mq.h b/block/blk-mq.h index 60b01c0309bc..98252b79b80b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -133,4 +133,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index be1f115b538b..8559e9563c52 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) { + WARN_ON_ONCE(q->mq_ops); q->rq_timed_out_fn = fn; } EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 87b7df4851bf..07125e7941f4 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -60,7 +60,7 @@ static void trigger_softirq(void *data) static int raise_blk_irq(int cpu, struct request *rq) { if (cpu_online(cpu)) { - struct call_single_data *data = &rq->csd; + call_single_data_t *data = &rq->csd; data->func = trigger_softirq; data->info = rq; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 27aceab1cc31..b8362c0df51d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -931,7 +931,9 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + mutex_lock(&q->sysfs_lock); queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock); wbt_exit(q); diff --git a/block/blk-tag.c b/block/blk-tag.c index 2290f65b9d73..e1a9c15eb1b8 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -290,7 +290,6 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) */ clear_bit_unlock(tag, bqt->tag_map); } -EXPORT_SYMBOL(blk_queue_end_tag); /** * blk_queue_start_tag - find a free tag and assign it diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 80f5481fe9f6..0fea76aa0f3f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -373,10 +373,8 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ - char __pbuf[128]; \ - \ - blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg(__td->queue, \ + tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ @@ -2114,14 +2112,9 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - int ret; - - ret = bio_associate_current(bio); - if (ret == 0 || ret == -EBUSY) + if (bio->bi_css) bio->bi_cg_private = tg; blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); -#else - bio_associate_current(bio); #endif } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3bd15d8095b1..ff57fb51b338 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -116,7 +116,7 @@ int blkdev_report_zones(struct block_device *bdev, if (!bio) return -ENOMEM; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blk_zone_start(q, sector); bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); @@ -234,7 +234,7 @@ int blkdev_reset_zones(struct block_device *bdev, bio = bio_alloc(gfp_mask, 0); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); ret = submit_bio_wait(bio); diff --git a/block/blk.h b/block/blk.h index 3a3d715bd725..fcb9775b997d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -64,7 +64,6 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); void blk_queue_bypass_end(struct request_queue *q); -void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); @@ -204,6 +203,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/bsg.c b/block/bsg.c index 37663b664666..ee1335c68de7 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -932,15 +932,8 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } - /* - * block device ioctls - */ default: -#if 0 - return ioctl_by_bdev(bd->bdev, cmd, arg); -#else return -ENOTTY; -#endif } } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3d5c28945719..9b86e9b352e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -656,20 +656,17 @@ static inline void cfqg_put(struct cfq_group *cfqg) } #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ + "cfq%d%c%c " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - __pbuf, ##args); \ + ##args); \ } while (0) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, @@ -2937,7 +2934,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * for devices that support queuing, otherwise we still have a problem * with sync vs async workloads. */ - if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) + if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && + !cfqd->cfq_group_idle) return; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); @@ -4714,13 +4712,12 @@ cfq_var_show(unsigned int var, char *page) return sprintf(page, "%u\n", var); } -static ssize_t -cfq_var_store(unsigned int *var, const char *page, size_t count) +static void +cfq_var_store(unsigned int *var, const char *page) { char *p = (char *) page; *var = simple_strtoul(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -4766,7 +4763,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ - int ret = cfq_var_store(&__data, (page), count); \ + cfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4775,7 +4772,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, @@ -4800,13 +4797,13 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ - int ret = cfq_var_store(&__data, (page), count); \ + cfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ + return count; \ } USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 38554c2ea38a..abaf9d78a206 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -79,7 +79,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - unsigned long *__user p; + unsigned long __user *p; int error; p = compat_alloc_user_space(sizeof(unsigned long)); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index c68f6bbc0dcd..b83f77460d28 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -373,13 +373,12 @@ deadline_var_show(int var, char *page) return sprintf(page, "%d\n", var); } -static ssize_t -deadline_var_store(int *var, const char *page, size_t count) +static void +deadline_var_store(int *var, const char *page) { char *p = (char *) page; *var = simple_strtol(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -403,7 +402,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ - int ret = deadline_var_store(&__data, (page), count); \ + deadline_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -412,7 +411,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = msecs_to_jiffies(__data); \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); diff --git a/block/elevator.c b/block/elevator.c index 4bb2f0c93fa6..153926a90901 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1055,6 +1055,10 @@ static int __elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; + /* Make sure queue is not in the middle of being removed */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return -ENOENT; + /* * Special case for mq, turn off scheduling */ diff --git a/block/genhd.c b/block/genhd.c index 7f520fa25d16..dd305c65ffb0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -45,6 +45,52 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_inc(&part->in_flight[rw]); + if (part->partno) + atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_dec(&part->in_flight[rw]); + if (part->partno) + atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + if (q->mq_ops) { + blk_mq_in_flight(q, part, inflight); + return; + } + + inflight[0] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = &part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + } +} + +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); + + if (unlikely(partno < 0 || partno >= ptbl->len)) + return NULL; + return rcu_dereference(ptbl->part[partno]); +} + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -61,21 +107,12 @@ static void disk_release_events(struct gendisk *disk); */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { - struct hd_struct *part = NULL; - struct disk_part_tbl *ptbl; - - if (unlikely(partno < 0)) - return NULL; + struct hd_struct *part; rcu_read_lock(); - - ptbl = rcu_dereference(disk->part_tbl); - if (likely(partno < ptbl->len)) { - part = rcu_dereference(ptbl->part[partno]); - if (part) - get_device(part_to_dev(part)); - } - + part = __disk_get_part(disk, partno); + if (part) + get_device(part_to_dev(part)); rcu_read_unlock(); return part; @@ -242,6 +279,7 @@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu); * Can be deleted altogether. Later. * */ +#define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; @@ -259,12 +297,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - if (offset < BLKDEV_MAJOR_HASH_SIZE) { - mutex_lock(&block_class_lock); - for (dp = major_names[offset]; dp; dp = dp->next) + mutex_lock(&block_class_lock); + for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) + if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); - } + mutex_unlock(&block_class_lock); } #endif /* CONFIG_PROC_FS */ @@ -309,6 +346,14 @@ int register_blkdev(unsigned int major, const char *name) ret = major; } + if (major >= BLKDEV_MAJOR_MAX) { + pr_err("register_blkdev: major requested (%d) is greater than the maximum (%d) for %s\n", + major, BLKDEV_MAJOR_MAX, name); + + ret = -EINVAL; + goto out; + } + p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; @@ -1090,12 +1135,13 @@ static const struct attribute_group *disk_attr_groups[] = { * original ptbl is freed using RCU callback. * * LOCKING: - * Matching bd_mutx locked. + * Matching bd_mutex locked or the caller is the only user of @disk. */ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { - struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(disk->part_tbl, new_ptbl); @@ -1114,14 +1160,16 @@ static void disk_replace_part_tbl(struct gendisk *disk, * uses RCU to allow unlocked dereferencing for stats and other stuff. * * LOCKING: - * Matching bd_mutex locked, might sleep. + * Matching bd_mutex locked or the caller is the only user of @disk. + * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ int disk_expand_part_tbl(struct gendisk *disk, int partno) { - struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; int i, target; @@ -1204,6 +1252,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; + unsigned int inflight[2]; int cpu; /* @@ -1217,8 +1266,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); - part_round_stats(cpu, hd); + part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s %lu %lu %lu " "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), @@ -1231,7 +1281,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[WRITE]), part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), - part_in_flight(hd), + inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); @@ -1313,6 +1363,14 @@ EXPORT_SYMBOL(alloc_disk); struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; + struct disk_part_tbl *ptbl; + + if (minors > DISK_MAX_PARTS) { + printk(KERN_ERR + "block: can't allocated more than %d partitions\n", + DISK_MAX_PARTS); + minors = DISK_MAX_PARTS; + } disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { @@ -1326,7 +1384,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) kfree(disk); return NULL; } - disk->part_tbl->part[0] = &disk->part0; + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); /* * set_capacity() and get_capacity() currently don't use diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 1b964a387afe..a1cad4331edd 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -457,13 +457,12 @@ deadline_var_show(int var, char *page) return sprintf(page, "%d\n", var); } -static ssize_t -deadline_var_store(int *var, const char *page, size_t count) +static void +deadline_var_store(int *var, const char *page) { char *p = (char *) page; *var = simple_strtol(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -487,7 +486,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ - int ret = deadline_var_store(&__data, (page), count); \ + deadline_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -496,7 +495,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = msecs_to_jiffies(__data); \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); @@ -660,6 +659,7 @@ static struct elevator_type mq_deadline = { .elevator_name = "mq-deadline", .elevator_owner = THIS_MODULE, }; +MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { diff --git a/block/partition-generic.c b/block/partition-generic.c index c5ec8246e25e..86e8fe1adcdb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,11 +112,14 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = dev_to_disk(dev)->queue; + unsigned int inflight[2]; int cpu; cpu = part_stat_lock(); - part_round_stats(cpu, p); + part_round_stats(q, cpu, p); part_stat_unlock(); + part_in_flight(q, p, inflight); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -130,7 +133,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), + inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } @@ -249,15 +252,20 @@ void __delete_partition(struct percpu_ref *ref) call_rcu(&part->rcu_head, delete_partition_rcu_cb); } +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ void delete_partition(struct gendisk *disk, int partno) { - struct disk_part_tbl *ptbl = disk->part_tbl; + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); struct hd_struct *part; if (partno >= ptbl->len) return; - part = ptbl->part[partno]; + part = rcu_dereference_protected(ptbl->part[partno], 1); if (!part) return; @@ -277,6 +285,10 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ struct hd_struct *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) @@ -292,7 +304,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = disk_expand_part_tbl(disk, partno); if (err) return ERR_PTR(err); - ptbl = disk->part_tbl; + ptbl = rcu_dereference_protected(disk->part_tbl, 1); if (ptbl->part[partno]) return ERR_PTR(-EBUSY); @@ -391,7 +403,6 @@ out_del: device_del(pdev); out_put: put_device(pdev); - blk_free_devt(devt); return ERR_PTR(err); } |