diff options
Diffstat (limited to 'block')
35 files changed, 1156 insertions, 840 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 615516146086..27f11320b8d1 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -30,6 +30,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + default y select BLK_CGROUP_RWSTAT help diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0fbde0fc0628..89ffb3aa992c 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -513,12 +513,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_bfqgd(cpd)); } -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; - bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); + bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; @@ -551,7 +551,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) bfqg->bfqd = bfqd; bfqg->active_entities = 0; bfqg->num_queues_with_pending_reqs = 0; - bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -614,7 +613,7 @@ struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) continue; } bfqg = blkg_to_bfqg(blkg); - if (bfqg->online) { + if (bfqg->pd.online) { bio_associate_blkg_from_css(bio, &blkg->blkcg->css); return bfqg; } @@ -706,12 +705,52 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_activate_bfqq(bfqd, bfqq); } - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } +static void bfq_sync_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *sync_bfqq, + struct bfq_io_cq *bic, + struct bfq_group *bfqg, + unsigned int act_idx) +{ + struct bfq_queue *bfqq; + + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + return; + } + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is not valid + * anymore. We cannot easily just cancel the merge (by + * clearing new_bfqq) as there may be other processes + * using this queue and holding refs to all queues + * below sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from bfqq now + * so that we cannot merge bio to a request from the + * old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); + } +} + /** * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. @@ -726,53 +765,20 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_group *bfqg) { - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); - struct bfq_entity *entity; + unsigned int act_idx; - if (async_bfqq) { - entity = &async_bfqq->entity; + for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, false); + if (async_bfqq && + async_bfqq->entity.sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } - } - if (sync_bfqq) { - if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { - /* We are the only user of this bfqq, just move it */ - if (sync_bfqq->entity.sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } else { - struct bfq_queue *bfqq; - - /* - * The queue was merged to a different queue. Check - * that the merge chain still belongs to the same - * cgroup. - */ - for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) - if (bfqq->entity.sched_data != - &bfqg->sched_data) - break; - if (bfqq) { - /* - * Some queue changed cgroup so the merge is - * not valid anymore. We cannot easily just - * cancel the merge (by clearing new_bfqq) as - * there may be other processes using this - * queue and holding refs to all queues below - * sync_bfqq->new_bfqq. Similarly if the merge - * already happened, we need to detach from - * bfqq now so that we cannot merge bio to a - * request from the old cgroup. - */ - bfq_put_cooperator(sync_bfqq); - bic_set_bfqq(bic, NULL, true); - bfq_release_process_ref(bfqd, sync_bfqq); - } - } + if (sync_bfqq) + bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } } @@ -978,7 +984,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) put_async_queues: bfq_put_async_queues(bfqd, bfqg); - bfqg->online = false; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -1284,7 +1289,7 @@ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 380e9bda2e57..8a8d4441519c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -377,20 +377,23 @@ static const unsigned long bfq_late_stable_merging = 600; #define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - return bic->bfqq[is_sync]; + if (is_sync) + return bic->bfqq[1][actuator_idx]; + + return bic->bfqq[0][actuator_idx]; } static void bfq_put_stable_ref(struct bfq_queue *bfqq); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) +void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, + bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; - - /* Clear bic pointer if bfqq is detached from this bic */ - if (old_bfqq && old_bfqq->bic == bic) - old_bfqq->bic = NULL; + struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; /* * If bfqq != NULL, then a non-stable queue merge between @@ -405,9 +408,18 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * we cancel the stable merge if * bic->stable_merge_bfqq == bfqq. */ - bic->bfqq[is_sync] = bfqq; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; + + /* Clear bic pointer if bfqq is detached from this bic */ + if (old_bfqq && old_bfqq->bic == bic) + old_bfqq->bic = NULL; - if (bfqq && bic->stable_merge_bfqq == bfqq) { + if (is_sync) + bic->bfqq[1][actuator_idx] = bfqq; + else + bic->bfqq[0][actuator_idx] = bfqq; + + if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { /* * Actually, these same instructions are executed also * in bfq_setup_cooperator, in case of abort or actual @@ -416,9 +428,9 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) * did so, we would nest even more complexity in this * function. */ - bfq_put_stable_ref(bic->stable_merge_bfqq); + bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); - bic->stable_merge_bfqq = NULL; + bfqq_data->stable_merge_bfqq = NULL; } } @@ -678,9 +690,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(opf)) : NULL; int depth; unsigned limit = data->q->nr_requests; + unsigned int act_idx; /* Sync reads have full depth available */ if (op_is_sync(opf) && !op_is_write(opf)) { @@ -690,14 +702,21 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) limit = (limit * depth) >> bfqd->full_depth_shift; } - /* - * Does queue (or any parent entity) exceed number of requests that - * should be available to it? Heavily limit depth so that it cannot - * consume more available requests and thus starve other entities. - */ - if (bfqq && bfqq_request_over_limit(bfqq, limit)) - depth = 1; + for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *bfqq = + bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* + * Does queue (or any parent entity) exceed number of + * requests that should be available to it? Heavily + * limit depth so that it cannot consume more + * available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + depth = 1; + break; + } + } bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); if (depth) @@ -1074,9 +1093,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); @@ -1118,36 +1134,39 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, { unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - if (bic->saved_has_short_ttime) + if (bfqq_data->saved_has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); - if (bic->saved_IO_bound) + if (bfqq_data->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; - bfqq->inject_limit = bic->saved_inject_limit; - bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; + bfqq->inject_limit = bfqq_data->saved_inject_limit; + bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; - bfqq->entity.new_weight = bic->saved_weight; - bfqq->ttime = bic->saved_ttime; - bfqq->io_start_time = bic->saved_io_start_time; - bfqq->tot_idle_time = bic->saved_tot_idle_time; + bfqq->entity.new_weight = bfqq_data->saved_weight; + bfqq->ttime = bfqq_data->saved_ttime; + bfqq->io_start_time = bfqq_data->saved_io_start_time; + bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; /* * Restore weight coefficient only if low_latency is on */ if (bfqd->low_latency) { old_wr_coeff = bfqq->wr_coeff; - bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_coeff = bfqq_data->saved_wr_coeff; } - bfqq->service_from_wr = bic->saved_service_from_wr; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + bfqq->service_from_wr = bfqq_data->saved_service_from_wr; + bfqq->wr_start_at_switch_to_srt = + bfqq_data->saved_wr_start_at_switch_to_srt; + bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + @@ -1766,6 +1785,33 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +/* + * Get the index of the actuator that will serve bio. + */ +static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) +{ + unsigned int i; + sector_t end; + + /* no search needed if one or zero ranges present */ + if (bfqd->num_actuators == 1) + return 0; + + /* bio_end_sector(bio) gives the sector after the last one */ + end = bio_end_sector(bio) - 1; + + for (i = 0; i < bfqd->num_actuators; i++) { + if (end >= bfqd->sector[i] && + end < bfqd->sector[i] + bfqd->nr_sectors[i]) + return i; + } + + WARN_ONCE(true, + "bfq_actuator_index: bio sector out of ranges: end=%llu\n", + end); + return 0; +} + static bool bfq_better_to_idle(struct bfq_queue *bfqq); static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, @@ -1785,7 +1831,9 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, arrived_in_time = ktime_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; - + unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); + bool bfqq_non_merged_or_stably_merged = + bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; /* * bfqq deserves to be weight-raised if: @@ -1819,9 +1867,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - (bfqq->bic || RQ_BIC(rq)->stably_merged) && - (*interactive || soft_rt))); + (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq @@ -2098,7 +2145,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, * We reset waker detection logic also if too much time has passed * since the first detection. If wakeups are rare, pointless idling * doesn't hurt throughput that much. The condition below makes sure - * we do not uselessly idle blocking waker in more than 1/64 cases. + * we do not uselessly idle blocking waker in more than 1/64 cases. */ if (bfqd->last_completed_rq_bfqq != bfqq->tentative_waker_bfqq || @@ -2209,9 +2256,9 @@ static void bfq_add_request(struct request *rq) * elapsed. */ if (bfqq == bfqd->in_service_queue && - (bfqd->rq_in_driver == 0 || + (bfqd->tot_rq_in_driver == 0 || (bfqq->last_serv_time_ns > 0 && - bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && + bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); @@ -2235,7 +2282,7 @@ static void bfq_add_request(struct request *rq) * will be set in case injection is performed * on bfqq before rq is completed). */ - if (bfqd->rq_in_driver == 0) + if (bfqd->tot_rq_in_driver == 0) bfqd->rqs_injected = false; } } @@ -2418,7 +2465,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, */ bfq_bic_update_cgroup(bic, bio); - bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), + bfq_actuator_index(bfqd, bio)); } else { bfqd->bio_bfqq = NULL; } @@ -2584,24 +2632,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + if (bfqg->async_bfqq[i][j][k]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); + if (bfqg->async_idle_bfqq[k]) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); + } } static void bfq_end_wr(struct bfq_data *bfqd) { struct bfq_queue *bfqq; + int i; spin_lock_irq(&bfqd->lock); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + bfq_bfqq_end_wr(bfqq); + } list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); @@ -2794,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); +static struct bfq_queue * +bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_queue *stable_merge_bfqq, + struct bfq_iocq_bfqq_data *bfqq_data) +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + struct bfq_queue *new_bfqq; + + if (idling_boosts_thr_without_issues(bfqd, bfqq) || + proc_ref == 0) + return NULL; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); + + if (new_bfqq) { + bfqq_data->stably_merged = true; + if (new_bfqq->bic) { + unsigned int new_a_idx = new_bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *new_bfqq_data = + &new_bfqq->bic->bfqq_data[new_a_idx]; + + new_bfqq_data->stably_merged = true; + } + } + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return @@ -2819,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ if (bfqq->new_bfqq) @@ -2840,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * stable merging) also if bic is associated with a * sync queue, but this bfqq is async */ - if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && + if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(bfq_late_stable_merging)) && time_is_before_jiffies(bfqq->creation_time + msecs_to_jiffies(bfq_late_stable_merging))) { struct bfq_queue *stable_merge_bfqq = - bic->stable_merge_bfqq; - int proc_ref = min(bfqq_process_refs(bfqq), - bfqq_process_refs(stable_merge_bfqq)); + bfqq_data->stable_merge_bfqq; /* deschedule stable merge, because done or aborted here */ bfq_put_stable_ref(stable_merge_bfqq); - bic->stable_merge_bfqq = NULL; - - if (!idling_boosts_thr_without_issues(bfqd, bfqq) && - proc_ref > 0) { - /* next function will take at least one ref */ - struct bfq_queue *new_bfqq = - bfq_setup_merge(bfqq, stable_merge_bfqq); - - if (new_bfqq) { - bic->stably_merged = true; - if (new_bfqq->bic) - new_bfqq->bic->stably_merged = - true; - } - return new_bfqq; - } else - return NULL; + bfqq_data->stable_merge_bfqq = NULL; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, + bfqq_data); } } @@ -2965,6 +3035,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* * If !bfqq->bic, the queue is already shared or its requests @@ -2974,18 +3046,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; - bic->saved_inject_limit = bfqq->inject_limit; - bic->saved_decrease_time_jif = bfqq->decrease_time_jif; - - bic->saved_weight = bfqq->entity.orig_weight; - bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_io_start_time = bfqq->io_start_time; - bic->saved_tot_idle_time = bfqq->tot_idle_time; - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bfqq_data->saved_inject_limit = bfqq->inject_limit; + bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; + + bfqq_data->saved_weight = bfqq->entity.orig_weight; + bfqq_data->saved_ttime = bfqq->ttime; + bfqq_data->saved_has_short_ttime = + bfq_bfqq_has_short_ttime(bfqq); + bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq_data->saved_io_start_time = bfqq->io_start_time; + bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; + bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq_data->was_in_burst_list = + !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && !bfq_bfqq_in_large_burst(bfqq) && bfqq->bfqd->low_latency)) { @@ -2998,17 +3073,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) * to bfqq, so that to avoid that bfqq unjustly fails * to enjoy weight raising if split soon. */ - bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); - bic->saved_last_wr_start_finish = jiffies; + bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = + bfq_smallest_from_now(); + bfqq_data->saved_wr_cur_max_time = + bfq_wr_duration(bfqq->bfqd); + bfqq_data->saved_last_wr_start_finish = jiffies; } else { - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = + bfqq_data->saved_wr_coeff = bfqq->wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_service_from_wr = bfqq->service_from_wr; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + bfqq_data->saved_service_from_wr = + bfqq->service_from_wr; + bfqq_data->saved_last_wr_start_finish = + bfqq->last_wr_start_finish; + bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } } @@ -3114,7 +3193,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, /* * Merge queues (that is, let bic redirect its requests to new_bfqq) */ - bic_set_bfqq(bic, new_bfqq, true); + bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); bfq_mark_bfqq_coop(new_bfqq); /* * new_bfqq now belongs to at least two bics (it is a shared queue): @@ -3532,13 +3611,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) * - start a new observation interval with this dispatch */ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) + bfqd->tot_rq_in_driver == 0) goto update_rate_and_reset; /* Update sampling information */ bfqd->peak_rate_samples++; - if ((bfqd->rq_in_driver > 0 || + if ((bfqd->tot_rq_in_driver > 0 || now_ns - bfqd->last_completion < BFQ_MIN_TT) && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) bfqd->sequential_samples++; @@ -3803,10 +3882,8 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, return false; return (bfqq->wr_coeff > 1 && - (bfqd->wr_busy_queues < - tot_busy_queues || - bfqd->rq_in_driver >= - bfqq->dispatched + 4)) || + (bfqd->wr_busy_queues < tot_busy_queues || + bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || bfq_asymmetric_scenario(bfqd, bfqq) || tot_busy_queues == 1; } @@ -4072,8 +4149,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * function to evaluate the I/O speed of a process. */ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) + bool compensate, unsigned long *delta_ms) { ktime_t delta_ktime; u32 delta_usecs; @@ -4269,7 +4345,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, /* * Check whether the process is slow (see bfq_bfqq_is_slow). */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); /* * As above explained, charge slow (typically seeky) and @@ -4577,6 +4653,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) { struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; unsigned int limit = in_serv_bfqq->inject_limit; + int i; + /* * If * - bfqq is not weight-raised and therefore does not carry @@ -4608,7 +4686,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ) limit = 1; - if (bfqd->rq_in_driver >= limit) + if (bfqd->tot_rq_in_driver >= limit) return NULL; /* @@ -4623,11 +4701,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * (and re-added only if it gets new requests, but then it * is assigned again enough budget for its new backlog). */ - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - if (!RB_EMPTY_ROOT(&bfqq->sort_list) && - (in_serv_always_inject || bfqq->wr_coeff > 1) && - bfq_serv_to_charge(bfqq->next_rq, bfqq) <= - bfq_bfqq_budget_left(bfqq)) { + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + (in_serv_always_inject || bfqq->wr_coeff > 1) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { /* * Allow for only one large in-flight request * on non-rotational devices, for the @@ -4647,27 +4726,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) */ if (blk_queue_nonrot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= - BFQQ_SECT_THR_NONROT) - limit = min_t(unsigned int, 1, limit); - else - limit = in_serv_bfqq->inject_limit; - - if (bfqd->rq_in_driver < limit) { + BFQQ_SECT_THR_NONROT && + bfqd->tot_rq_in_driver >= 1) + continue; + else { bfqd->rqs_injected = true; return bfqq; } } + } return NULL; } +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ + struct bfq_queue *bfqq; + + if (bfqd->in_service_queue && + bfqd->in_service_queue->actuator_idx == idx) + return bfqd->in_service_queue; + + list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { + return bfqq; + } + } + + return NULL; +} + +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the + * actuator is below the threshold (see comments on + * actuator_load_threshold for details) and lower than that of the + * next actuator (comments on this extra condition below), and there + * is a queue that contains I/O for that actuator. On success, return + * that queue. + * + * Performing a plain linear scan entails a prioritization among + * actuators. The extra condition above breaks this prioritization and + * tends to distribute injection uniformly across actuators. + */ +static struct bfq_queue * +bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) +{ + int i; + + for (i = 0 ; i < bfqd->num_actuators; i++) { + if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && + (i == bfqd->num_actuators - 1 || + bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { + struct bfq_queue *bfqq = + bfq_find_active_bfqq_for_actuator(bfqd, i); + + if (bfqq) + return bfqq; + } + } + + return NULL; +} + + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. */ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq, *inject_bfqq; struct request *next_rq; enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; @@ -4690,6 +4822,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) check_queue: /* + * If some actuator is underutilized, but the in-service + * queue does not contain I/O for that actuator, then try to + * inject I/O for that actuator. + */ + inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); + if (inject_bfqq && inject_bfqq != bfqq) + return inject_bfqq; + + /* * This loop is rarely executed more than once. Even when it * happens, it is much more convenient to re-execute this loop * than to return NULL and trigger a new dispatch to get a @@ -4748,11 +4889,8 @@ check_queue: */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - struct bfq_queue *async_bfqq = - bfqq->bic && bfqq->bic->bfqq[0] && - bfq_bfqq_busy(bfqq->bic->bfqq[0]) && - bfqq->bic->bfqq[0]->next_rq ? - bfqq->bic->bfqq[0] : NULL; + unsigned int act_idx = bfqq->actuator_idx; + struct bfq_queue *async_bfqq = NULL; struct bfq_queue *blocked_bfqq = !hlist_empty(&bfqq->woken_list) ? container_of(bfqq->woken_list.first, @@ -4760,6 +4898,10 @@ check_queue: woken_list_node) : NULL; + if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && + bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && + bfqq->bic->bfqq[0][act_idx]->next_rq) + async_bfqq = bfqq->bic->bfqq[0][act_idx]; /* * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to @@ -4844,7 +4986,7 @@ check_queue: icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) - bfqq = bfqq->bic->bfqq[0]; + bfqq = async_bfqq; else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && bfqq->waker_bfqq->next_rq && @@ -4975,7 +5117,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); if (bfqq != bfqd->in_service_queue) - goto return_rq; + return rq; /* * If weight raising has to terminate for bfqq, then next @@ -4995,12 +5137,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) - goto return_rq; - - bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -return_rq: return rq; } @@ -5043,11 +5182,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * We exploit the bfq_finish_requeue_request hook to - * decrement rq_in_driver, but + * decrement tot_rq_in_driver, but * bfq_finish_requeue_request will not be invoked on * this request. So, to avoid unbalance, just start - * this request, without incrementing rq_in_driver. As - * a negative consequence, rq_in_driver is deceptively + * this request, without incrementing tot_rq_in_driver. As + * a negative consequence, tot_rq_in_driver is deceptively * lower than it should be while this request is in * service. This may cause bfq_schedule_dispatch to be * invoked uselessly. @@ -5056,7 +5195,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_finish_requeue_request hook, if defined, is * probably invoked also on this request. So, by * exploiting this hook, we could 1) increment - * rq_in_driver here, and 2) decrement it in + * tot_rq_in_driver here, and 2) decrement it in * bfq_finish_requeue_request. Such a solution would * let the value of the counter be always accurate, * but it would entail using an extra interface @@ -5085,7 +5224,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * Of course, serving one request at a time may cause loss of * throughput. */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) goto exit; bfqq = bfq_select_queue(bfqd); @@ -5096,7 +5235,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) if (rq) { inc_in_driver_start_rq: - bfqd->rq_in_driver++; + bfqd->rq_in_driver[bfqq->actuator_idx]++; + bfqd->tot_rq_in_driver++; start_rq: rq->rq_flags |= RQF_STARTED; } @@ -5283,8 +5423,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) */ __bfqq = bfqq->new_bfqq; while (__bfqq) { - if (__bfqq == bfqq) - break; next = __bfqq->new_bfqq; bfq_put_queue(__bfqq); __bfqq = next; @@ -5305,48 +5443,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_release_process_ref(bfqd, bfqq); } -static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); struct bfq_data *bfqd; if (bfqq) bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ if (bfqq && bfqd) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - bic_set_bfqq(bic, NULL, is_sync); + bic_set_bfqq(bic, NULL, is_sync, actuator_idx); bfq_exit_bfqq(bfqd, bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); } } static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + unsigned long flags; + unsigned int act_idx; + /* + * If bfqd and thus bfqd->num_actuators is not available any + * longer, then cycle over all possible per-actuator bfqqs in + * next loop. We rely on bic being zeroed on creation, and + * therefore on its unused per-actuator fields being NULL. + */ + unsigned int num_actuators = BFQ_MAX_ACTUATORS; + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - if (bic->stable_merge_bfqq) { - struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + /* + * bfqd is NULL if scheduler already exited, and in that case + * this is the last time these queues are accessed. + */ + if (bfqd) { + spin_lock_irqsave(&bfqd->lock, flags); + num_actuators = bfqd->num_actuators; + } - /* - * bfqd is NULL if scheduler already exited, and in - * that case this is the last time bfqq is accessed. - */ - if (bfqd) { - unsigned long flags; + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - spin_lock_irqsave(&bfqd->lock, flags); - bfq_put_stable_ref(bic->stable_merge_bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - bfq_put_stable_ref(bic->stable_merge_bfqq); - } + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); } - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); + if (bfqd) + spin_unlock_irqrestore(&bfqd->lock, flags); } /* @@ -5423,25 +5568,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bic->ioprio = ioprio; - bfqq = bic_to_bfqq(bic, false); + bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); if (bfqq) { struct bfq_queue *old_bfqq = bfqq; bfqq = bfq_get_queue(bfqd, bio, false, bic, true); - bic_set_bfqq(bic, bfqq, false); + bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); bfq_release_process_ref(bfqd, old_bfqq); } - bfqq = bic_to_bfqq(bic, true); + bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) + struct bfq_io_cq *bic, pid_t pid, int is_sync, + unsigned int act_idx) { u64 now_ns = ktime_get_ns(); + bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5501,22 +5648,24 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* first request is almost certainly seeky */ bfqq->seek_history = 1; + + bfqq->decrease_time_jif = jiffies; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, struct bfq_group *bfqg, - int ioprio_class, int ioprio) + int ioprio_class, int ioprio, int act_idx) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio][act_idx]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio][act_idx]; case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; + return &bfqg->async_idle_bfqq[act_idx]; default: return NULL; } @@ -5527,6 +5676,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, struct bfq_queue *last_bfqq_created) { + unsigned int a_idx = last_bfqq_created->actuator_idx; struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, last_bfqq_created); @@ -5534,8 +5684,8 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, return bfqq; if (new_bfqq->bic) - new_bfqq->bic->stably_merged = true; - bic->stably_merged = true; + new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; + bic->bfqq_data[a_idx].stably_merged = true; /* * Reusing merge functions. This implies that @@ -5610,9 +5760,13 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, * it has been set already, but too long ago, then move it * forward to bfqq. Finally, move also if bfqq belongs to a * different group than last_bfqq_created, or if bfqq has a - * different ioprio or ioprio_class. If none of these - * conditions holds true, then try an early stable merge or - * schedule a delayed stable merge. + * different ioprio, ioprio_class or actuator_idx. If none of + * these conditions holds true, then try an early stable merge + * or schedule a delayed stable merge. As for the condition on + * actuator_idx, the reason is that, if queues associated with + * different actuators are merged, then control is lost on + * each actuator. Therefore some actuator may be + * underutilized, and throughput may decrease. * * A delayed merge is scheduled (instead of performing an * early merge), in case bfqq might soon prove to be more @@ -5630,7 +5784,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, bfqq->creation_time) || bfqq->entity.parent != last_bfqq_created->entity.parent || bfqq->ioprio != last_bfqq_created->ioprio || - bfqq->ioprio_class != last_bfqq_created->ioprio_class) + bfqq->ioprio_class != last_bfqq_created->ioprio_class || + bfqq->actuator_idx != last_bfqq_created->actuator_idx) *source_bfqq = bfqq; else if (time_after_eq(last_bfqq_created->creation_time + bfqd->bfq_burst_interval, @@ -5660,7 +5815,8 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, /* * Record the bfqq to merge to. */ - bic->stable_merge_bfqq = last_bfqq_created; + bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = + last_bfqq_created; } } @@ -5682,7 +5838,8 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); + ioprio, + bfq_actuator_index(bfqd, bio)); bfqq = *async_bfqq; if (bfqq) goto out; @@ -5694,7 +5851,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); + is_sync, bfq_actuator_index(bfqd, bio)); bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { @@ -6009,7 +6166,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * then complete the merge and redirect it to * new_bfqq. */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + if (bic_to_bfqq(RQ_BIC(rq), true, + bfq_actuator_index(bfqd, rq->bio)) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); @@ -6147,7 +6305,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) struct bfq_queue *bfqq = bfqd->in_service_queue; bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->tot_rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -6158,7 +6316,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) * sum is not exact, as it's not taking into account deactivated * requests. */ - if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; /* @@ -6169,7 +6327,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < BFQ_HW_QUEUE_THRESHOLD && - bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) return; if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) @@ -6190,7 +6348,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_update_hw_tag(bfqd); - bfqd->rq_in_driver--; + bfqd->rq_in_driver[bfqq->actuator_idx]--; + bfqd->tot_rq_in_driver--; bfqq->dispatched--; if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { @@ -6310,7 +6469,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_NO_MORE_REQUESTS); } - if (!bfqd->rq_in_driver) + if (!bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); } @@ -6441,13 +6600,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * conditions to do it, or we can lower the last base value * computed. * - * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O * request in flight, because this function is in the code * path that handles the completion of a request of bfqq, and, * in particular, this function is executed before - * bfqd->rq_in_driver is decremented in such a code path. + * bfqd->tot_rq_in_driver is decremented in such a code path. */ - if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || + if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { if (bfqq->last_serv_time_ns == 0) { /* @@ -6457,7 +6616,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } bfqq->last_serv_time_ns = tot_time_ns; - } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) /* * No I/O injected and no request still in service in * the drive: these are the exact conditions for @@ -6564,7 +6723,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return bfqq; } - bic_set_bfqq(bic, NULL, true); + bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); @@ -6578,7 +6737,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bool split, bool is_sync, bool *new_queue) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + unsigned int act_idx = bfq_actuator_index(bfqd, bio); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) return bfqq; @@ -6590,14 +6751,14 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bfq_put_queue(bfqq); bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - bic_set_bfqq(bic, bfqq, is_sync); + bic_set_bfqq(bic, bfqq, is_sync, act_idx); if (split && is_sync) { - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || + bfqq_data->saved_in_large_burst) bfq_mark_bfqq_in_large_burst(bfqq); else { bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) + if (bfqq_data->was_in_burst_list) /* * If bfqq was in the current * burst list before being @@ -6686,19 +6847,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_queue *bfqq; bool new_queue = false; bool bfqq_already_existing = false, split = false; + unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) return NULL; /* - * Assuming that elv.priv[1] is set only if everything is set + * Assuming that RQ_BFQQ(rq) is set only if everything is set * for this rq. This holds true, because this function is * invoked only for insertion or merging, and, after such * events, a request cannot be manipulated any longer before * being removed from bfq. */ - if (rq->elv.priv[1]) - return rq->elv.priv[1]; + if (RQ_BFQQ(rq)) + return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); @@ -6712,12 +6874,13 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && - !bic->stably_merged) { + !bic->bfqq_data[a_idx].stably_merged) { struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; + bic->bfqq_data[a_idx].saved_in_large_burst = + true; bfqq = bfq_split_bfqq(bic, bfqq); split = true; @@ -6900,13 +7063,15 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, */ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_NR_LEVELS; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); + } } /* @@ -6983,7 +7148,7 @@ static void bfq_exit_queue(struct elevator_queue *e) bfqg_and_blkg_put(bfqd->root_group); #ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); + blkcg_deactivate_policy(bfqd->queue->disk, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); bfq_put_async_queues(bfqd, bfqd->root_group); @@ -6993,7 +7158,7 @@ static void bfq_exit_queue(struct elevator_queue *e) blk_stat_disable_accounting(bfqd->queue); clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); - wbt_enable_default(bfqd->queue); + wbt_enable_default(bfqd->queue->disk); kfree(bfqd); } @@ -7018,6 +7183,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; + unsigned int i; + struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; eq = elevator_alloc(q, e); if (!eq) @@ -7038,8 +7205,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. + * Set zero as actuator index: we will pretend that + * all I/O requests are for the same actuator. */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; @@ -7058,6 +7227,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; + bfqd->num_actuators = 1; + /* + * If the disk supports multiple actuators, copy independent + * access ranges from the request queue structure. + */ + spin_lock_irq(&q->queue_lock); + if (ia_ranges) { + /* + * Check if the disk ia_ranges size exceeds the current bfq + * actuator limit. + */ + if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { + pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", + ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); + pr_crit("Falling back to single actuator mode.\n"); + } else { + bfqd->num_actuators = ia_ranges->nr_ia_ranges; + + for (i = 0; i < bfqd->num_actuators; i++) { + bfqd->sector[i] = ia_ranges->ia_range[i].sector; + bfqd->nr_sectors[i] = + ia_ranges->ia_range[i].nr_sectors; + } + } + } + + /* Otherwise use single-actuator dev info */ + if (bfqd->num_actuators == 1) { + bfqd->sector[0] = 0; + bfqd->nr_sectors[0] = get_capacity(q->disk); + } + spin_unlock_irq(&q->queue_lock); + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, @@ -7069,7 +7271,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->num_groups_with_pending_reqs = 0; #endif - INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->active_list[0]); + INIT_LIST_HEAD(&bfqd->active_list[1]); INIT_LIST_HEAD(&bfqd->idle_list); INIT_HLIST_HEAD(&bfqd->burst_list); @@ -7095,7 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); bfqd->bfq_wr_max_softrt_rate = 7000; /* @@ -7114,6 +7316,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + spin_lock_init(&bfqd->lock); /* @@ -7141,7 +7346,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); - wbt_disable_default(q); + wbt_disable_default(q->disk); blk_stat_enable_accounting(q); return 0; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 466e4865ace6..69aaee52285a 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -33,6 +33,14 @@ */ #define BFQ_SOFTRT_WEIGHT_FACTOR 100 +/* + * Maximum number of actuators supported. This constant is used simply + * to define the size of the static array that will contain + * per-actuator data. The current value is hopefully a good upper + * bound to the possible number of actuators of any actual drive. + */ +#define BFQ_MAX_ACTUATORS 8 + struct bfq_entity; /** @@ -227,12 +235,14 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. @cgroup holds a reference to the cgroup, to be sure that it - * does not disappear while a bfqq still references it (mostly to avoid - * races between request issuing and task migration followed by cgroup - * destruction). - * All the fields are protected by the queue lock of the containing bfqd. + * io_context or more, if it is async or shared between cooperating + * processes. Besides, it contains I/O requests for only one actuator + * (an io_context is associated with a different bfq_queue for each + * actuator it generates I/O for). @cgroup holds a reference to the + * cgroup, to be sure that it does not disappear while a bfqq still + * references it (mostly to avoid races between request issuing and + * task migration followed by cgroup destruction). All the fields are + * protected by the queue lock of the containing bfqd. */ struct bfq_queue { /* reference counter */ @@ -397,24 +407,18 @@ struct bfq_queue { * the woken queues when this queue exits. */ struct hlist_head woken_list; + + /* index of the actuator this queue is associated with */ + unsigned int actuator_idx; }; /** - * struct bfq_io_cq - per (request_queue, io_context) structure. - */ -struct bfq_io_cq { - /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; - /* per (request_queue, blkcg) ioprio */ - int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif +* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq +*/ +struct bfq_iocq_bfqq_data { /* * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to + * to remember its values while the queue is merged, so as to * be able to restore it in case of split. */ bool saved_has_short_ttime; @@ -428,7 +432,7 @@ struct bfq_io_cq { u64 saved_tot_idle_time; /* - * Same purpose as the previous fields for the value of the + * Same purpose as the previous fields for the values of the * field keeping the queue's belonging to a large burst */ bool saved_in_large_burst; @@ -466,6 +470,38 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* + * Matrix of associated process queues: first row for async + * queues, second row sync queues. Each row contains one + * column for each actuator. An I/O request generated by the + * process is inserted into the queue pointed by bfqq[i][j] if + * the request is to be served by the j-th actuator of the + * drive, where i==0 or i==1, depending on whether the request + * is async or sync. So there is a distinct queue for each + * actuator. + */ + struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + + /* + * Persistent data for associated synchronous process queues + * (one queue per actuator, see field bfqq above). In + * particular, each of these queues may undergo a merge. + */ + struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; + unsigned int requests; /* Number of requests this process has in flight */ }; @@ -554,7 +590,12 @@ struct bfq_data { /* number of queued requests */ int queued; /* number of requests dispatched and waiting for completion */ - int rq_in_driver; + int tot_rq_in_driver; + /* + * number of requests dispatched and waiting for completion + * for each actuator + */ + int rq_in_driver[BFQ_MAX_ACTUATORS]; /* true if the device is non rotational and performs queueing */ bool nonrot_with_queueing; @@ -648,8 +689,13 @@ struct bfq_data { /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; - /* list of all the bfq_queues active on the device */ - struct list_head active_list; + /* + * List of all the bfq_queues active for a specific actuator + * on the device. Keeping active queues separate on a + * per-actuator basis helps implementing per-actuator + * injection more efficiently. + */ + struct list_head active_list[BFQ_MAX_ACTUATORS]; /* list of all the bfq_queues idle on the device */ struct list_head idle_list; @@ -723,8 +769,6 @@ struct bfq_data { * is multiplied. */ unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; @@ -772,6 +816,42 @@ struct bfq_data { */ unsigned int word_depths[2][2]; unsigned int full_depth_shift; + + /* + * Number of independent actuators. This is equal to 1 in + * case of single-actuator drives. + */ + unsigned int num_actuators; + /* + * Disk independent access ranges for each actuator + * in this device. + */ + sector_t sector[BFQ_MAX_ACTUATORS]; + sector_t nr_sectors[BFQ_MAX_ACTUATORS]; + struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; + + /* + * If the number of I/O requests queued in the device for a + * given actuator is below next threshold, then the actuator + * is deemed as underutilized. If this condition is found to + * hold for some actuator upon a dispatch, but (i) the + * in-service queue does not contain I/O for that actuator, + * while (ii) some other queue does contain I/O for that + * actuator, then the head I/O request of the latter queue is + * returned (injected), instead of the head request of the + * currently in-service queue. + * + * We set the threshold, empirically, to the minimum possible + * value for which an actuator is fully utilized, or close to + * be fully utilized. By doing so, injected I/O 'steals' as + * few drive-queue slots as possibile to the in-service + * queue. This reduces as much as possible the probability + * that the service of I/O from the in-service bfq_queue gets + * delayed because of slot exhaustion, i.e., because all the + * slots of the drive queue are filled with I/O injected from + * other queues (NCQ provides for 32 slots). + */ + unsigned int actuator_load_threshold; }; enum bfqq_state_flags { @@ -929,16 +1009,14 @@ struct bfq_group { /* reference counter (see comments in bfq_bic_update_cgroup) */ refcount_t ref; - /* Is bfq_group still online? */ - bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; struct bfq_data *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct bfq_entity *my_entity; @@ -955,8 +1033,8 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct rb_root rq_pos_tree; }; @@ -969,8 +1047,10 @@ struct bfq_group { extern const int bfq_timeout; -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx); +void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, + unsigned int actuator_idx); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index ea4c3d757fdd..7941b6f07391 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -493,7 +493,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); bfq_inc_active_entities(entity); } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 3f5685c00e36..4533eb491661 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -124,23 +124,18 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_vec *iv; if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } - iv = bip->bip_vec + bip->bip_vcnt; - if (bip->bip_vcnt && bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; + bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; return len; @@ -418,6 +413,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vcnt = bip_src->bip_vcnt; bip->bip_iter = bip_src->bip_iter; + bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; return 0; } diff --git a/block/bio.c b/block/bio.c index b97f3991c904..2e421c0dad13 100644 --- a/block/bio.c +++ b/block/bio.c @@ -26,7 +26,6 @@ #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 -#define ALLOC_CACHE_SLACK 64 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { @@ -1029,10 +1028,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio->bi_vcnt >= queue_max_segments(q)) return 0; - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; return len; @@ -1108,15 +1104,10 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); - bv->bv_page = page; - bv->bv_offset = off; - bv->bv_len = len; - + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } @@ -1792,6 +1783,8 @@ static int __init init_bio(void) { int i; + BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); + bio_integrity_init(); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ac1efb053e0..bd50b55bdb61 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -118,14 +118,26 @@ static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); + struct request_queue *q = blkg->q; int i; + /* + * pd_free_fn() can also be called from blkcg_deactivate_policy(), + * in order to make sure pd_free_fn() is called in order, the deletion + * of the list blkg->q_node is delayed to here from blkg_destroy(), and + * blkcg_mutex is used to synchronize blkg_free_workfn() and + * blkcg_deactivate_policy(). + */ + mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->parent) + blkg_put(blkg->parent); + list_del_init(&blkg->q_node); + mutex_unlock(&q->blkcg_mutex); - if (blkg->q) - blk_put_queue(blkg->q); + blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); @@ -158,8 +170,6 @@ static void __blkg_release(struct rcu_head *rcu) /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); blkg_free(blkg); } @@ -249,16 +259,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; - if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) - goto err_free; - + goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) - goto err_free; - + goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) - goto err_free; + goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); @@ -281,19 +288,28 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); + pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) - goto err_free; - + goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; + pd->online = false; } return blkg; -err_free: - blkg_free(blkg); +out_free_pds: + while (--i >= 0) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + blk_put_queue(disk->queue); +out_free_iostat: + free_percpu(blkg->iostat_cpu); +out_exit_refcnt: + percpu_ref_exit(&blkg->refcnt); +out_free_blkg: + kfree(blkg); return NULL; } @@ -359,8 +375,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[i]); + if (blkg->pd[i]) { + if (pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[i]); + blkg->pd[i]->online = true; + } } } blkg->online = true; @@ -376,7 +395,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, err_put_css: css_put(&blkcg->css); err_free_blkg: - blkg_free(new_blkg); + if (new_blkg) + blkg_free(new_blkg); return ERR_PTR(ret); } @@ -458,21 +478,28 @@ static void blkg_destroy(struct blkcg_gq *blkg) lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); - /* Something wrong if we are trying to remove same group twice */ - WARN_ON_ONCE(list_empty(&blkg->q_node)); - WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + /* + * blkg stays on the queue list until blkg_free_workfn(), see details in + * blkg_free_workfn(), hence this function can be called from + * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before + * blkg_free_workfn(). + */ + if (hlist_unhashed(&blkg->blkcg_node)) + return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[i]); + if (blkg->pd[i] && blkg->pd[i]->online) { + blkg->pd[i]->online = false; + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[i]); + } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); - list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); /* @@ -559,7 +586,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - if (!blkg->q->disk || !blkg->q->disk->bdi->dev) + if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } @@ -1273,6 +1300,7 @@ int blkcg_init_disk(struct gendisk *disk) int ret; INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) @@ -1349,9 +1377,9 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css) static void blkcg_exit(struct task_struct *tsk) { - if (tsk->throttle_queue) - blk_put_queue(tsk->throttle_queue); - tsk->throttle_queue = NULL; + if (tsk->throttle_disk) + put_disk(tsk->throttle_disk); + tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { @@ -1377,14 +1405,14 @@ struct cgroup_subsys io_cgrp_subsys = { EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** - * blkcg_activate_policy - activate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_activate_policy - activate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to activate * - * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * - * Activation happens with @q bypassed, so nobody would be accessing blkgs + * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. @@ -1392,9 +1420,9 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys); * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; @@ -1419,8 +1447,8 @@ retry: pd = pd_prealloc; pd_prealloc = NULL; } else { - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, - blkg->blkcg); + pd = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { @@ -1437,8 +1465,8 @@ retry: if (pd_prealloc) pol->pd_free_fn(pd_prealloc); - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, - blkg->blkcg); + pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_KERNEL); if (pd_prealloc) goto retry; else @@ -1448,6 +1476,7 @@ retry: blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; + pd->online = false; } /* all allocated, init in the same order */ @@ -1455,9 +1484,11 @@ retry: list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) pol->pd_init_fn(blkg->pd[pol->plid]); - if (pol->pd_online_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { + if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid]->online = true; + } __set_bit(pol->plid, q->blkcg_pols); ret = 0; @@ -1492,16 +1523,17 @@ enomem: EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** - * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to deactivate * - * Deactivate @pol on @q. Follows the same synchronization rules as + * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ -void blkcg_deactivate_policy(struct request_queue *q, +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; if (!blkcg_policy_enabled(q, pol)) @@ -1510,6 +1542,7 @@ void blkcg_deactivate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q); + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1519,7 +1552,7 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { - if (pol->pd_offline_fn) + if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; @@ -1528,6 +1561,7 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); @@ -1797,29 +1831,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we - * check to see if current->throttle_queue is set and if not this doesn't do + * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { - struct request_queue *q = current->throttle_queue; + struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; - if (!q) + if (!disk) return; - current->throttle_queue = NULL; + current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) @@ -1828,11 +1862,10 @@ void blkcg_maybe_throttle_current(void) blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); - blk_put_queue(q); + put_disk(disk); return; out: rcu_read_unlock(); - blk_put_queue(q); } /** @@ -1854,18 +1887,17 @@ out: */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { - struct request_queue *q = disk->queue; - if (unlikely(current->flags & PF_KTHREAD)) return; - if (current->throttle_queue != q) { - if (!blk_get_queue(q)) + if (current->throttle_disk != disk) { + if (test_bit(GD_DEAD, &disk->state)) return; + get_device(disk_to_dev(disk)); - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; + if (current->throttle_disk) + put_disk(current->throttle_disk); + current->throttle_disk = disk; } if (use_memdelay) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1e94e404eaa8..9c5078755e5e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -135,6 +135,7 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; + bool online; }; /* @@ -154,8 +155,8 @@ typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, - struct request_queue *q, struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); @@ -194,9 +195,8 @@ void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); @@ -495,9 +495,9 @@ static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, +static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, +static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-core.c b/block/blk-core.c index b5098355d8b2..82b5b2c53f1e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -570,7 +570,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (bio->bi_iter.bi_sector & (bdev_zone_sectors(bio->bi_bdev) - 1) || + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || !bio_zone_is_seq(bio)) return BLK_STS_IOERR; @@ -684,6 +684,18 @@ static void __submit_bio_noacct_mq(struct bio *bio) void submit_bio_noacct_nocheck(struct bio *bio) { + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); + + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(bio); + /* + * Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } + /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list @@ -741,12 +753,16 @@ void submit_bio_noacct(struct bio *bio) * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ - if (op_is_flush(bio->bi_opf) && - !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!bio_sectors(bio)) { - status = BLK_STS_OK; + if (op_is_flush(bio->bi_opf)) { + if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && + bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; + goto end_io; + } } } @@ -788,17 +804,6 @@ void submit_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - - blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); - - if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(bio); - /* Now that enqueuing has been traced, we need to trace - * completion as well. - */ - bio_set_flag(bio, BIO_TRACE_COMPLETION); - } submit_bio_noacct_nocheck(bio); return; @@ -869,7 +874,16 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) */ blk_flush_plug(current->plug, false); - if (bio_queue_enter(bio)) + /* + * We need to be able to enter a frozen queue, similar to how + * timeouts also need to do that. If that is blocked, then we can + * have pending IO when a queue freeze is started, and then the + * wait for the freeze to finish will wait for polled requests to + * timeout as the poller is preventer from entering the queue and + * completing them. As long as we prevent new IO from being queued, + * that should be all that matters. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index 55268edc0625..a304434489ba 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -116,7 +116,7 @@ static void blk_crypto_release(struct kobject *kobj) kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); } -static struct kobj_type blk_crypto_ktype = { +static const struct kobj_type blk_crypto_ktype = { .default_groups = blk_crypto_attr_groups, .sysfs_ops = &blk_crypto_attr_ops, .release = blk_crypto_release, diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 2141931ddd37..c9eb4241e048 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -75,7 +75,7 @@ static void blk_ia_range_sysfs_nop_release(struct kobject *kobj) { } -static struct kobj_type blk_ia_range_ktype = { +static const struct kobj_type blk_ia_range_ktype = { .sysfs_ops = &blk_ia_range_sysfs_ops, .default_groups = blk_ia_range_groups, .release = blk_ia_range_sysfs_nop_release, @@ -94,7 +94,7 @@ static void blk_ia_ranges_sysfs_release(struct kobject *kobj) kfree(iars); } -static struct kobj_type blk_ia_ranges_ktype = { +static const struct kobj_type blk_ia_ranges_ktype = { .release = blk_ia_ranges_sysfs_release, }; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 69eed260a823..8f01d786f5cb 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -356,7 +356,7 @@ static const struct sysfs_ops integrity_ops = { .store = &integrity_attr_store, }; -static struct kobj_type integrity_ktype = { +static const struct kobj_type integrity_ktype = { .default_groups = integrity_groups, .sysfs_ops = &integrity_ops, }; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6955605629e4..ff534e9d92dc 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -258,6 +258,11 @@ enum { VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, + /* switch iff the conditions are met for longer than this */ + AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, +}; + +enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, @@ -296,9 +301,6 @@ enum { /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, - /* switch iff the conditions are met for longer than this */ - AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, - /* * Count IO size in 4k pages. The 12bit shift helps keeping * size-proportional components of cost calculation in closer @@ -667,7 +669,7 @@ static struct ioc *q_to_ioc(struct request_queue *q) static const char __maybe_unused *ioc_name(struct ioc *ioc) { - struct gendisk *disk = ioc->rqos.q->disk; + struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; @@ -806,11 +808,11 @@ static int ioc_autop_idx(struct ioc *ioc) u64 now_ns; /* rotational? */ - if (!blk_queue_nonrot(ioc->rqos.q)) + if (!blk_queue_nonrot(ioc->rqos.disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ - if (blk_queue_depth(ioc->rqos.q) == 1) + if (blk_queue_depth(ioc->rqos.disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ @@ -866,9 +868,14 @@ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, *page = *seqio = *randio = 0; - if (bps) - *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, - DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); + if (bps) { + u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); + + if (bps_pages) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); + else + *page = 1; + } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); @@ -926,8 +933,8 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); - ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * - VTIME_PER_USEC, MILLION); + ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * + VTIME_PER_USEC, MILLION); return true; } @@ -2642,7 +2649,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q->disk, + blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2743,7 +2750,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q->disk, + blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); @@ -2814,7 +2821,7 @@ static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); - blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; @@ -2825,7 +2832,7 @@ static void ioc_rqos_exit(struct rq_qos *rqos) kfree(ioc); } -static struct rq_qos_ops ioc_rqos_ops = { +static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, @@ -2836,9 +2843,7 @@ static struct rq_qos_ops ioc_rqos_ops = { static int blk_iocost_init(struct gendisk *disk) { - struct request_queue *q = disk->queue; struct ioc *ioc; - struct rq_qos *rqos; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); @@ -2861,11 +2866,6 @@ static int blk_iocost_init(struct gendisk *disk) local64_set(&ccs->rq_wait_ns, 0); } - rqos = &ioc->rqos; - rqos->id = RQ_QOS_COST; - rqos->ops = &ioc_rqos_ops; - rqos->q = q; - spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); @@ -2889,17 +2889,17 @@ static int blk_iocost_init(struct gendisk *disk) * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ - ret = rq_qos_add(q, rqos); + ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; - ret = blkcg_activate_policy(q, &blkcg_policy_iocost); + ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: - rq_qos_del(q, rqos); + rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); @@ -2923,13 +2923,14 @@ static void ioc_cpd_free(struct blkcg_policy_data *cpd) kfree(container_of(cpd, struct ioc_cgrp, cpd)); } -static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, + disk->node_id); if (!iocg) return NULL; @@ -3129,6 +3130,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, if (!dname) return 0; + spin_lock_irq(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, @@ -3141,6 +3143,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); + spin_unlock_irq(&ioc->lock); return 0; } @@ -3185,6 +3188,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, return PTR_ERR(bdev); disk = bdev->bd_disk; + if (!queue_is_mq(disk->queue)) { + ret = -EOPNOTSUPP; + goto err; + } + ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); @@ -3212,7 +3220,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: - match_u64(&args[0], &v); + if (match_u64(&args[0], &v)) + goto einval; enable = v; continue; case QOS_CTRL: @@ -3270,11 +3279,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - wbt_disable_default(disk->queue); + wbt_disable_default(disk); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; - wbt_enable_default(disk->queue); + wbt_enable_default(disk); } if (user) { @@ -3314,12 +3323,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf, if (!dname) return 0; + spin_lock_irq(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); + spin_unlock_irq(&ioc->lock); return 0; } @@ -3364,6 +3375,11 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, return PTR_ERR(bdev); q = bdev_get_queue(bdev); + if (!queue_is_mq(q)) { + ret = -EOPNOTSUPP; + goto err; + } + ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(bdev->bd_disk); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ecdc10741836..0dc910568b31 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q->disk, use_memdelay); + blkcg_schedule_throttle(rqos->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are @@ -330,7 +330,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { - unsigned long qd = blkiolat->rqos.q->nr_requests; + unsigned long qd = blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; @@ -372,7 +372,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, */ static void scale_change(struct iolatency_grp *iolat, bool up) { - unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; + unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = iolat->max_depth; @@ -646,11 +646,11 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) timer_shutdown_sync(&blkiolat->timer); flush_work(&blkiolat->enable_work); - blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency); kfree(blkiolat); } -static struct rq_qos_ops blkcg_iolatency_ops = { +static const struct rq_qos_ops blkcg_iolatency_ops = { .throttle = blkcg_iolatency_throttle, .done_bio = blkcg_iolatency_done_bio, .exit = blkcg_iolatency_exit, @@ -665,7 +665,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, - blkiolat->rqos.q->root_blkg) { + blkiolat->rqos.disk->queue->root_blkg) { struct iolatency_grp *iolat; struct child_latency_info *lat_info; unsigned long flags; @@ -749,32 +749,26 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { - blk_mq_freeze_queue(blkiolat->rqos.q); + blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; - blk_mq_unfreeze_queue(blkiolat->rqos.q); + blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue); } } int blk_iolatency_init(struct gendisk *disk) { - struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; - struct rq_qos *rqos; int ret; blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); if (!blkiolat) return -ENOMEM; - rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_LATENCY; - rqos->ops = &blkcg_iolatency_ops; - rqos->q = q; - - ret = rq_qos_add(q, rqos); + ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY, + &blkcg_iolatency_ops); if (ret) goto err_free; - ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); + ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency); if (ret) goto err_qos_del; @@ -784,7 +778,7 @@ int blk_iolatency_init(struct gendisk *disk) return 0; err_qos_del: - rq_qos_del(q, rqos); + rq_qos_del(&blkiolat->rqos); err_free: kfree(blkiolat); return ret; @@ -952,13 +946,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) iolat->max_depth, avg_lat, cur_win); } -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); + iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 8bb6b8eba4ce..055529b9b92b 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -116,7 +116,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, } static struct blkg_policy_data * -ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) +ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; @@ -204,12 +204,12 @@ void blkcg_set_ioprio(struct bio *bio) void blk_ioprio_exit(struct gendisk *disk) { - blkcg_deactivate_policy(disk->queue, &ioprio_policy); + blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { - return blkcg_activate_policy(disk->queue, &ioprio_policy); + return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-map.c b/block/blk-map.c index 080dd60485be..9137d16cecdc 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -31,7 +31,8 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, return NULL; memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) + bmd->iter.iov = bmd->iov; return bmd; } @@ -246,10 +247,8 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, { struct bio *bio; - if (rq->cmd_flags & REQ_POLLED) { - blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask, + if (rq->cmd_flags & REQ_ALLOC_CACHE) { + bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; @@ -641,7 +640,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, copy = true; else if (iov_iter_is_bvec(iter)) map_bvec = true; - else if (!iter_is_iovec(iter)) + else if (!user_backed_iter(iter)) copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); @@ -682,9 +681,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct iovec iov; struct iov_iter i; - int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; diff --git a/block/blk-merge.c b/block/blk-merge.c index b7c193d67185..1ac782fdc55c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -336,6 +336,7 @@ split: bio_clear_polled(bio); return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** * __bio_split_to_limits - split a bio to fit the queue limits @@ -757,6 +758,33 @@ void blk_rq_set_mixed_merge(struct request *rq) rq->rq_flags |= RQF_MIXED_MERGE; } +static inline blk_opf_t bio_failfast(const struct bio *bio) +{ + if (bio->bi_opf & REQ_RAHEAD) + return REQ_FAILFAST_MASK; + + return bio->bi_opf & REQ_FAILFAST_MASK; +} + +/* + * After we are marked as MIXED_MERGE, any new RA bio has to be updated + * as failfast, and request's failfast has to be updated in case of + * front merge. + */ +static inline void blk_update_mixed_merge(struct request *req, + struct bio *bio, bool front_merge) +{ + if (req->rq_flags & RQF_MIXED_MERGE) { + if (bio->bi_opf & REQ_RAHEAD) + bio->bi_opf |= REQ_FAILFAST_MASK; + + if (front_merge) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK; + } + } +} + static void blk_account_io_merge_request(struct request *req) { if (blk_do_io_stat(req)) { @@ -954,7 +982,7 @@ enum bio_merge_status { static enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { - const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK; + const blk_opf_t ff = bio_failfast(bio); if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -965,6 +993,8 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); + blk_update_mixed_merge(req, bio, false); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -978,7 +1008,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, static enum bio_merge_status bio_attempt_front_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { - const blk_opf_t ff = bio->bi_opf & REQ_FAILFAST_MASK; + const blk_opf_t ff = bio_failfast(bio); if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -989,6 +1019,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); + blk_update_mixed_merge(req, bio, true); + bio->bi_next = req->bio; req->bio = bio; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9c2fce1a7b50..0c612c19feb8 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -10,66 +10,29 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/group_cpus.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" -static int queue_index(struct blk_mq_queue_map *qmap, - unsigned int nr_queues, const int q) -{ - return qmap->queue_offset + (q % nr_queues); -} - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - - return cpu; -} - void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - unsigned int *map = qmap->mq_map; - unsigned int nr_queues = qmap->nr_queues; - unsigned int cpu, first_sibling, q = 0; - - for_each_possible_cpu(cpu) - map[cpu] = -1; - - /* - * Spread queues among present CPUs first for minimizing - * count of dead queues which are mapped by all un-present CPUs - */ - for_each_present_cpu(cpu) { - if (q >= nr_queues) - break; - map[cpu] = queue_index(qmap, nr_queues, q++); + const struct cpumask *masks; + unsigned int queue, cpu; + + masks = group_cpus_evenly(qmap->nr_queues); + if (!masks) { + for_each_possible_cpu(cpu) + qmap->mq_map[cpu] = qmap->queue_offset; + return; } - for_each_possible_cpu(cpu) { - if (map[cpu] != -1) - continue; - /* - * First do sequential mapping between CPUs and queues. - * In case we still have CPUs to map, and we have some number of - * threads per cores then map sibling threads to the same queue - * for performance optimizations. - */ - if (q < nr_queues) { - map[cpu] = queue_index(qmap, nr_queues, q++); - } else { - first_sibling = get_first_sibling(cpu); - if (first_sibling == cpu) - map[cpu] = queue_index(qmap, nr_queues, q++); - else - map[cpu] = map[first_sibling]; - } + for (queue = 0; queue < qmap->nr_queues; queue++) { + for_each_cpu(cpu, &masks[queue]) + qmap->mq_map[cpu] = qmap->queue_offset + queue; } + kfree(masks); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bd942341b638..b01818f8e216 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -813,9 +813,9 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { - lockdep_assert_held(&rqos->q->debugfs_mutex); + lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); - if (!rqos->q->debugfs_dir) + if (!rqos->disk->queue->debugfs_dir) return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; @@ -823,7 +823,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { - struct request_queue *q = rqos->q; + struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); lockdep_assert_held(&q->debugfs_mutex); @@ -835,9 +835,7 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); - rqos->debugfs_dir = debugfs_create_dir(dir_name, - rqos->q->rqos_debugfs_dir); - + rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 23d1a90fec42..06b312c69114 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -19,8 +19,7 @@ #include "blk-wbt.h" /* - * Mark a hardware queue as needing a restart. For shared queues, maintain - * a count of how many hardware queues are marked for restart. + * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { @@ -82,7 +81,7 @@ dispatch: /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. @@ -210,7 +209,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4515288fbe35..1b2b0d258e46 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -46,7 +46,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); - ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, @@ -70,28 +69,6 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, return res; } -static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, - struct attribute *attr, const char *page, - size_t length) -{ - struct blk_mq_hw_ctx_sysfs_entry *entry; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); - hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - q = hctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(hctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -150,18 +127,17 @@ ATTRIBUTE_GROUPS(default_hw_ctx); static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, - .store = blk_mq_hw_sysfs_store, }; -static struct kobj_type blk_mq_ktype = { +static const struct kobj_type blk_mq_ktype = { .release = blk_mq_sysfs_release, }; -static struct kobj_type blk_mq_ctx_ktype = { +static const struct kobj_type blk_mq_ctx_ktype = { .release = blk_mq_ctx_sysfs_release, }; -static struct kobj_type blk_mq_hw_ktype = { +static const struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_groups = default_hw_ctx_groups, .release = blk_mq_hw_sysfs_release, diff --git a/block/blk-mq.c b/block/blk-mq.c index 9c8dc70020bc..d3494a796ba8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -658,7 +658,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * allocator for this for the rare use case of a command tied to * a specific queue. */ - if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || + WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) @@ -1825,12 +1826,13 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && + !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* @@ -1848,6 +1850,10 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) + sbq = &hctx->tags->breserved_tags; + else + sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); @@ -1917,16 +1923,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { - struct request *next = - list_first_entry_or_null(list, struct request, queuelist); - - /* - * If an I/O scheduler has been configured and we got a driver tag for - * the next request already, free it. - */ - if (next) - blk_mq_put_driver_tag(next); - list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } @@ -2002,6 +1998,23 @@ static void blk_mq_release_budgets(struct request_queue *q, } /* + * blk_mq_commit_rqs will notify driver using bd->last that there is no + * more requests. (See comment in struct blk_mq_ops for commit_rqs for + * details) + * Attention, we should explicitly call this in unusual cases: + * 1) did not queue everything initially scheduled to queue + * 2) the last attempt to queue a request failed + */ +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, + bool from_schedule) +{ + if (hctx->queue->mq_ops->commit_rqs && queued) { + trace_block_unplug(hctx->queue, queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } +} + +/* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, @@ -2009,8 +2022,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, { enum prep_dispatch prep; struct request_queue *q = hctx->queue; - struct request *rq, *nxt; - int errors, queued; + struct request *rq; + int queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); bool needs_resource = false; @@ -2021,7 +2034,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, /* * Now process all the entries, sending them to the driver. */ - errors = queued = 0; + queued = 0; do { struct blk_mq_queue_data bd; @@ -2035,17 +2048,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, list_del_init(&rq->queuelist); bd.rq = rq; - - /* - * Flag last if we have no more requests, or if we have more - * but can't assign a driver tag to it. - */ - if (list_empty(list)) - bd.last = true; - else { - nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt); - } + bd.last = list_empty(list); /* * once the request is queued to lld, no need to cover the @@ -2074,7 +2077,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, needs_resource = true; break; default: - errors++; blk_mq_end_request(rq, ret); } } while (!list_empty(list)); @@ -2085,9 +2087,9 @@ out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ - if ((!list_empty(list) || errors || needs_resource || - ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) - q->mq_ops->commit_rqs(hctx); + if (!list_empty(list) || ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); + /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. @@ -2096,7 +2098,8 @@ out: bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && - (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); + ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || + blk_mq_is_shared_tags(hctx->flags)); if (nr_budgets) blk_mq_release_budgets(q, list); @@ -2151,10 +2154,10 @@ out: blk_mq_update_dispatch_busy(hctx, true); return false; - } else - blk_mq_update_dispatch_busy(hctx, false); + } - return (queued + errors) != 0; + blk_mq_update_dispatch_busy(hctx, false); + return true; } /** @@ -2548,16 +2551,6 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, - bool from_schedule) -{ - if (hctx->queue->mq_ops->commit_rqs) { - trace_block_unplug(hctx->queue, *queued, !from_schedule); - hctx->queue->mq_ops->commit_rqs(hctx); - } - *queued = 0; -} - static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { @@ -2681,20 +2674,21 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +static void blk_mq_plug_issue_direct(struct blk_plug *plug) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; - int errors = 0; + blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { bool last = rq_list_empty(plug->mq_list); - blk_status_t ret; if (hctx != rq->mq_hctx) { - if (hctx) - blk_mq_commit_rqs(hctx, &queued, from_schedule); + if (hctx) { + blk_mq_commit_rqs(hctx, queued, false); + queued = 0; + } hctx = rq->mq_hctx; } @@ -2706,21 +2700,16 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, false, true); - blk_mq_commit_rqs(hctx, &queued, from_schedule); - return; + goto out; default: blk_mq_end_request(rq, ret); - errors++; break; } } - /* - * If we didn't flush the entire list, we could have told the driver - * there was more coming, but that turned out to be a lie. - */ - if (errors) - blk_mq_commit_rqs(hctx, &queued, from_schedule); +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_plug_list(struct request_queue *q, @@ -2791,7 +2780,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) } blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug, false)); + blk_mq_plug_issue_direct(plug)); if (rq_list_empty(plug->mq_list)) return; } @@ -2805,36 +2794,32 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; - int errors = 0; + blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); - if (ret != BLK_STS_OK) { - errors++; - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, false, - list_empty(list)); - break; - } - blk_mq_end_request(rq, ret); - } else + switch (ret) { + case BLK_STS_OK: queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, + list_empty(list)); + goto out; + default: + blk_mq_end_request(rq, ret); + break; + } } - /* - * If we didn't flush the entire list, we could have told - * the driver there was more coming, but that turned out to - * be a lie. - */ - if ((!list_empty(list) || errors) && - hctx->queue->mq_ops->commit_rqs && queued) - hctx->queue->mq_ops->commit_rqs(hctx); +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, @@ -2894,15 +2879,16 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { *bio = NULL; return NULL; } + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + type = blk_mq_get_hctx_type((*bio)->bi_opf); hctx_type = rq->mq_hctx->type; if (type != hctx_type && diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 88f0fe7dcf54..d8cc820a365e 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,3 +294,70 @@ void rq_qos_exit(struct request_queue *q) rqos->ops->exit(rqos); } } + +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops) +{ + struct request_queue *q = disk->queue; + + rqos->disk = disk; + rqos->id = id; + rqos->ops = ops; + + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + * + * Reuse ->queue_lock for protecting against other concurrent + * rq_qos adding/deleting + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + if (rq_qos_id(q, rqos->id)) + goto ebusy; + rqos->next = q->rq_qos; + q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } + + return 0; +ebusy: + spin_unlock_irq(&q->queue_lock); + blk_mq_unfreeze_queue(q); + return -EBUSY; +} + +void rq_qos_del(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->disk->queue; + struct rq_qos **cur; + + /* + * See comment in rq_qos_add() about freezing queue & using + * ->queue_lock. + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; + break; + } + } + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 1ef1f7d4bc3c..b02a1a3d33a8 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -25,8 +25,8 @@ struct rq_wait { }; struct rq_qos { - struct rq_qos_ops *ops; - struct request_queue *q; + const struct rq_qos_ops *ops; + struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS @@ -85,65 +85,9 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) init_waitqueue_head(&rq_wait->wait); } -static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos) -{ - /* - * No IO can be in-flight when adding rqos, so freeze queue, which - * is fine since we only support rq_qos for blk-mq queue. - * - * Reuse ->queue_lock for protecting against other concurrent - * rq_qos adding/deleting - */ - blk_mq_freeze_queue(q); - - spin_lock_irq(&q->queue_lock); - if (rq_qos_id(q, rqos->id)) - goto ebusy; - rqos->next = q->rq_qos; - q->rq_qos = rqos; - spin_unlock_irq(&q->queue_lock); - - blk_mq_unfreeze_queue(q); - - if (rqos->ops->debugfs_attrs) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); - } - - return 0; -ebusy: - spin_unlock_irq(&q->queue_lock); - blk_mq_unfreeze_queue(q); - return -EBUSY; - -} - -static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) -{ - struct rq_qos **cur; - - /* - * See comment in rq_qos_add() about freezing queue & using - * ->queue_lock. - */ - blk_mq_freeze_queue(q); - - spin_lock_irq(&q->queue_lock); - for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { - if (*cur == rqos) { - *cur = rqos->next; - break; - } - } - spin_unlock_irq(&q->queue_lock); - - blk_mq_unfreeze_queue(q); - - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); -} +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops); +void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-settings.c b/block/blk-settings.c index 0477c4d527fe..896b4654ab00 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -16,6 +16,7 @@ #include <linux/dma-mapping.h> #include "blk.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) @@ -40,7 +41,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_dev_sectors = 0; + lim->max_user_sectors = lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_zeroes_sectors = 0; lim->max_zone_append_sectors = 0; @@ -135,7 +136,12 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->max_hw_sectors = max_hw_sectors; max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); - max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + + if (limits->max_user_sectors) + max_sectors = min(max_sectors, limits->max_user_sectors); + else + max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = round_down(max_sectors, limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; diff --git a/block/blk-stat.c b/block/blk-stat.c index 2ea01b5c1aca..c6ca16abf911 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -58,7 +58,8 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; - blk_throtl_stat_add(rq, value); + if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) + blk_throtl_stat_add(rq, value); rcu_read_lock(); cpu = get_cpu(); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 93d9e9c9a6ea..f1fce1c7fa44 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" @@ -239,19 +240,28 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_sectors_kb, + unsigned long var; + unsigned int max_sectors_kb, max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, page_kb = 1 << (PAGE_SHIFT - 10); - ssize_t ret = queue_var_store(&max_sectors_kb, page, count); + ssize_t ret = queue_var_store(&var, page, count); if (ret < 0) return ret; - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + max_sectors_kb = (unsigned int)var; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, q->limits.max_dev_sectors >> 1); - - if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) - return -EINVAL; + if (max_sectors_kb == 0) { + q->limits.max_user_sectors = 0; + max_sectors_kb = min(max_hw_sectors_kb, + BLK_DEF_MAX_SECTORS >> 1); + } else { + if (max_sectors_kb > max_hw_sectors_kb || + max_sectors_kb < page_kb) + return -EINVAL; + q->limits.max_user_sectors = max_sectors_kb << 1; + } spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; @@ -491,7 +501,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q); + ret = wbt_init(q->disk); if (ret) return ret; } @@ -755,7 +765,7 @@ static void blk_queue_release(struct kobject *kobj) /* nothing to do here, all data is associated with the parent gendisk */ } -static struct kobj_type blk_queue_ktype = { +static const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, @@ -817,7 +827,7 @@ int blk_register_queue(struct gendisk *disk) goto out_elv_unregister; blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(q); + wbt_enable_default(disk); blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6fb5a2f9e1ee..47e9d8be68f3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -335,14 +335,13 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, q->node); + tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; @@ -2395,7 +2394,7 @@ int blk_throtl_init(struct gendisk *disk) td->low_downgrade_time = jiffies; /* activate policy */ - ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { free_percpu(td->latency_buckets[READ]); free_percpu(td->latency_buckets[WRITE]); @@ -2411,7 +2410,7 @@ void blk_throtl_exit(struct gendisk *disk) BUG_ON(!q->td); del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blkcg_deactivate_policy(disk, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 68a774d7a7c9..e49a48684532 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,6 +25,7 @@ #include <linux/backing-dev.h> #include <linux/swap.h> +#include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" @@ -32,6 +33,72 @@ #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. + */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned int wc; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; @@ -98,7 +165,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; + struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -226,6 +293,16 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) return now - issue; } +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + enum { LAT_OK = 1, LAT_UNKNOWN, @@ -235,7 +312,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -288,7 +365,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -358,13 +435,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; - if (!rwb->rqos.q->disk) + if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, - inflight); + trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, @@ -650,8 +726,9 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) /* * Enable wbt if defaults are configured that way */ -void wbt_enable_default(struct request_queue *q) +void wbt_enable_default(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; bool disable_flag = q->elevator && test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); @@ -670,7 +747,7 @@ void wbt_enable_default(struct request_queue *q) return; if (queue_is_mq(q) && !disable_flag) - wbt_init(q); + wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -701,16 +778,15 @@ static int wbt_data_dir(const struct request *rq) static void wbt_queue_depth_changed(struct rq_qos *rqos) { - RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); - struct request_queue *q = rqos->q; - blk_stat_remove_callback(q, rwb->cb); + blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } @@ -718,9 +794,9 @@ static void wbt_exit(struct rq_qos *rqos) /* * Disable wbt, if enabled by default. */ -void wbt_disable_default(struct request_queue *q) +void wbt_disable_default(struct gendisk *disk) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; @@ -820,7 +896,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { }; #endif -static struct rq_qos_ops wbt_rqos_ops = { +static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, @@ -834,8 +910,9 @@ static struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct request_queue *q) +int wbt_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; @@ -853,22 +930,19 @@ int wbt_init(struct request_queue *q) for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); - rwb->rqos.id = RQ_QOS_WBT; - rwb->rqos.ops = &wbt_rqos_ops; - rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); + rwb->rq_depth.queue_depth = blk_queue_depth(q); + wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ - ret = rq_qos_add(q, &rwb->rqos); + ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); if (ret) goto err_free; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index e3ea6e7e2900..ba6cca5849a6 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -2,97 +2,11 @@ #ifndef WB_THROTTLE_H #define WB_THROTTLE_H -#include <linux/kernel.h> -#include <linux/atomic.h> -#include <linux/wait.h> -#include <linux/timer.h> -#include <linux/ktime.h> - -#include "blk-stat.h" -#include "blk-rq-qos.h" - -enum wbt_flags { - WBT_TRACKED = 1, /* write, tracked for throttling */ - WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ - WBT_DISCARD = 8, /* discard */ - - WBT_NR_BITS = 4, /* number of bits */ -}; - -enum { - WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, - WBT_RWQ_DISCARD, - WBT_NUM_RWQ, -}; - -/* - * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other - * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered - * to WBT_STATE_OFF/ON_MANUAL. - */ -enum { - WBT_STATE_ON_DEFAULT = 1, /* on by default */ - WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ - WBT_STATE_OFF_DEFAULT = 3, /* off by default */ - WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ -}; - -struct rq_wb { - /* - * Settings that govern how we throttle - */ - unsigned int wb_background; /* background writeback */ - unsigned int wb_normal; /* normal writeback */ - - short enable_state; /* WBT_STATE_* */ - - /* - * Number of consecutive periods where we don't have enough - * information to make a firm scale up/down decision. - */ - unsigned int unknown_cnt; - - u64 win_nsec; /* default window size */ - u64 cur_win_nsec; /* current window size */ - - struct blk_stat_callback *cb; - - u64 sync_issue; - void *sync_cookie; - - unsigned int wc; - - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ - unsigned long min_lat_nsec; - struct rq_qos rqos; - struct rq_wait rq_wait[WBT_NUM_RWQ]; - struct rq_depth rq_depth; -}; - -static inline struct rq_wb *RQWB(struct rq_qos *rqos) -{ - return container_of(rqos, struct rq_wb, rqos); -} - -static inline unsigned int wbt_inflight(struct rq_wb *rwb) -{ - unsigned int i, ret = 0; - - for (i = 0; i < WBT_NUM_RWQ; i++) - ret += atomic_read(&rwb->rq_wait[i].inflight); - - return ret; -} - - #ifdef CONFIG_BLK_WBT -int wbt_init(struct request_queue *); -void wbt_disable_default(struct request_queue *); -void wbt_enable_default(struct request_queue *); +int wbt_init(struct gendisk *disk); +void wbt_disable_default(struct gendisk *disk); +void wbt_enable_default(struct gendisk *disk); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); @@ -104,14 +18,14 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline int wbt_init(struct request_queue *q) +static inline int wbt_init(struct gendisk *disk) { return -EINVAL; } -static inline void wbt_disable_default(struct request_queue *q) +static inline void wbt_disable_default(struct gendisk *disk) { } -static inline void wbt_enable_default(struct request_queue *q) +static inline void wbt_enable_default(struct gendisk *disk) { } static inline void wbt_set_write_cache(struct request_queue *q, bool wc) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index db829401d8d0..614b575be899 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -277,10 +277,10 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - if (sector & (zone_sectors - 1)) + if (!bdev_is_zone_start(bdev, sector)) return -EINVAL; - if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) + if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) return -EINVAL; /* diff --git a/block/elevator.c b/block/elevator.c index adee58e48e2d..24909069f872 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -126,7 +126,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q, return e; } -static struct kobj_type elv_ktype; +static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) @@ -455,7 +455,7 @@ static const struct sysfs_ops elv_sysfs_ops = { .store = elv_attr_store, }; -static struct kobj_type elv_ktype = { +static const struct kobj_type elv_ktype = { .sysfs_ops = &elv_sysfs_ops, .release = elevator_release, }; diff --git a/block/fops.c b/block/fops.c index 50d245e8c913..d2e6be4e3d1c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -221,6 +221,24 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_endio(bio); break; } + if (iocb->ki_flags & IOCB_NOWAIT) { + /* + * This is nonblocking IO, and we need to allocate + * another bio if we have data left to map. As we + * cannot guarantee that one of the sub bios will not + * fail getting issued FOR NOWAIT and as error results + * are coalesced across all of them, be safe and ask for + * a retry of this from blocking context. + */ + if (unlikely(iov_iter_count(iter))) { + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return -EAGAIN; + } + bio->bi_opf |= REQ_NOWAIT; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -228,9 +246,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } else { task_io_account_write(bio->bi_iter.bi_size); } - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; - dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; diff --git a/block/genhd.c b/block/genhd.c index 23cf83b3331c..093ef292e98f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1016,9 +1016,8 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%x\n", disk->flags); + dev_warn_once(dev, "the capability attribute has been deprecated.\n"); + return sprintf(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, |