diff options
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r-- | block/bfq-iosched.c | 291 |
1 files changed, 189 insertions, 102 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 653100fb719e..6075100f03a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -624,12 +624,13 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) } /* - * Tell whether there are active queues or groups with differentiated weights. + * Tell whether there are active queues with different weights or + * active groups. */ -static bool bfq_differentiated_weights(struct bfq_data *bfqd) +static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) { /* - * For weights to differ, at least one of the trees must contain + * For queue weights to differ, queue_weights_tree must contain * at least two nodes. */ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && @@ -637,9 +638,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || - bfqd->group_weights_tree.rb_node->rb_right) + (bfqd->num_active_groups > 0 #endif ); } @@ -657,26 +656,25 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) * 3) all active groups at the same level in the groups tree have the same * number of children. * - * Unfortunately, keeping the necessary state for evaluating exactly the - * above symmetry conditions would be quite complex and time-consuming. - * Therefore this function evaluates, instead, the following stronger - * sub-conditions, for which it is much easier to maintain the needed - * state: + * Unfortunately, keeping the necessary state for evaluating exactly + * the last two symmetry sub-conditions above would be quite complex + * and time consuming. Therefore this function evaluates, instead, + * only the following stronger two sub-conditions, for which it is + * much easier to maintain the needed state: * 1) all active queues have the same weight, - * 2) all active groups have the same weight, - * 3) all active groups have at most one active child each. - * In particular, the last two conditions are always true if hierarchical - * support and the cgroups interface are not enabled, thus no state needs - * to be maintained in this case. + * 2) there are no active groups. + * In particular, the last condition is always true if hierarchical + * support or the cgroups interface are not enabled, thus no state + * needs to be maintained in this case. */ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) { - return !bfq_differentiated_weights(bfqd); + return !bfq_varied_queue_weights_or_active_groups(bfqd); } /* * If the weight-counter tree passed as input contains no counter for - * the weight of the input entity, then add that counter; otherwise just + * the weight of the input queue, then add that counter; otherwise just * increment the existing counter. * * Note that weight-counter trees contain few nodes in mostly symmetric @@ -687,25 +685,25 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */ -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, +void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct rb_root *root) { + struct bfq_entity *entity = &bfqq->entity; struct rb_node **new = &(root->rb_node), *parent = NULL; /* - * Do not insert if the entity is already associated with a + * Do not insert if the queue is already associated with a * counter, which happens if: - * 1) the entity is associated with a queue, - * 2) a request arrival has caused the queue to become both + * 1) a request arrival has caused the queue to become both * non-weight-raised, and hence change its weight, and * backlogged; in this respect, each of the two events * causes an invocation of this function, - * 3) this is the invocation of this function caused by the + * 2) this is the invocation of this function caused by the * second event. This second invocation is actually useless, * and we handle this fact by exiting immediately. More * efficient or clearer solutions might possibly be adopted. */ - if (entity->weight_counter) + if (bfqq->weight_counter) return; while (*new) { @@ -715,7 +713,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, parent = *new; if (entity->weight == __counter->weight) { - entity->weight_counter = __counter; + bfqq->weight_counter = __counter; goto inc_counter; } if (entity->weight < __counter->weight) @@ -724,66 +722,67 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, new = &((*new)->rb_right); } - entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), - GFP_ATOMIC); + bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); /* * In the unlucky event of an allocation failure, we just - * exit. This will cause the weight of entity to not be - * considered in bfq_differentiated_weights, which, in its - * turn, causes the scenario to be deemed wrongly symmetric in - * case entity's weight would have been the only weight making - * the scenario asymmetric. On the bright side, no unbalance - * will however occur when entity becomes inactive again (the - * invocation of this function is triggered by an activation - * of entity). In fact, bfq_weights_tree_remove does nothing - * if !entity->weight_counter. + * exit. This will cause the weight of queue to not be + * considered in bfq_varied_queue_weights_or_active_groups, + * which, in its turn, causes the scenario to be deemed + * wrongly symmetric in case bfqq's weight would have been + * the only weight making the scenario asymmetric. On the + * bright side, no unbalance will however occur when bfqq + * becomes inactive again (the invocation of this function + * is triggered by an activation of queue). In fact, + * bfq_weights_tree_remove does nothing if + * !bfqq->weight_counter. */ - if (unlikely(!entity->weight_counter)) + if (unlikely(!bfqq->weight_counter)) return; - entity->weight_counter->weight = entity->weight; - rb_link_node(&entity->weight_counter->weights_node, parent, new); - rb_insert_color(&entity->weight_counter->weights_node, root); + bfqq->weight_counter->weight = entity->weight; + rb_link_node(&bfqq->weight_counter->weights_node, parent, new); + rb_insert_color(&bfqq->weight_counter->weights_node, root); inc_counter: - entity->weight_counter->num_active++; + bfqq->weight_counter->num_active++; } /* - * Decrement the weight counter associated with the entity, and, if the + * Decrement the weight counter associated with the queue, and, if the * counter reaches 0, remove the counter from the tree. * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_entity *entity, + struct bfq_queue *bfqq, struct rb_root *root) { - if (!entity->weight_counter) + if (!bfqq->weight_counter) return; - entity->weight_counter->num_active--; - if (entity->weight_counter->num_active > 0) + bfqq->weight_counter->num_active--; + if (bfqq->weight_counter->num_active > 0) goto reset_entity_pointer; - rb_erase(&entity->weight_counter->weights_node, root); - kfree(entity->weight_counter); + rb_erase(&bfqq->weight_counter->weights_node, root); + kfree(bfqq->weight_counter); reset_entity_pointer: - entity->weight_counter = NULL; + bfqq->weight_counter = NULL; } /* - * Invoke __bfq_weights_tree_remove on bfqq and all its inactive - * parent entities. + * Invoke __bfq_weights_tree_remove on bfqq and decrement the number + * of active groups for each queue's inactive parent entity. */ void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_entity *entity = bfqq->entity.parent; - __bfq_weights_tree_remove(bfqd, &bfqq->entity, + __bfq_weights_tree_remove(bfqd, bfqq, &bfqd->queue_weights_tree); for_each_entity(entity) { @@ -797,17 +796,13 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, * next_in_service for details on why * in_service_entity must be checked too). * - * As a consequence, the weight of entity is - * not to be removed. In addition, if entity - * is active, then its parent entities are - * active as well, and thus their weights are - * not to be removed either. In the end, this - * loop must stop here. + * As a consequence, its parent entities are + * active as well, and thus this loop must + * stop here. */ break; } - __bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); + bfqd->num_active_groups--; } } @@ -3182,6 +3177,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } +static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) +{ + return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + blk_queue_nonrot(bfqq->bfqd->queue) && + bfqq->bfqd->hw_tag; +} + /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. @@ -3291,6 +3293,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, if (ref == 1) /* bfqq is gone, no more actions on it */ return; + bfqq->injected_service = 0; + /* mark bfqq as waiting a request only if a bic still points to it */ if (!bfq_bfqq_busy(bfqq) && reason != BFQQE_BUDGET_TIMEOUT && @@ -3497,9 +3501,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * symmetric scenario where: * (i) each of these processes must get the same throughput as * the others; - * (ii) all these processes have the same I/O pattern - (either sequential or random). - * In fact, in such a scenario, the drive will tend to treat + * (ii) the I/O of each process has the same properties, in + * terms of locality (sequential or random), direction + * (reads or writes), request sizes, greediness + * (from I/O-bound to sporadic), and so on. + * In fact, in such a scenario, the drive tends to treat * the requests of each of these processes in about the same * way as the requests of the others, and thus to provide * each of these processes with about the same throughput @@ -3508,18 +3514,50 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * certainly needed to guarantee that bfqq receives its * assigned fraction of the device throughput (see [1] for * details). + * The problem is that idling may significantly reduce + * throughput with certain combinations of types of I/O and + * devices. An important example is sync random I/O, on flash + * storage with command queueing. So, unless bfqq falls in the + * above cases where idling also boosts throughput, it would + * be important to check conditions (i) and (ii) accurately, + * so as to avoid idling when not strictly needed for service + * guarantees. + * + * Unfortunately, it is extremely difficult to thoroughly + * check condition (ii). And, in case there are active groups, + * it becomes very difficult to check condition (i) too. In + * fact, if there are active groups, then, for condition (i) + * to become false, it is enough that an active group contains + * more active processes or sub-groups than some other active + * group. We address this issue with the following bi-modal + * behavior, implemented in the function + * bfq_symmetric_scenario(). * - * We address this issue by controlling, actually, only the - * symmetry sub-condition (i), i.e., provided that - * sub-condition (i) holds, idling is not performed, - * regardless of whether sub-condition (ii) holds. In other - * words, only if sub-condition (i) holds, then idling is + * If there are active groups, then the scenario is tagged as + * asymmetric, conservatively, without checking any of the + * conditions (i) and (ii). So the device is idled for bfqq. + * This behavior matches also the fact that groups are created + * exactly if controlling I/O (to preserve bandwidth and + * latency guarantees) is a primary concern. + * + * On the opposite end, if there are no active groups, then + * only condition (i) is actually controlled, i.e., provided + * that condition (i) holds, idling is not performed, + * regardless of whether condition (ii) holds. In other words, + * only if condition (i) does not hold, then idling is * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. The reason - * for not controlling also sub-condition (ii) is that we - * exploit preemption to preserve guarantees in case of - * symmetric scenarios, even if (ii) does not hold, as - * explained in the next two paragraphs. + * many requests, possibly of several processes. Since there + * are no active groups, then, to control condition (i) it is + * enough to check whether all active queues have the same + * weight. + * + * Not checking condition (ii) evidently exposes bfqq to the + * risk of getting less throughput than its fair share. + * However, for queues with the same weight, a further + * mechanism, preemption, mitigates or even eliminates this + * problem. And it does so without consequences on overall + * throughput. This mechanism and its benefits are explained + * in the next three paragraphs. * * Even if a queue, say Q, is expired when it remains idle, Q * can still preempt the new in-service queue if the next @@ -3533,11 +3571,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * idling allows the internal queues of the device to contain * many requests, and thus to reorder requests, we can rather * safely assume that the internal scheduler still preserves a - * minimum of mid-term fairness. The motivation for using - * preemption instead of idling is that, by not idling, - * service guarantees are preserved without minimally - * sacrificing throughput. In other words, both a high - * throughput and its desired distribution are obtained. + * minimum of mid-term fairness. * * More precisely, this preemption-based, idleless approach * provides fairness in terms of IOPS, and not sectors per @@ -3556,22 +3590,27 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * 1024/8 times as high as the service received by the other * queue. * - * On the other hand, device idling is performed, and thus - * pure sector-domain guarantees are provided, for the - * following queues, which are likely to need stronger - * throughput guarantees: weight-raised queues, and queues - * with a higher weight than other queues. When such queues - * are active, sub-condition (i) is false, which triggers - * device idling. + * The motivation for using preemption instead of idling (for + * queues with the same weight) is that, by not idling, + * service guarantees are preserved (completely or at least in + * part) without minimally sacrificing throughput. And, if + * there is no active group, then the primary expectation for + * this device is probably a high throughput. * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. To compute the - * value of this variable, we not only use the return value of - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also - * weight-raised queues (see comments on - * bfq_weights_tree_add()). + * We are now left only with explaining the additional + * compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain this + * compound condition, we need to add that the function + * bfq_symmetric_scenario checks the weights of only + * non-weight-raised queues, for efficiency reasons (see + * comments on bfq_weights_tree_add()). Then the fact that + * bfqq is weight-raised is checked explicitly here. More + * precisely, the compound condition below takes into account + * also the fact that, even if bfqq is being weight-raised, + * the scenario is still symmetric if all active queues happen + * to be weight-raised. Actually, we should be even more + * precise here, and differentiate between interactive weight + * raising and soft real-time weight raising. * * As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the @@ -3583,7 +3622,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * to let requests be served in the desired order until all * the requests already queued in the device have been served. */ - asymmetric_scenario = bfqq->wr_coeff > 1 || + asymmetric_scenario = (bfqq->wr_coeff > 1 && + bfqd->wr_busy_queues < bfqd->busy_queues) || !bfq_symmetric_scenario(bfqd); /* @@ -3629,6 +3669,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); } +static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + /* + * A linear search; but, with a high probability, very few + * steps are needed to find a candidate queue, i.e., a queue + * with enough budget left for its next request. In fact: + * - BFQ dynamically updates the budget of every queue so as + * to accommodate the expected backlog of the queue; + * - if a queue gets all its requests dispatched as injected + * service, then the queue is removed from the active list + * (and re-added only if it gets new requests, but with + * enough budget for its new backlog). + */ + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) + return bfqq; + + return NULL; +} + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. @@ -3710,10 +3774,19 @@ check_queue: * No requests pending. However, if the in-service queue is idling * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. + * + * Yet, to boost throughput, inject service from other queues if + * possible. */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - bfqq = NULL; + if (bfq_bfqq_injectable(bfqq) && + bfqq->injected_service * bfqq->inject_coeff < + bfqq->entity.service * 10) + bfqq = bfq_choose_bfqq_for_injection(bfqd); + else + bfqq = NULL; + goto keep_queue; } @@ -3803,6 +3876,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); + if (bfqq != bfqd->in_service_queue) { + if (likely(bfqd->in_service_queue)) + bfqd->in_service_queue->injected_service += + bfq_serv_to_charge(rq, bfqq); + + goto return_rq; + } + /* * If weight raising has to terminate for bfqq, then next * function causes an immediate update of bfqq's weight, @@ -3821,13 +3902,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - - return rq; + if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq))) + goto return_rq; -expire: bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + +return_rq: return rq; } @@ -4232,6 +4312,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_mark_bfqq_has_short_ttime(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); + /* + * Aggressively inject a lot of service: up to 90%. + * This coefficient remains constant during bfqq life, + * but this behavior might be changed, after enough + * testing and tuning. + */ + bfqq->inject_coeff = 1; } else bfq_clear_bfqq_sync(bfqq); @@ -4297,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; @@ -5330,7 +5417,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; + bfqd->num_active_groups = 0; INIT_LIST_HEAD(&bfqd->active_list); INIT_LIST_HEAD(&bfqd->idle_list); |