-rw-r--r--  block/bfq-iosched.c         76
-rw-r--r--  block/bfq-iosched.h         51
-rw-r--r--  block/bfq-wf2q.c             5
-rw-r--r--  block/blk-mq.c              33
-rw-r--r--  drivers/nvme/host/core.c    10
-rw-r--r--  drivers/nvme/target/rdma.c   3
6 files changed, 123 insertions(+), 55 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..97337214bec4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
) ||
- (bfqd->num_active_groups > 0
+ (bfqd->num_groups_with_pending_reqs > 0
#endif
);
}
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
*/
break;
}
- bfqd->num_active_groups--;
+
+ /*
+ * The decrement of num_groups_with_pending_reqs is
+ * not performed immediately upon the deactivation of
+ * entity; it is delayed until the first leaf
+ * descendant bfqq of entity gets all its pending
+ * requests completed. The following instructions
+ * perform this delayed decrement, if needed. See the
+ * comments on num_groups_with_pending_reqs for
+ * details.
+ */
+ if (entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = false;
+ bfqd->num_groups_with_pending_reqs--;
+ }
}
}
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* fact, if there are active groups, then, for condition (i)
* to become false, it is enough that an active group contains
* more active processes or sub-groups than some other active
- * group. We address this issue with the following bi-modal
- * behavior, implemented in the function
+ * group. More precisely, for condition (i) to hold because of
+ * such a group, it is not even necessary that the group is
+ * (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still
+ * have some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed
+ * their share of the throughput even after being
+ * dispatched. In this respect, it is easy to show that, if a
+ * group frequently becomes inactive while still having
+ * in-flight requests, and if, when this happens, the group is
+ * not considered in the calculation of whether the scenario
+ * is asymmetric, then the group may fail to be guaranteed its
+ * fair share of the throughput (basically because idling may
+ * not be performed for the descendant processes of the group,
+ * even though it should have been). We address this issue
+ * with the following bi-modal behavior, implemented in the
+ * function bfq_symmetric_scenario().
*
- * If there are active groups, then the scenario is tagged as
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
* asymmetric, conservatively, without checking any of the
* conditions (i) and (ii). So the device is idled for bfqq.
* This behavior also matches the fact that groups are created
- * exactly if controlling I/O (to preserve bandwidth and
- * latency guarantees) is a primary concern.
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
*
- * On the opposite end, if there are no active groups, then
- * only condition (i) is actually controlled, i.e., provided
- * that condition (i) holds, idling is not performed,
- * regardless of whether condition (ii) holds. In other words,
- * only if condition (i) does not hold, then idling is
- * allowed, and the device tends to be prevented from queueing
- * many requests, possibly of several processes. Since there
- * are no active groups, then, to control condition (i) it is
- * enough to check whether all active queues have the same
- * weight.
+ * On the opposite end, if there are no groups with requests
+ * waiting for completion, then only condition (i) is actually
+ * controlled, i.e., provided that condition (i) holds, idling
+ * is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if condition (i) does not hold,
+ * then idling is allowed, and the device tends to be
+ * prevented from queueing many requests, possibly of several
+ * processes. Since there are no groups with requests waiting
+ * for completion, to control condition (i) it is enough to
+ * check just whether all the queues with requests waiting for
+ * completion also have the same weight.
*
* Not checking condition (ii) evidently exposes bfqq to the
* risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* bfqq is weight-raised is checked explicitly here. More
* precisely, the compound condition below also takes into
* account the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all active queues happen
- * to be weight-raised. Actually, we should be even more
- * precise here, and differentiate between interactive weight
- * raising and soft real-time weight raising.
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
*
* As a side note, it is worth considering that the above
* device-idling countermeasures may however fail in the
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
bfqd->queue_weights_tree = RB_ROOT;
- bfqd->num_active_groups = 0;
+ bfqd->num_groups_with_pending_reqs = 0;
INIT_LIST_HEAD(&bfqd->active_list);
INIT_LIST_HEAD(&bfqd->idle_list);
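The bi-modal behavior described in the long comment above reduces, in this version of the check, to a single conservative test: if queue weights vary, or if any group still has pending requests (even an already-inactive one), the scenario is treated as asymmetric and idling is kept on for bfqq. A minimal userspace sketch of that test follows; struct bfqd_like and varied_queue_weights are illustrative names, not kernel identifiers:

#include <stdbool.h>

struct bfqd_like {
	bool varied_queue_weights;		/* condition (i) input */
	unsigned int num_groups_with_pending_reqs;
};

static bool symmetric_scenario(const struct bfqd_like *bfqd)
{
	/*
	 * Conservative test: any group with pending requests is
	 * enough to tag the scenario as asymmetric, without checking
	 * conditions (i) and (ii) for the group at all.
	 */
	return !(bfqd->varied_queue_weights ||
		 bfqd->num_groups_with_pending_reqs > 0);
}

int main(void)
{
	struct bfqd_like bfqd = { .varied_queue_weights = false,
				  .num_groups_with_pending_reqs = 1 };

	/* One group still has in-flight I/O: asymmetric, keep idling. */
	return symmetric_scenario(&bfqd) ? 1 : 0;
}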
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..0b02bf302de0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
/* flag, set to request a weight, ioprio or ioprio_class change */
int prio_changed;
+
+ /* flag, set if the entity is counted in groups_with_pending_reqs */
+ bool in_groups_with_pending_reqs;
};
struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
* bfq_weights_tree_[add|remove] for further details).
*/
struct rb_root queue_weights_tree;
+
/*
- * number of groups with requests still waiting for completion
+ * Number of groups with at least one descendant process that
+ * has at least one request waiting for completion. Note that
+ * this accounts also for requests already dispatched, but not
+ * yet completed. Therefore this number of groups may differ
+ * from (be larger than) the number of active groups, as a group is
+ * considered active only if its corresponding entity has
+ * descendant queues with at least one request queued. This
+ * number is used to decide whether a scenario is symmetric.
+ * For a detailed explanation see comments on the computation
+ * of the variable asymmetric_scenario in the function
+ * bfq_better_to_idle().
+ *
+ * However, it is hard to compute this number exactly, for
+ * groups with multiple descendant processes. Consider a group
+ * that is inactive, i.e., that has no descendant process with
+ * pending I/O inside BFQ queues. Then suppose that
+ * num_groups_with_pending_reqs is still accounting for this
+ * group, because the group has descendant processes with some
+ * I/O request still in flight. num_groups_with_pending_reqs
+ * should be decremented when the in-flight request of the
+ * last descendant process is finally completed (assuming that
+ * nothing else has changed for the group in the meantime, in
+ * terms of composition of the group and active/inactive state
+ * of child groups and processes). To accomplish this, an
+ * additional pending-request counter must be added to
+ * entities, and must be updated correctly. To avoid this
+ * additional field and operations, we resort to the following
+ * tradeoff between simplicity and
+ * accuracy: for an inactive group that is still counted in
+ * num_groups_with_pending_reqs, we decrement
+ * num_groups_with_pending_reqs when the first descendant
+ * process of the group remains with no request waiting for
+ * completion.
+ *
+ * Even this simpler decrement strategy requires some
+ * care: to avoid multiple decrements, we flag a group,
+ * more precisely an entity representing a group, as still
+ * counted in num_groups_with_pending_reqs when it becomes
+ * inactive. Then, when the first descendant queue of the
+ * entity remains with no request waiting for completion,
+ * num_groups_with_pending_reqs is decremented, and this flag
+ * is reset. After this flag is reset for the entity,
+ * num_groups_with_pending_reqs won't be decremented any
+ * longer in case a new descendant queue of the entity remains
+ * with no request waiting for completion.
*/
- unsigned int num_active_groups;
+ unsigned int num_groups_with_pending_reqs;
/*
* Number of bfq_queues containing requests (including the
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4b0d5fb69160..63e0f12be7c9 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
container_of(entity, struct bfq_group, entity);
struct bfq_data *bfqd = bfqg->bfqd;
- bfqd->num_active_groups++;
+ if (!entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = true;
+ bfqd->num_groups_with_pending_reqs++;
+ }
}
#endif
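__bfq_activate_entity() above performs the increment half of the flag-guarded counting whose decrement half appears in bfq_weights_tree_remove() earlier in this diff. A self-contained sketch of the pattern follows, with illustrative names (entity_like and the two helpers are not kernel code): the per-entity flag makes both operations idempotent, so a group can never be counted or uncounted twice in num_groups_with_pending_reqs.

#include <assert.h>
#include <stdbool.h>

struct entity_like {
	bool in_groups_with_pending_reqs;
};

static unsigned int num_groups_with_pending_reqs;

/* Activation path (__bfq_activate_entity): count the group once. */
static void mark_group_pending(struct entity_like *entity)
{
	if (!entity->in_groups_with_pending_reqs) {
		entity->in_groups_with_pending_reqs = true;
		num_groups_with_pending_reqs++;
	}
}

/*
 * Completion path (bfq_weights_tree_remove): delayed decrement,
 * performed at most once thanks to the flag.
 */
static void clear_group_pending(struct entity_like *entity)
{
	if (entity->in_groups_with_pending_reqs) {
		entity->in_groups_with_pending_reqs = false;
		num_groups_with_pending_reqs--;
	}
}

int main(void)
{
	struct entity_like e = { false };

	mark_group_pending(&e);
	mark_group_pending(&e);		/* idempotent: counted once */
	assert(num_groups_with_pending_reqs == 1);

	clear_group_pending(&e);
	clear_group_pending(&e);	/* idempotent: no underflow */
	assert(num_groups_with_pending_reqs == 0);
	return 0;
}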
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3262d83b9e07..6a7566244de3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1715,15 +1715,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
- /*
- * If direct dispatch fails, we cannot allow any merging on
- * this IO. Drivers (like SCSI) may have set up permanent state
- * for this request, like SG tables and mappings, and if we
- * merge to it later on then we'll still only do IO to the
- * original part.
- */
- rq->cmd_flags |= REQ_NOMERGE;
-
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
@@ -1736,18 +1727,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
return ret;
}
-/*
- * Don't allow direct dispatch of anything but regular reads/writes,
- * as some of the other commands can potentially share request space
- * with data we need for the IO scheduler. If we attempt a direct dispatch
- * on those and fail, we can't safely add it to the scheduler afterwards
- * without potentially overwriting data that the driver has already written.
- */
-static bool blk_rq_can_direct_dispatch(struct request *rq)
-{
- return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
-}
-
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
@@ -1769,7 +1748,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
goto insert;
}
- if (!blk_rq_can_direct_dispatch(rq) || (q->elevator && !bypass_insert))
+ if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_dispatch_budget(hctx))
@@ -1785,7 +1764,7 @@ insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
- blk_mq_sched_insert_request(rq, false, run_queue, false);
+ blk_mq_request_bypass_insert(rq, run_queue);
return BLK_STS_OK;
}
@@ -1801,7 +1780,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_sched_insert_request(rq, false, true, false);
+ blk_mq_request_bypass_insert(rq, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
@@ -1831,15 +1810,13 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq = list_first_entry(list, struct request,
queuelist);
- if (!blk_rq_can_direct_dispatch(rq))
- break;
-
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq);
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
- list_add(&rq->queuelist, list);
+ blk_mq_request_bypass_insert(rq,
+ list_empty(list));
break;
}
blk_mq_end_request(rq, ret);
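With blk_rq_can_direct_dispatch() and the REQ_NOMERGE marking gone, the invariant that replaces them is visible in the hunks above: a request that fails direct issue with BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE is now inserted through blk_mq_request_bypass_insert(), i.e. straight onto the dispatch list, so it can never reach the I/O scheduler and be merged after the driver has already set up state (such as SG tables) for it. A userspace sketch of the resulting policy, with illustrative types and stand-in functions:

#include <stdio.h>

enum status { STS_OK, STS_RESOURCE, STS_IOERR };

struct request_like { int id; };

/*
 * Stand-in for the driver's queue_rq(): pretend odd ids hit a
 * temporary resource shortage.
 */
static enum status driver_issue(struct request_like *rq)
{
	return rq->id & 1 ? STS_RESOURCE : STS_OK;
}

/*
 * Analogue of blk_mq_request_bypass_insert(): straight to the
 * dispatch list, never through the I/O scheduler.
 */
static void bypass_insert(struct request_like *rq)
{
	printf("rq %d -> dispatch list (no scheduler, no merging)\n",
	       rq->id);
}

static void end_request(struct request_like *rq, enum status s)
{
	printf("rq %d completed with error %d\n", rq->id, s);
}

static void try_issue_directly(struct request_like *rq)
{
	enum status ret = driver_issue(rq);

	if (ret == STS_RESOURCE)
		bypass_insert(rq);	/* never back into the scheduler */
	else if (ret != STS_OK)
		end_request(rq, ret);
	else
		printf("rq %d issued directly\n", rq->id);
}

int main(void)
{
	struct request_like a = { 1 }, b = { 2 };

	try_issue_directly(&a);
	try_issue_directly(&b);
	return 0;
}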
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3cf1b773158e..962012135b62 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -831,6 +831,8 @@ static int nvme_submit_user_cmd(struct request_queue *q,
static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
+ unsigned long flags;
+ bool startka = false;
blk_mq_free_request(rq);
@@ -841,7 +843,13 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
return;
}
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ spin_lock_irqsave(&ctrl->lock, flags);
+ if (ctrl->state == NVME_CTRL_LIVE ||
+ ctrl->state == NVME_CTRL_CONNECTING)
+ startka = true;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ if (startka)
+ schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
static int nvme_keep_alive(struct nvme_ctrl *ctrl)
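The keep-alive change samples the controller state under ctrl->lock but re-arms the delayed work only after dropping it, so ka_work is never scheduled for a controller that is no longer live or connecting, and no work is queued while the lock is held. A pthread-based userspace sketch of this decide-under-lock, act-outside-lock pattern (ctrl_like and schedule_keep_alive() are illustrative stand-ins, not the NVMe API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum ctrl_state { CTRL_LIVE, CTRL_CONNECTING, CTRL_DELETING };

struct ctrl_like {
	pthread_mutex_t lock;
	enum ctrl_state state;
};

/* Hypothetical stand-in for schedule_delayed_work(&ctrl->ka_work, ...). */
static void schedule_keep_alive(struct ctrl_like *ctrl)
{
	(void)ctrl;
	printf("keep-alive re-armed\n");
}

static void keep_alive_end_io(struct ctrl_like *ctrl)
{
	bool startka = false;

	pthread_mutex_lock(&ctrl->lock);
	if (ctrl->state == CTRL_LIVE || ctrl->state == CTRL_CONNECTING)
		startka = true;		/* decide under the lock ... */
	pthread_mutex_unlock(&ctrl->lock);

	if (startka)
		schedule_keep_alive(ctrl);	/* ... act outside it */
}

int main(void)
{
	struct ctrl_like ctrl = { PTHREAD_MUTEX_INITIALIZER, CTRL_LIVE };

	keep_alive_end_io(&ctrl);	/* LIVE: re-arms */
	ctrl.state = CTRL_DELETING;
	keep_alive_end_io(&ctrl);	/* DELETING: does not re-arm */
	return 0;
}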
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 3f7971d3706d..583086dd9cb9 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -529,6 +529,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_rsp *rsp =
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
+ struct nvmet_rdma_queue *queue = cq->cq_context;
nvmet_rdma_release_rsp(rsp);
@@ -536,7 +537,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
wc->status != IB_WC_WR_FLUSH_ERR)) {
pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
- nvmet_rdma_error_comp(rsp->queue);
+ nvmet_rdma_error_comp(queue);
}
}
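The nvmet-rdma hunk fixes a potential use-after-free: nvmet_rdma_release_rsp() may recycle or free rsp, so rsp->queue must not be dereferenced afterwards; the queue pointer is instead taken from cq->cq_context before the release. A minimal sketch of this save-before-free pattern (all types and helpers below are illustrative, not the nvmet code):

#include <stdio.h>
#include <stdlib.h>

struct queue_like { int id; };
struct rsp_like { struct queue_like *queue; };

/*
 * Analogue of nvmet_rdma_release_rsp(): after this call rsp may be
 * freed or recycled, so none of its fields may be read anymore.
 */
static void release_rsp(struct rsp_like *rsp)
{
	free(rsp);
}

static void report_error(struct queue_like *queue)
{
	printf("error on queue %d\n", queue->id);
}

static void send_done(struct rsp_like *rsp, void *cq_context)
{
	/*
	 * Capture everything needed later BEFORE releasing rsp; the
	 * fix above does the same by reading cq->cq_context instead
	 * of rsp->queue.
	 */
	struct queue_like *queue = cq_context;

	release_rsp(rsp);	/* rsp must not be touched past here */
	report_error(queue);	/* safe: uses the cached pointer */
}

int main(void)
{
	struct queue_like q = { 7 };
	struct rsp_like *rsp = malloc(sizeof(*rsp));

	if (!rsp)
		return 1;
	rsp->queue = &q;
	send_done(rsp, &q);	/* cq_context carries the queue */
	return 0;
}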