summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig16
-rw-r--r--block/Makefile4
-rw-r--r--block/bfq-iosched.c131
-rw-r--r--block/bfq-iosched.h7
-rw-r--r--block/bfq-wf2q.c30
-rw-r--r--block/bio-integrity.c22
-rw-r--r--block/bio.c263
-rw-r--r--block/blk-cgroup.c284
-rw-r--r--block/blk-core.c123
-rw-r--r--block/blk-ioc.c2
-rw-r--r--block/blk-iolatency.c955
-rw-r--r--block/blk-lib.c10
-rw-r--r--block/blk-mq-debugfs-zoned.c24
-rw-r--r--block/blk-mq-debugfs.c26
-rw-r--r--block/blk-mq-debugfs.h9
-rw-r--r--block/blk-mq-pci.c5
-rw-r--r--block/blk-mq-sched.c112
-rw-r--r--block/blk-mq-tag.c13
-rw-r--r--block/blk-mq.c190
-rw-r--r--block/blk-mq.h13
-rw-r--r--block/blk-rq-qos.c194
-rw-r--r--block/blk-rq-qos.h109
-rw-r--r--block/blk-settings.c6
-rw-r--r--block/blk-softirq.c1
-rw-r--r--block/blk-stat.c16
-rw-r--r--block/blk-stat.h4
-rw-r--r--block/blk-sysfs.c37
-rw-r--r--block/blk-throttle.c32
-rw-r--r--block/blk-timeout.c1
-rw-r--r--block/blk-wbt.c425
-rw-r--r--block/blk-wbt.h68
-rw-r--r--block/blk-zoned.c2
-rw-r--r--block/blk.h7
-rw-r--r--block/bounce.c69
-rw-r--r--block/bsg-lib.c5
-rw-r--r--block/bsg.c462
-rw-r--r--block/cfq-iosched.c23
-rw-r--r--block/genhd.c29
-rw-r--r--block/partition-generic.c25
-rw-r--r--block/partitions/aix.c13
-rw-r--r--block/partitions/ldm.c3
-rw-r--r--block/sed-opal.c4
-rw-r--r--block/t10-pi.c110
43 files changed, 2630 insertions, 1254 deletions
diff --git a/block/Kconfig b/block/Kconfig
index eb50fd4977c2..1f2469a0123c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -149,6 +149,18 @@ config BLK_WBT
dynamically on an algorithm loosely based on CoDel, factoring in
the realtime performance of the disk.
+config BLK_CGROUP_IOLATENCY
+ bool "Enable support for latency based cgroup IO protection"
+ depends on BLK_CGROUP=y
+ default n
+ ---help---
+ Enabling this option enables the .latency interface for IO throttling.
+ The IO controller will attempt to maintain average IO latencies below
+ the configured latency target, throttling anybody with a higher latency
+ target than the victimized group.
+
+ Note, this is an experimental interface and could be changed someday.
+
config BLK_WBT_SQ
bool "Single queue writeback throttling"
default n
@@ -177,6 +189,10 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
+config BLK_DEBUG_FS_ZONED
+ bool
+ default BLK_DEBUG_FS && BLK_DEV_ZONED
+
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
---help---
diff --git a/block/Makefile b/block/Makefile
index 6a56303b9925..572b33f32c07 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o partition-generic.o ioprio.o \
- badblocks.o partitions/
+ badblocks.o partitions/ blk-rq-qos.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
@@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
@@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
+obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 495b9ddb3355..41d9036b1822 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
+ * bfq_better_to_idle()).
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
@@ -742,8 +742,9 @@ inc_counter:
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
- struct rb_root *root)
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root)
{
if (!entity->weight_counter)
return;
@@ -760,6 +761,43 @@ reset_entity_pointer:
}
/*
+ * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
+ * parent entities.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = bfqq->entity.parent;
+
+ __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+
+ for_each_entity(entity) {
+ struct bfq_sched_data *sd = entity->my_sched_data;
+
+ if (sd->next_in_service || sd->in_service_entity) {
+ /*
+ * entity is still active, because either
+ * next_in_service or in_service_entity is not
+ * NULL (see the comments on the definition of
+ * next_in_service for details on why
+ * in_service_entity must be checked too).
+ *
+ * As a consequence, the weight of entity is
+ * not to be removed. In addition, if entity
+ * is active, then its parent entities are
+ * active as well, and thus their weights are
+ * not to be removed either. In the end, this
+ * loop must stop here.
+ */
+ break;
+ }
+ __bfq_weights_tree_remove(bfqd, entity,
+ &bfqd->group_weights_tree);
+ }
+}
+
+/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
* remain unchanged after such an expiration, and the
* following statement therefore assigns to
* entity->budget the remaining budget on such an
- * expiration. For clarity, entity->service is not
- * updated on expiration in any case, and, in normal
- * operation, is reset only when bfqq is selected for
- * service (see bfq_get_next_queue).
+ * expiration.
*/
entity->budget = min_t(unsigned long,
bfq_bfqq_budget_left(bfqq),
bfqq->max_budget);
+ /*
+ * At this point, we have used entity->service to get
+ * the budget left (needed for updating
+ * entity->budget). Thus we finally can, and have to,
+ * reset entity->service. The latter must be reset
+ * because bfqq would otherwise be charged again for
+ * the service it has received during its previous
+ * service slot(s).
+ */
+ entity->service = 0;
+
return true;
}
+ /*
+ * We can finally complete expiration, by setting service to 0.
+ */
+ entity->service = 0;
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
@@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
ref = bfqq->ref;
__bfq_bfqq_expire(bfqd, bfqq);
+ if (ref == 1) /* bfqq is gone, no more actions on it */
+ return;
+
/* mark bfqq as waiting a request only if a bic still points to it */
- if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+ if (!bfq_bfqq_busy(bfqq) &&
reason != BFQQE_BUDGET_TIMEOUT &&
- reason != BFQQE_BUDGET_EXHAUSTED)
+ reason != BFQQE_BUDGET_EXHAUSTED) {
bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+ /*
+ * Not setting service to 0, because, if the next rq
+ * arrives in time, the queue will go on receiving
+ * service with this same budget (as if it never expired)
+ */
+ } else
+ entity->service = 0;
}
/*
@@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
* issues taken into account are not trivial. We discuss these issues
* individually while introducing the variables.
*/
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bool rot_without_queueing =
@@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
}
/*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * If the in-service queue is empty but the function bfq_better_to_idle
* returns true, then:
* 1) the queue must remain in service and cannot be expired, and
* 2) the device must be idled to wait for the possible arrival of a new
* request for the queue.
- * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * See the comments on the function bfq_better_to_idle for the reasons
* why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * and preserve service guarantees when bfq_better_to_idle itself
* returns true.
*/
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
}
/*
@@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+ /*
+ * Do not expire bfqq for budget timeout if bfqq may be about
+ * to enjoy device idling. The reason why, in this case, we
+ * prevent bfqq from expiring is the same as in the comments
+ * on the case where bfq_bfqq_must_idle() returns true, in
+ * bfq_completed_request().
+ */
if (bfq_may_expire_for_budg_timeout(bfqq) &&
- !bfq_bfqq_wait_request(bfqq) &&
!bfq_bfqq_must_idle(bfqq))
goto expire;
@@ -3620,7 +3686,7 @@ check_queue:
* may idle after their completion, then keep it anyway.
*/
if (bfq_bfqq_wait_request(bfqq) ||
- (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
bfqq = NULL;
goto keep_queue;
}
@@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_remove(bfqd, bfqq);
}
now_ns = ktime_get_ns();
@@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* or if we want to idle in case it has no pending requests.
*/
if (bfqd->in_service_queue == bfqq) {
- if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
- bfq_arm_slice_timer(bfqd);
+ if (bfq_bfqq_must_idle(bfqq)) {
+ if (bfqq->dispatched == 0)
+ bfq_arm_slice_timer(bfqd);
+ /*
+ * If we get here, we do not expire bfqq, even
+ * if bfqq was in budget timeout or had no
+ * more requests (as controlled in the next
+ * conditional instructions). The reason for
+ * not expiring bfqq is as follows.
+ *
+ * Here bfqq->dispatched > 0 holds, but
+ * bfq_bfqq_must_idle() returned true. This
+ * implies that, even if no request arrives
+ * for bfqq before bfqq->dispatched reaches 0,
+ * bfqq will, however, not be expired on the
+ * completion event that causes bfqq->dispatched
+ * to reach zero. In contrast, on this event,
+ * bfqq will start enjoying device idling
+ * (I/O-dispatch plugging).
+ *
+ * But, if we expired bfqq here, bfqq would
+ * not have the chance to enjoy device idling
+ * when bfqq->dispatched finally reaches
+ * zero. This would expose bfqq to violation
+ * of its reserved service guarantees.
+ */
return;
} else if (bfq_may_expire_for_budg_timeout(bfqq))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_BUDGET_TIMEOUT);
else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
(bfqq->dispatched == 0 ||
- !bfq_bfqq_may_idle(bfqq)))
+ !bfq_better_to_idle(bfqq)))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_NO_MORE_REQUESTS);
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0f712e03b035..a8a2e5aca4d4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
struct rb_root *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
- struct rb_root *root);
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4498c43245e2..dbc07b456059 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else /* bfq_group */
- bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-
if (bfqg != bfqd->root_group)
bfqg->active_entities++;
#endif
@@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
if (bfqq)
list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else /* bfq_group */
- bfq_weights_tree_remove(bfqd, entity,
- &bfqd->group_weights_tree);
-
if (bfqg != bfqd->root_group)
bfqg->active_entities--;
#endif
@@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (prev_weight != new_weight) {
root = bfqq ? &bfqd->queue_weights_tree :
&bfqd->group_weights_tree;
- bfq_weights_tree_remove(bfqd, entity, root);
+ __bfq_weights_tree_remove(bfqd, entity, root);
}
entity->weight = new_weight;
/*
@@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
* one of its children receives a new request.
*
* Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
+ * inserts entity into its active tree, after possibly extracting it
* from its idle tree.
*/
static void __bfq_activate_entity(struct bfq_entity *entity,
@@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st = true;
}
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
+ struct bfq_group *bfqg =
+ container_of(entity, struct bfq_group, entity);
+
+ bfq_weights_tree_add(bfqg->bfqd, entity,
+ &bfqd->group_weights_tree);
+ }
+#endif
+
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@@ -1542,12 +1545,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
sd->in_service_entity = entity;
/*
- * Reset the accumulator of the amount of service that
- * the entity is about to receive.
- */
- entity->service = 0;
-
- /*
* If entity is no longer a candidate for next
* service, then it must be extracted from its active
* tree, so as to make sure that it won't be
@@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->busy_queues--;
if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, &bfqq->entity,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_remove(bfqd, bfqq);
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index add7c7c85335..67b5fb861a51 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -160,28 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL(bio_integrity_add_page);
/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi: blk_integrity profile for device
- * @sectors: Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device. Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
- unsigned int sectors)
-{
- return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
* @proc_iter: iterator to process
diff --git a/block/bio.c b/block/bio.c
index 9710e275f230..b12966e415d3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,9 +28,11 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-rq-qos.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -156,7 +158,7 @@ out:
unsigned int bvec_nr_vecs(unsigned short idx)
{
- return bvec_slabs[idx].nr_vecs;
+ return bvec_slabs[--idx].nr_vecs;
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
@@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
EXPORT_SYMBOL(bio_clone_fast);
/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_PAGES.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
- */
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_opf = bio_src->bi_opf;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_integrity(bio_src)) {
- int ret;
-
- ret = bio_integrity_clone(bio, bio_src, gfp_mask);
- if (ret < 0) {
- bio_put(bio);
- return NULL;
- }
- }
-
- bio_clone_blkcg_association(bio, bio_src);
-
- return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
@@ -903,25 +828,27 @@ int bio_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL(bio_add_page);
/**
- * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
- * Pins as many pages from *iter and appends them to @bio's bvec array. The
+ * Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
+ * For multi-segment *iter, this function only adds pages from the
+ * next non-empty segment of the iov iterator.
*/
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- size_t offset, diff;
+ size_t offset;
ssize_t size;
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
- nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+ idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* Deep magic below: We need to walk the pinned pages backwards
@@ -934,21 +861,46 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
bio->bi_iter.bi_size += size;
bio->bi_vcnt += nr_pages;
- diff = (nr_pages * PAGE_SIZE - offset) - size;
- while (nr_pages--) {
- bv[nr_pages].bv_page = pages[nr_pages];
- bv[nr_pages].bv_len = PAGE_SIZE;
- bv[nr_pages].bv_offset = 0;
+ while (idx--) {
+ bv[idx].bv_page = pages[idx];
+ bv[idx].bv_len = PAGE_SIZE;
+ bv[idx].bv_offset = 0;
}
bv[0].bv_offset += offset;
bv[0].bv_len -= offset;
- if (diff)
- bv[bio->bi_vcnt - 1].bv_len -= diff;
+ bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
iov_iter_advance(iter, size);
return 0;
}
+
+/**
+ * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * @bio: bio to add pages to
+ * @iter: iov iterator describing the region to be mapped
+ *
+ * Pins pages from *iter and appends them to @bio's bvec array. The
+ * pages will have to be released using put_page() when done.
+ * The function tries, but does not guarantee, to pin as many pages as
+ * fit into the bio, or are requested in *iter, whichever is smaller.
+ * If MM encounters an error pinning the requested pages, it stops.
+ * Error is returned only if 0 pages could be pinned.
+ */
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+ unsigned short orig_vcnt = bio->bi_vcnt;
+
+ do {
+ int ret = __bio_iov_iter_get_pages(bio, iter);
+
+ if (unlikely(ret))
+ return bio->bi_vcnt > orig_vcnt ? 0 : ret;
+
+ } while (iov_iter_count(iter) && !bio_full(bio));
+
+ return 0;
+}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
@@ -1634,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio)
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
+ if (!PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1647,19 +1597,15 @@ static void bio_release_pages(struct bio *bio)
struct bio_vec *bvec;
int i;
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page)
- put_page(page);
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
}
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
* If they are, then fine. If, however, some pages are clean then they must
* have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
* here on. It will run one put_page() against each page and will run one
@@ -1677,78 +1623,70 @@ static struct bio *bio_dirty_list;
*/
static void bio_dirty_fn(struct work_struct *work)
{
- unsigned long flags;
- struct bio *bio;
+ struct bio *bio, *next;
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
+ spin_lock_irq(&bio_dirty_lock);
+ next = bio_dirty_list;
bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ spin_unlock_irq(&bio_dirty_lock);
- while (bio) {
- struct bio *next = bio->bi_private;
+ while ((bio = next) != NULL) {
+ next = bio->bi_private;
bio_set_pages_dirty(bio);
bio_release_pages(bio);
bio_put(bio);
- bio = next;
}
}
void bio_check_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
- int nr_clean_pages = 0;
+ unsigned long flags;
int i;
bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- put_page(page);
- bvec->bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
+ if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+ goto defer;
}
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
+ bio_release_pages(bio);
+ bio_put(bio);
+ return;
+defer:
+ spin_lock_irqsave(&bio_dirty_lock, flags);
+ bio->bi_private = bio_dirty_list;
+ bio_dirty_list = bio;
+ spin_unlock_irqrestore(&bio_dirty_lock, flags);
+ schedule_work(&bio_dirty_work);
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
+ const int sgrp = op_stat_group(op);
int cpu = part_stat_lock();
part_round_stats(q, cpu, part);
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, sectors[rw], sectors);
- part_inc_in_flight(q, part, rw);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, sectors[sgrp], sectors);
+ part_inc_in_flight(q, part, op_is_write(op));
part_stat_unlock();
}
EXPORT_SYMBOL(generic_start_io_acct);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int req_op,
struct hd_struct *part, unsigned long start_time)
{
unsigned long duration = jiffies - start_time;
+ const int sgrp = op_stat_group(req_op);
int cpu = part_stat_lock();
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(q, cpu, part);
- part_dec_in_flight(q, part, rw);
+ part_dec_in_flight(q, part, op_is_write(req_op));
part_stat_unlock();
}
@@ -1807,8 +1745,8 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
- bio->bi_next = NULL;
+ if (bio->bi_disk)
+ rq_qos_done_bio(bio->bi_disk->queue, bio);
/*
* Need to have a real endio function for chained bios, otherwise
@@ -1869,6 +1807,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
+ bio->bi_iter.bi_done = 0;
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(split, BIO_TRACE_COMPLETION);
@@ -2017,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
#ifdef CONFIG_BLK_CGROUP
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg. This works like
+ * every other associate function wrt references.
+ */
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *blkcg_css;
+
+ if (unlikely(bio->bi_css))
+ return -EBUSY;
+ if (!page->mem_cgroup)
+ return 0;
+ blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+ &io_cgrp_subsys);
+ bio->bi_css = blkcg_css;
+ return 0;
+}
+#endif /* CONFIG_MEMCG */
+
/**
* bio_associate_blkcg - associate a bio with the specified blkcg
* @bio: target bio
@@ -2040,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg. This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+ if (unlikely(bio->bi_blkg))
+ return -EBUSY;
+ blkg_get(blkg);
+ bio->bi_blkg = blkg;
+ return 0;
+}
+
+/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
*/
@@ -2053,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio)
css_put(bio->bi_css);
bio->bi_css = NULL;
}
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
}
/**
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb85cb87c40f..694595b29b8f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
+static bool blkcg_debug_stats = false;
+
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
[BLKG_RWSTAT_WRITE] = "Write",
[BLKG_RWSTAT_SYNC] = "Sync",
[BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
};
const char *dname = blkg_dev_name(pd->blkg);
u64 v;
@@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
(unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
+ atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
return v;
}
@@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
const char *dname;
+ char *buf;
struct blkg_rwstat rwstat;
- u64 rbytes, wbytes, rios, wios;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ size_t size = seq_get_buf(sf, &buf), off = 0;
+ int i;
+ bool has_stats = false;
dname = blkg_dev_name(blkg);
if (!dname)
continue;
+ /*
+ * Hooray string manipulation, count is the size written NOT
+ * INCLUDING THE \0, so size is now count+1 less than what we
+ * had before, but we want to start writing the next bit from
+ * the \0 so we only add count to buf.
+ */
+ off += scnprintf(buf+off, size-off, "%s ", dname);
+
spin_lock_irq(blkg->q->queue_lock);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_bytes));
rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
offsetof(struct blkcg_gq, stat_ios));
rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+ dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
spin_unlock_irq(blkg->q->queue_lock);
- if (rbytes || wbytes || rios || wios)
- seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
- dname, rbytes, wbytes, rios, wios);
+ if (rbytes || wbytes || rios || wios) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
+
+ if (!blkcg_debug_stats)
+ goto next;
+
+ if (atomic_read(&blkg->use_delay)) {
+ has_stats = true;
+ off += scnprintf(buf+off, size-off,
+ " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ (unsigned long long)atomic64_read(&blkg->delay_nsec));
+ }
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+ size_t written;
+
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
+
+ written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
+ if (written)
+ has_stats = true;
+ off += written;
+ }
+next:
+ if (has_stats) {
+ off += scnprintf(buf+off, size-off, "\n");
+ seq_commit(sf, off);
+ }
}
rcu_read_unlock();
@@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
+ ret = blk_iolatency_init(q);
+ if (ret) {
+ spin_lock_irq(q->queue_lock);
+ blkg_destroy_all(q);
+ spin_unlock_irq(q->queue_lock);
+ return ret;
+ }
+
ret = blk_throtl_init(q);
if (ret) {
spin_lock_irq(q->queue_lock);
@@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&blkcg_pol_mutex);
}
+/*
+ * Installed as the ->exit callback of io_cgrp_subsys: drop the request_queue
+ * reference the task may still be holding in ->throttle_queue and clear the
+ * pointer.
+ */
+static void blkcg_exit(struct task_struct *tsk)
+{
+	struct request_queue *pending = tsk->throttle_queue;
+
+	if (pending)
+		blk_put_queue(pending);
+	tsk->throttle_queue = NULL;
+}
+
struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
.css_offline = blkcg_css_offline,
@@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
.dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
.legacy_name = "blkio",
+ .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
/*
* This ensures that, if available, memcg is automatically enabled
@@ -1547,3 +1615,209 @@ out_unlock:
mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+
+/*
+ * Age the delay accumulated on @blkg according to how long it has been since
+ * blkg->delay_start.  This runs when new delay is being charged (it may have
+ * been a while since the last charge) and when a task is about to be delayed,
+ * so that old delay is decayed before it is acted upon.
+ *
+ * @blkg: group whose delay_nsec is decayed
+ * @now: current time in nanoseconds
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+	u64 window_start = atomic64_read(&blkg->delay_start);
+	u64 accumulated;
+	u64 decay;
+	int use_now;
+
+	/*
+	 * Decay at most once per second.  Delay is meant to be charged in one
+	 * second windows: blkg->last_delay remembers what was still owed at
+	 * the start of the current window and blkg->last_use remembers the
+	 * use_delay counter seen at that time.
+	 */
+	if (!time_before64(window_start + NSEC_PER_SEC, now))
+		return;
+
+	/* Whoever moves delay_start forward does the decay; others back off. */
+	if (atomic64_cmpxchg(&blkg->delay_start, window_start, now) != window_start)
+		return;
+
+	accumulated = atomic64_read(&blkg->delay_nsec);
+	decay = min_t(u64, blkg->last_delay, now - window_start);
+	use_now = atomic_read(&blkg->use_delay);
+
+	/*
+	 * use_delay dropped since the last window, i.e. we are being
+	 * unthrottled: forgive at least half of what was owed.
+	 */
+	if (use_now < blkg->last_use)
+		decay = max_t(u64, decay, blkg->last_delay >> 1);
+
+	/*
+	 * delay_nsec should only ever shrink here, so this should not
+	 * underflow, but clamp to zero rather than wrap if it ever would.
+	 */
+	if (unlikely(accumulated < decay)) {
+		atomic64_set(&blkg->delay_nsec, 0);
+		blkg->last_delay = 0;
+	} else {
+		atomic64_sub(decay, &blkg->delay_nsec);
+		blkg->last_delay = accumulated - decay;
+	}
+	blkg->last_use = use_now;
+}
+
+/*
+ * Walk from @blkg up the hierarchy, find the largest delay accumulated by any
+ * group on that path that currently has use_delay set, and sleep for that long
+ * (capped, see below). This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ *
+ * @blkg: group to start the walk from; a group without a parent is not checked
+ * @use_memdelay: not used yet, see the TODO in the body
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ u64 exp;
+ u64 delay_nsec = 0;
+ int tok;
+
+ while (blkg->parent) {
+ if (atomic_read(&blkg->use_delay)) {
+ /* Decay stale delay before looking at the total. */
+ blkcg_scale_delay(blkg, now);
+ delay_nsec = max_t(u64, delay_nsec,
+ atomic64_read(&blkg->delay_nsec));
+ }
+ blkg = blkg->parent;
+ }
+
+ if (!delay_nsec)
+ return;
+
+ /*
+ * Let's not sleep for all eternity if we've amassed a huge delay.
+ * Swapping or metadata IO can accumulate 10's of seconds worth of
+ * delay, and we want userspace to be able to do _something_ so cap the
+ * delay at 250ms per call. If there's 10's of seconds worth of delay
+ * then the task will be delayed for 250ms each time it ends up here.
+ */
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+ /*
+ * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+ * that hasn't landed upstream yet. Once that stuff is in place we need
+ * to do a psi_memstall_enter/leave if memdelay is set.
+ */
+
+ exp = ktime_add_ns(now, delay_nsec);
+ tok = io_schedule_prepare();
+ /*
+ * Killable sleep against the absolute deadline: stop once
+ * schedule_hrtimeout() returns 0, otherwise go back to sleep unless a
+ * fatal signal is pending.
+ */
+ do {
+ __set_current_state(TASK_KILLABLE);
+ if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+ break;
+ } while (!fatal_signal_pending(current));
+ io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * Only reached after the task was marked with set_notify_resume().  Since that
+ * can happen for reasons unrelated to blkcg, nothing is done unless
+ * current->throttle_queue is set.  This is meant to be called from the resume
+ * code only: when the task is set up for throttling it really will sleep here.
+ *
+ * The queue reference stashed in current->throttle_queue is consumed on every
+ * path that gets past the initial check.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+	struct request_queue *q = current->throttle_queue;
+	bool use_memdelay = current->use_memdelay;
+	struct cgroup_subsys_state *css;
+	struct blkcg_gq *blkg = NULL;
+	struct blkcg *blkcg;
+
+	if (!q)
+		return;
+
+	/* One-shot: clear the marks before doing the work. */
+	current->throttle_queue = NULL;
+	current->use_memdelay = false;
+
+	/* Resolve and pin the blkg under RCU; the sleep happens outside of it. */
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (!css)
+		css = task_css(current, io_cgrp_id);
+	blkcg = css_to_blkcg(css);
+	if (blkcg)
+		blkg = blkg_lookup(blkcg, q);
+	if (blkg)
+		blkg = blkg_try_get(blkg);
+	rcu_read_unlock();
+
+	if (blkg) {
+		blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+		blkg_put(blkg);
+	}
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q: the request queue IO was submitted on
+ * @use_memdelay: do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task.  The blkg is not passed in because some callers
+ * do not have it; the swapping code for instance will only have a
+ * request_queue at that point.  This sets notify_resume for the task so it
+ * checks whether it requires throttling before returning to user space.
+ *
+ * We will only schedule once per syscall.  Calling this repeatedly results in
+ * a single check, and at most a single throttle, on return to user space.  If
+ * the task needs to be throttled again it has to be marked again the next time
+ * we see it.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+	struct request_queue *prev;
+
+	/* Kernel threads are left alone; also bail if @q cannot be pinned. */
+	if (unlikely(current->flags & PF_KTHREAD) || !blk_get_queue(q))
+		return;
+
+	/* Only one queue is remembered, release the one being replaced. */
+	prev = current->throttle_queue;
+	if (prev)
+		blk_put_queue(prev);
+	current->throttle_queue = q;
+
+	/* A memdelay request is sticky until the throttle actually runs. */
+	if (use_memdelay)
+		current->use_memdelay = use_memdelay;
+	set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @blkg: the blkg to charge
+ * @now: the current time in nanoseconds
+ * @delta: how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling. The
+ * existing accumulation is decayed via blkcg_scale_delay() before @delta is
+ * added.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+ blkcg_scale_delay(blkg, now);
+ atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
+module_param(blkcg_debug_stats, bool, 0644);
+MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index cf0ee764b908..12550340418d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -42,7 +42,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
-#include "blk-wbt.h"
+#include "blk-rq-qos.h"
#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
@@ -273,10 +273,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */
- /*
- * XXX this code looks suspicious - it's not consistent with advancing
- * req->bio in caller
- */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
@@ -719,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
+/*
+ * Unconfigure the I/O scheduler and dissociate from the cgroup controller.
+ *
+ * Tears down, in this order: the elevator (if one is attached), the blkcg
+ * association, and finally the queue's reference on its backing_dev_info.
+ * The comments in the body explain why the order matters.
+ */
+void blk_exit_queue(struct request_queue *q)
+{
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ ioc_clear_queue(q);
+ elevator_exit(q, q->elevator);
+ q->elevator = NULL;
+ }
+
+ /*
+ * Remove all references to @q from the block cgroup controller before
+ * restoring @q->queue_lock to avoid that restoring this pointer causes
+ * e.g. blkcg_print_blkgs() to crash.
+ */
+ blkcg_exit_queue(q);
+
+ /*
+ * Since the cgroup code may dereference the @q->backing_dev_info
+ * pointer, only decrease its reference count after having removed the
+ * association with the block cgroup controller.
+ */
+ bdi_put(q->backing_dev_info);
+}
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
@@ -766,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q)
* make sure all in-progress dispatch are completed because
* blk_freeze_queue() can only complete all requests, and
* dispatch may still be in-progress since we dispatch requests
- * from more than one contexts
+ * from more than one contexts.
+ *
+ * No need to quiesce queue if it isn't initialized yet since
+ * blk_freeze_queue() should be enough for cases of passthrough
+ * request.
*/
- if (q->mq_ops)
+ if (q->mq_ops && blk_queue_init_done(q))
blk_mq_quiesce_queue(q);
/* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -784,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q)
*/
WARN_ON_ONCE(q->kobj.state_in_sysfs);
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q, q->elevator);
- q->elevator = NULL;
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-
- /*
- * Since the cgroup code may dereference the @q->backing_dev_info
- * pointer, only decrease its reference count after having removed the
- * association with the block cgroup controller.
- */
- bdi_put(q->backing_dev_info);
+ blk_exit_queue(q);
if (q->mq_ops)
blk_mq_free_queue(q);
@@ -1184,6 +1190,7 @@ out_exit_flush_rq:
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
+ q->fq = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1645,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (rq->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, rq);
@@ -1752,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);
- wbt_done(q->rq_wb, req);
+ rq_qos_done(q, req);
/*
* Request may not have originated from ll_rw_blk. if not,
@@ -1986,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
int where = ELEVATOR_INSERT_SORT;
struct request *req, *free;
unsigned int request_count = 0;
- unsigned int wb_acct;
/*
* low level driver can indicate that it wants pages above a
@@ -2044,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
}
get_rq:
- wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+ rq_qos_throttle(q, bio, q->queue_lock);
/*
* Grab a free request. This is might sleep but can not fail.
@@ -2054,7 +2060,7 @@ get_rq:
req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
if (IS_ERR(req)) {
blk_queue_exit(q);
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
else
@@ -2063,7 +2069,7 @@ get_rq:
goto out_unlock;
}
- wbt_track(req, wb_acct);
+ rq_qos_track(q, req, bio);
/*
* After dropping the lock and possibly sleeping here, our request
@@ -2159,11 +2165,12 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
if (part->policy && op_is_write(bio_op(bio))) {
char b[BDEVNAME_SIZE];
- printk(KERN_ERR
+ WARN_ONCE(1,
"generic_make_request: Trying to write "
"to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), part->partno);
- return true;
+ /* Older lvm-tools actually trigger this */
+ return false;
}
return false;
@@ -2703,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
- part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+ part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
@@ -2723,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now)
*/
if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
unsigned long duration;
- const int rw = rq_data_dir(req);
+ const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
int cpu;
@@ -2731,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now)
cpu = part_stat_lock();
part = req->part;
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, ticks[rw], duration);
+ part_stat_inc(cpu, part, ios[sgrp]);
+ part_stat_add(cpu, part, ticks[sgrp], duration);
part_round_stats(req->q, cpu, part);
- part_dec_in_flight(req->q, part, rw);
+ part_dec_in_flight(req->q, part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
@@ -2754,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq)
return rq->rq_flags & RQF_PM;
case RPM_SUSPENDED:
return false;
+ default:
+ return true;
}
-
- return true;
}
#else
static bool blk_pm_allow_request(struct request *rq)
@@ -2983,7 +2990,7 @@ void blk_start_request(struct request *req)
req->throtl_size = blk_rq_sectors(req);
#endif
req->rq_flags |= RQF_STATS;
- wbt_issue(req->q->rq_wb, req);
+ rq_qos_issue(req->q, req);
}
BUG_ON(blk_rq_is_complete(req));
@@ -3056,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
+ * Note:
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+ * blk_rq_bytes() and in blk_update_request().
+ *
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
@@ -3081,10 +3092,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size) {
+ if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
- bio->bi_next = NULL;
- }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
@@ -3205,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
blk_account_io_done(req, now);
if (req->end_io) {
- wbt_done(req->q->rq_wb, req);
+ rq_qos_done(q, req);
req->end_io(req, error);
} else {
if (blk_bidi_rq(req))
@@ -3479,6 +3488,10 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
dst->cpu = src->cpu;
dst->__sector = blk_rq_pos(src);
dst->__data_len = blk_rq_bytes(src);
+ if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ dst->special_vec = src->special_vec;
+ }
dst->nr_phys_segments = src->nr_phys_segments;
dst->ioprio = src->ioprio;
dst->extra_len = src->extra_len;
@@ -3764,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug);
*/
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
{
- /* not support for RQF_PM and ->rpm_status in blk-mq yet */
- if (q->mq_ops)
+ /* Don't enable runtime PM for blk-mq until it is ready */
+ if (q->mq_ops) {
+ pm_runtime_disable(dev);
return;
+ }
q->dev = dev;
q->rpm_status = RPM_ACTIVE;
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index f23311e4b201..01580f88fcb3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
- INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+ INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
new file mode 100644
index 000000000000..19923f8a029d
--- /dev/null
+++ b/block/blk-iolatency.c
@@ -0,0 +1,955 @@
+/*
+ * Block rq-qos base io controller
+ *
+ * This works similar to wbt with a few exceptions
+ *
+ * - It's bio based, so the latency covers the whole block layer in addition to
+ * the actual io.
+ * - We will throttle all IO that comes in here if we need to.
+ * - We use the mean latency over the 100ms window. This is because writes can
+ * be particularly fast, which could give us a false sense of the impact of
+ * other workloads on our protected workload.
+ * - By default there's no throttling, we set the queue_depth to UINT_MAX so
+ * that we can have as many outstanding bio's as we're allowed to. Only at
+ * throttle time do we pay attention to the actual queue depth.
+ *
+ * The hierarchy works like the cpu controller does, we track the latency at
+ * every configured node, and each configured node has its own independent
+ * queue depth. This means that we only care about our latency targets at the
+ * peer level. Some group at the bottom of the hierarchy isn't going to affect
+ * a group at the end of some other path if we're only configured at leaf level.
+ *
+ * Consider the following
+ *
+ * root blkg
+ * / \
+ * fast (target=5ms) slow (target=10ms)
+ * / \ / \
+ * a b normal(15ms) unloved
+ *
+ * "a" and "b" have no target, but their combined io under "fast" cannot exceed
+ * an average latency of 5ms. If it does then we will throttle the "slow"
+ * group. In the case of "normal", if it exceeds its 15ms target, we will
+ * throttle "unloved", but nobody else.
+ *
+ * In this example "fast", "slow", and "normal" will be the only groups actually
+ * accounting their io latencies. We have to walk up the hierarchy to the root
+ * on every submit and complete so we can do the appropriate stat recording and
+ * adjust the queue depth of ourselves if needed.
+ *
+ * There are 2 ways we throttle IO.
+ *
+ * 1) Queue depth throttling. As we throttle down we will adjust the maximum
+ * number of IO's we're allowed to have in flight. This starts at UINT_MAX down
+ * to 1. If the group is only ever submitting IO for itself then this is the
+ * only way we throttle.
+ *
+ * 2) Induced delay throttling. This is for the case that a group is generating
+ * IO that has to be issued by the root cg to avoid priority inversion. So think
+ * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot
+ * of work done for us on behalf of the root cg and are being asked to scale
+ * down more then we induce a latency at userspace return. We accumulate the
+ * total amount of time we need to be punished by doing
+ *
+ * total_time += min_lat_nsec - actual_io_completion
+ *
+ * and then at throttle time will do
+ *
+ * throttle_time = min(total_time, 250 * NSEC_PER_MSEC)
+ *
+ * This induced delay will throttle back the activity that is generating the
+ * root cg issued io's, whether that's some metadata intensive operation or the
+ * group is using so much memory that it is pushing us into swap.
+ *
+ * Copyright (C) 2018 Josef Bacik
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/signal.h>
+#include <trace/events/block.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+
+#define DEFAULT_SCALE_COOKIE 1000000U
+
+static struct blkcg_policy blkcg_policy_iolatency;
+struct iolatency_grp;
+
+/* iolatency state hung off a request_queue through its embedded rq_qos. */
+struct blk_iolatency {
+ /* Embedded rq_qos; BLKIOLATENCY() maps a pointer to it back to us. */
+ struct rq_qos rqos;
+ /* Armed for jiffies + HZ by the throttle path whenever it is not pending. */
+ struct timer_list timer;
+ /* Throttling is considered enabled while this is greater than zero. */
+ atomic_t enabled;
+};
+
+/* Map an embedded rq_qos back to the blk_iolatency that contains it. */
+static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
+{
+	struct blk_iolatency *blkiolat;
+
+	blkiolat = container_of(rqos, struct blk_iolatency, rqos);
+	return blkiolat;
+}
+
+/* True while the enabled counter of @blkiolat is positive. */
+static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
+{
+	int nr_enabled = atomic_read(&blkiolat->enabled);
+
+	return nr_enabled > 0;
+}
+
+/*
+ * Scaling state a group keeps on behalf of its children. Children reach it
+ * through their parent's iolatency_grp (->child_lat).
+ */
+struct child_latency_info {
+ /*
+ * Held by iolatency_check_latencies() while it updates nr_samples,
+ * last_scale_event, scale_lat and scale_grp and changes scale_cookie.
+ */
+ spinlock_t lock;
+
+ /* Last time we adjusted the scale of everybody. */
+ u64 last_scale_event;
+
+ /* The latency that we missed. */
+ u64 scale_lat;
+
+ /* Total io's from all of our children for the last summation. */
+ u64 nr_samples;
+
+ /* The guy who actually changed the latency numbers. */
+ struct iolatency_grp *scale_grp;
+
+ /* Cookie to tell if we need to scale up or down. */
+ atomic_t scale_cookie;
+};
+
+/* Per-blkg iolatency policy data. */
+struct iolatency_grp {
+ /* Embedded policy data; pd_to_lat() maps it back to this struct. */
+ struct blkg_policy_data pd;
+ /* Per-cpu latency samples, summed and reset when latencies are checked. */
+ struct blk_rq_stat __percpu *stats;
+ struct blk_iolatency *blkiolat;
+ /* rq_depth.max_depth is the current inflight cap; UINT_MAX when unthrottled. */
+ struct rq_depth rq_depth;
+ /* Inflight counter and wait queue used to block submitters. */
+ struct rq_wait rq_wait;
+ atomic64_t window_start;
+ /* Last value of the parent's scale cookie this group acted upon. */
+ atomic_t scale_cookie;
+ /* Latency target in nanoseconds; 0 is treated as "no target". */
+ u64 min_lat_nsec;
+ /* Window size in nanoseconds; selects the decay factor for lat_avg. */
+ u64 cur_win_nsec;
+
+ /* total running average of our io latency. */
+ u64 lat_avg;
+
+ /* Our current number of IO's for the last summation. */
+ u64 nr_samples;
+
+ struct child_latency_info child_lat;
+};
+
+#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
+/*
+ * These are the constants used to fake the fixed-point moving average
+ * calculation just like load average. The call to CALC_LOAD folds
+ * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
+ * window size is bucketed to try to approximately calculate average
+ * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
+ * elapse immediately. Note, windows only elapse with IO activity. Idle
+ * periods extend the most recent window.
+ */
+#define BLKIOLATENCY_NR_EXP_FACTORS 5
+#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
+ (BLKIOLATENCY_NR_EXP_FACTORS - 1))
+/*
+ * Indexed by cur_win_nsec / BLKIOLATENCY_EXP_BUCKET_SIZE, clamped to the last
+ * entry (see iolatency_check_latencies()).
+ */
+static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
+ 2045, // exp(1/600) - 600 samples
+ 2039, // exp(1/240) - 240 samples
+ 2031, // exp(1/120) - 120 samples
+ 2023, // exp(1/80) - 80 samples
+ 2014, // exp(1/60) - 60 samples
+};
+
+/* Convert policy data to its iolatency_grp; a NULL @pd yields NULL. */
+static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
+{
+	if (!pd)
+		return NULL;
+
+	return container_of(pd, struct iolatency_grp, pd);
+}
+
+/* Look up the iolatency_grp attached to @blkg for the iolatency policy. */
+static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
+{
+	struct blkg_policy_data *pd;
+
+	pd = blkg_to_pd(blkg, &blkcg_policy_iolatency);
+	return pd_to_lat(pd);
+}
+
+/* Return the blkg that @iolat's policy data belongs to. */
+static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
+{
+	struct blkg_policy_data *pd = &iolat->pd;
+
+	return pd_to_blkg(pd);
+}
+
+/*
+ * Try to take an inflight slot on @iolat.
+ *
+ * On the first attempt (@first_block) the slot is refused outright when the
+ * wait queue is active and the entry at its head is not @wait.  Otherwise the
+ * result of rq_wait_inc_below() against the group's current max_depth is
+ * returned.
+ */
+static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
+				       wait_queue_entry_t *wait,
+				       bool first_block)
+{
+	struct rq_wait *rqw = &iolat->rq_wait;
+
+	if (first_block) {
+		bool others_ahead = waitqueue_active(&rqw->wait) &&
+				    rqw->wait.head.next != &wait->entry;
+
+		if (others_ahead)
+			return false;
+	}
+
+	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+}
+
+/*
+ * Account one IO against @iolat, blocking until a slot below the group's
+ * max_depth is available.
+ *
+ * @rqos: rq_qos the IO is going through; its queue is used to schedule the
+ * return-to-user-space throttle when the blkg has use_delay set
+ * @iolat: group to charge
+ * @lock: if non-NULL, dropped around io_schedule() and retaken afterwards
+ * @issue_as_root: take a slot unconditionally, never wait
+ * @use_memdelay: passed through to blkcg_schedule_throttle()
+ */
+static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
+ struct iolatency_grp *iolat,
+ spinlock_t *lock, bool issue_as_root,
+ bool use_memdelay)
+ __releases(lock)
+ __acquires(lock)
+{
+ struct rq_wait *rqw = &iolat->rq_wait;
+ unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
+ DEFINE_WAIT(wait);
+ bool first_block = true;
+
+ if (use_delay)
+ blkcg_schedule_throttle(rqos->q, use_memdelay);
+
+ /*
+ * To avoid priority inversions we want to just take a slot if we are
+ * issuing as root. If we're being killed off there's no point in
+ * delaying things, we may have been killed by OOM so throttling may
+ * make recovery take even longer, so just let the IO's through so the
+ * task can go away.
+ */
+ if (issue_as_root || fatal_signal_pending(current)) {
+ atomic_inc(&rqw->inflight);
+ return;
+ }
+
+ /* Fast path: a slot is available without queueing. */
+ if (iolatency_may_queue(iolat, &wait, first_block))
+ return;
+
+ /* Slow path: queue exclusively and retry after every wakeup. */
+ do {
+ prepare_to_wait_exclusive(&rqw->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (iolatency_may_queue(iolat, &wait, first_block))
+ break;
+ first_block = false;
+
+ if (lock) {
+ spin_unlock_irq(lock);
+ io_schedule();
+ spin_lock_irq(lock);
+ } else {
+ io_schedule();
+ }
+ } while (1);
+
+ finish_wait(&rqw->wait, &wait);
+}
+
+#define SCALE_DOWN_FACTOR 2
+#define SCALE_UP_FACTOR 4
+
+/*
+ * Step size for one scale event: @qd shifted right by SCALE_UP_FACTOR when
+ * scaling up or by SCALE_DOWN_FACTOR when scaling down, but never less than 1.
+ */
+static inline unsigned long scale_amount(unsigned long qd, bool up)
+{
+	unsigned int shift = up ? SCALE_UP_FACTOR : SCALE_DOWN_FACTOR;
+
+	return max(qd >> shift, 1UL);
+}
+
+/*
+ * We scale the qd down faster than we scale up, so we need to use this helper
+ * to adjust the scale_cookie accordingly so we don't prematurely get
+ * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
+ *
+ * Each group has their own local copy of the last scale cookie they saw, so if
+ * the global scale cookie goes up or down they know which way they need to go
+ * based on their last knowledge of it.
+ *
+ * @blkiolat: used to look up the queue depth of the underlying queue
+ * @lat_info: the child_latency_info whose scale_cookie is adjusted
+ * @up: true to move the cookie towards DEFAULT_SCALE_COOKIE, false to move it
+ * further below
+ */
+static void scale_cookie_change(struct blk_iolatency *blkiolat,
+ struct child_latency_info *lat_info,
+ bool up)
+{
+ unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+ unsigned long scale = scale_amount(qd, up);
+ unsigned long old = atomic_read(&lat_info->scale_cookie);
+ unsigned long max_scale = qd << 1;
+ unsigned long diff = 0;
+
+ /* diff is how far the cookie currently sits below the default. */
+ if (old < DEFAULT_SCALE_COOKIE)
+ diff = DEFAULT_SCALE_COOKIE - old;
+
+ if (up) {
+ /* Never overshoot the default; deep in the hole, climb by one. */
+ if (scale + old > DEFAULT_SCALE_COOKIE)
+ atomic_set(&lat_info->scale_cookie,
+ DEFAULT_SCALE_COOKIE);
+ else if (diff > qd)
+ atomic_inc(&lat_info->scale_cookie);
+ else
+ atomic_add(scale, &lat_info->scale_cookie);
+ } else {
+ /*
+ * We don't want to dig a hole so deep that it takes us hours to
+ * dig out of it. Just enough that we don't throttle/unthrottle
+ * with jagged workloads but can still unthrottle once pressure
+ * has sufficiently dissipated.
+ */
+ if (diff > qd) {
+ /* Stop digging once we are max_scale (2 * qd) below. */
+ if (diff < max_scale)
+ atomic_dec(&lat_info->scale_cookie);
+ } else {
+ atomic_sub(scale, &lat_info->scale_cookie);
+ }
+ }
+}
+
+/*
+ * Change the queue depth of the iolatency_grp.
+ *
+ * Scaling up adds scale_amount(qd, true), i.e. 1/16th of the device queue
+ * depth (at least 1), per step so we don't get wild swings and hopefully dial
+ * in to fairer distribution of the overall queue depth.  Scaling down halves
+ * the current depth, never going below 1.
+ *
+ * @iolat: group whose rq_depth.max_depth is adjusted
+ * @up: true to raise the depth, false to lower it
+ */
+static void scale_change(struct iolatency_grp *iolat, bool up)
+{
+	unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+	unsigned long old = iolat->rq_depth.max_depth;
+
+	/* A depth above the device depth is treated as the device depth. */
+	if (old > qd)
+		old = qd;
+
+	if (up) {
+		/*
+		 * At depth 1, first give blkcg_unuse_delay() a chance; if it
+		 * returns true this scale-up event is consumed by it and the
+		 * depth is left alone.
+		 */
+		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
+			return;
+
+		if (old < qd) {
+			old += scale_amount(qd, up);
+			old = min(old, qd);
+			iolat->rq_depth.max_depth = old;
+			/* The cap went up, let blocked submitters retry. */
+			wake_up_all(&iolat->rq_wait.wait);
+		}
+	} else if (old > 1) {
+		old >>= 1;
+		iolat->rq_depth.max_depth = max(old, 1UL);
+	}
+}
+
+/*
+ * Check our parent and see if the scale cookie has changed.
+ *
+ * If the parent's child_lat.scale_cookie differs from the copy cached in
+ * @iolat, adopt the new value and adjust @iolat's queue depth (or its
+ * use_delay state) in the direction the cookie moved. Does nothing for a
+ * group without a parent or whose parent has no iolatency data.
+ */
+static void check_scale_change(struct iolatency_grp *iolat)
+{
+ struct iolatency_grp *parent;
+ struct child_latency_info *lat_info;
+ unsigned int cur_cookie;
+ unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
+ u64 scale_lat;
+ unsigned int old;
+ int direction = 0;
+
+ if (lat_to_blkg(iolat)->parent == NULL)
+ return;
+
+ parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
+ if (!parent)
+ return;
+
+ lat_info = &parent->child_lat;
+ cur_cookie = atomic_read(&lat_info->scale_cookie);
+ scale_lat = READ_ONCE(lat_info->scale_lat);
+
+ /* A lower cookie means scale down, a higher one means scale up. */
+ if (cur_cookie < our_cookie)
+ direction = -1;
+ else if (cur_cookie > our_cookie)
+ direction = 1;
+ else
+ return;
+
+ old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
+
+ /* Somebody beat us to the punch, just bail. */
+ if (old != our_cookie)
+ return;
+
+ if (direction < 0 && iolat->min_lat_nsec) {
+ u64 samples_thresh;
+
+ /*
+ * Skip the scale down when no missed latency is recorded or
+ * when our own target is not looser than the one that was
+ * missed.
+ */
+ if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
+ return;
+
+ /*
+ * Sometimes high priority groups are their own worst enemy, so
+ * instead of taking it out on some poor other group that did 5%
+ * or less of the IO's for the last summation just skip this
+ * scale down event.
+ */
+ samples_thresh = lat_info->nr_samples * 5;
+ samples_thresh = div64_u64(samples_thresh, 100);
+ if (iolat->nr_samples <= samples_thresh)
+ return;
+ }
+
+ /* We're as low as we can go. */
+ if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+ blkcg_use_delay(lat_to_blkg(iolat));
+ return;
+ }
+
+ /* We're back to the default cookie, unthrottle all the things. */
+ if (cur_cookie == DEFAULT_SCALE_COOKIE) {
+ blkcg_clear_delay(lat_to_blkg(iolat));
+ iolat->rq_depth.max_depth = UINT_MAX;
+ wake_up_all(&iolat->rq_wait.wait);
+ return;
+ }
+
+ scale_change(iolat, direction > 0);
+}
+
+/*
+ * Throttle hook for a bio: associate @bio with its blkcg/blkg, then walk from
+ * that blkg up the hierarchy (excluding the group that has no parent) and, for
+ * every level that has iolatency data, pick up pending scale changes and take
+ * an inflight slot, possibly blocking.
+ *
+ * @lock: when NULL, q->queue_lock is taken here around blkg creation; when
+ * non-NULL it is handed down so it can be dropped while sleeping.
+ * NOTE(review): presumably a non-NULL @lock is q->queue_lock already held by
+ * the caller - confirm against callers.
+ */
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
+ spinlock_t *lock)
+{
+ struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ struct request_queue *q = rqos->q;
+ bool issue_as_root = bio_issue_as_root_blkg(bio);
+
+ if (!blk_iolatency_enabled(blkiolat))
+ return;
+
+ rcu_read_lock();
+ blkcg = bio_blkcg(bio);
+ bio_associate_blkcg(bio, &blkcg->css);
+ blkg = blkg_lookup(blkcg, q);
+ /* No blkg yet for this blkcg/queue pair: try to create one. */
+ if (unlikely(!blkg)) {
+ if (!lock)
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_create(blkcg, q);
+ if (IS_ERR(blkg))
+ blkg = NULL;
+ if (!lock)
+ spin_unlock_irq(q->queue_lock);
+ }
+ if (!blkg)
+ goto out;
+
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+ bio_associate_blkg(bio, blkg);
+out:
+ rcu_read_unlock();
+ while (blkg && blkg->parent) {
+ struct iolatency_grp *iolat = blkg_to_lat(blkg);
+ if (!iolat) {
+ blkg = blkg->parent;
+ continue;
+ }
+
+ check_scale_change(iolat);
+ __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+ blkg = blkg->parent;
+ }
+ /* Make sure the timer is armed, one second out, while IO is flowing. */
+ if (!timer_pending(&blkiolat->timer))
+ mod_timer(&blkiolat->timer, jiffies + HZ);
+}
+
+/*
+ * Record how long one bio took for @iolat.
+ *
+ * @iolat: group the sample is recorded for
+ * @issue: issue stamp of the bio
+ * @now: completion time in nanoseconds
+ * @issue_as_root: the bio was issued on behalf of the root group
+ *
+ * Ordinary bios add their latency to the group's per-cpu stats.  For an
+ * issue_as_root bio on a throttled group nothing is recorded; instead, if it
+ * completed faster than the group's target, the difference is charged as
+ * delay to the blkg.
+ */
+static void iolatency_record_time(struct iolatency_grp *iolat,
+				  struct bio_issue *issue, u64 now,
+				  bool issue_as_root)
+{
+	u64 issued = bio_issue_time(issue);
+	struct blk_rq_stat *pcpu_stat;
+	u64 elapsed;
+
+	/*
+	 * The issue time is stored truncated, so truncate @now the same way
+	 * before comparing the two.
+	 */
+	now = __bio_issue_time(now);
+	if (now <= issued)
+		return;
+
+	elapsed = now - issued;
+
+	/*
+	 * issue_as_root bio's are kept out of the cgroup's latency statistics
+	 * as they could skew the numbers downwards.
+	 */
+	if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+		u64 target = iolat->min_lat_nsec;
+
+		if (elapsed < target)
+			blkcg_add_delay(lat_to_blkg(iolat), now,
+					target - elapsed);
+		return;
+	}
+
+	pcpu_stat = get_cpu_ptr(iolat->stats);
+	blk_rq_stat_add(pcpu_stat, elapsed);
+	put_cpu_ptr(pcpu_stat);
+}
+
+/* Minimum time between two scale events on the same parent (see below). */
+#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
+/* Samples needed in a window before a "latency is fine" verdict is trusted. */
+#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
+
+/*
+ * Evaluate the window that just ended for @iolat.
+ *
+ * Folds the per-cpu statistics of the online CPUs into one summary (resetting
+ * them for the next window), updates the group's running average latency and
+ * then, under the parent's child_lat lock, decides whether the parent's scale
+ * cookie has to change: scale up (third argument true) when this group is the
+ * recorded scale_grp and met its target with enough samples, scale down
+ * (false) when the window's mean latency exceeded the target.
+ *
+ * @now is the completion timestamp in nanoseconds supplied by the caller.
+ */
+static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
+{
+ struct blkcg_gq *blkg = lat_to_blkg(iolat);
+ struct iolatency_grp *parent;
+ struct child_latency_info *lat_info;
+ struct blk_rq_stat stat;
+ unsigned long flags;
+ int cpu, exp_idx;
+
+ /* Sum up and reset the per-cpu stats of every online CPU. */
+ blk_rq_stat_init(&stat);
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *s;
+ s = per_cpu_ptr(iolat->stats, cpu);
+ blk_rq_stat_sum(&stat, s);
+ blk_rq_stat_init(s);
+ }
+ preempt_enable();
+
+ /* The scaling state lives in the parent; nothing to do without one. */
+ parent = blkg_to_lat(blkg->parent);
+ if (!parent)
+ return;
+
+ lat_info = &parent->child_lat;
+
+ /*
+ * CALC_LOAD takes in a number stored in fixed point representation.
+ * Because we are using this for IO time in ns, the values stored
+ * are significantly larger than the FIXED_1 denominator (2048).
+ * Therefore, rounding errors in the calculation are negligible and
+ * can be ignored.
+ */
+ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+ div64_u64(iolat->cur_win_nsec,
+ BLKIOLATENCY_EXP_BUCKET_SIZE));
+ CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+
+ /* Everything is ok and we don't need to adjust the scale. */
+ if (stat.mean <= iolat->min_lat_nsec &&
+ atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
+ return;
+
+ /*
+ * Replace this group's previous contribution to the parent's sample
+ * count with the count from the window that just ended.
+ */
+ spin_lock_irqsave(&lat_info->lock, flags);
+ lat_info->nr_samples -= iolat->nr_samples;
+ lat_info->nr_samples += stat.nr_samples;
+ iolat->nr_samples = stat.nr_samples;
+
+ /*
+ * Somebody beat us to the punch, just bail: a scale event happened
+ * less than BLKIOLATENCY_MIN_ADJUST_TIME ago and it was recorded for
+ * a target that is no larger than ours.
+ */
+ if ((lat_info->last_scale_event >= now ||
+ now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
+ lat_info->scale_lat <= iolat->min_lat_nsec)
+ goto out;
+
+ if (stat.mean <= iolat->min_lat_nsec &&
+ stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+ /* Only the group recorded as scale_grp may scale back up. */
+ if (lat_info->scale_grp == iolat) {
+ lat_info->last_scale_event = now;
+ scale_cookie_change(iolat->blkiolat, lat_info, true);
+ }
+ } else if (stat.mean > iolat->min_lat_nsec) {
+ lat_info->last_scale_event = now;
+ /* Record ourselves if nobody is recorded or our target is lower. */
+ if (!lat_info->scale_grp ||
+ lat_info->scale_lat > iolat->min_lat_nsec) {
+ WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
+ lat_info->scale_grp = iolat;
+ }
+ scale_cookie_change(iolat->blkiolat, lat_info, false);
+ }
+out:
+ spin_unlock_irqrestore(&lat_info->lock, flags);
+}
+
+/*
+ * rq_qos ->done_bio hook: a bio has completed.
+ *
+ * Walks from the bio's blkg towards the root (the root itself is not
+ * visited), dropping one inflight count per group and waking its waiters.
+ * When latency tracking is enabled and the group has a target, the bio's
+ * latency is recorded and, if the group's window has elapsed, the task that
+ * wins the cmpxchg on window_start evaluates the window.
+ */
+static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+	struct blkcg_gq *cur = bio->bi_blkg;
+	struct iolatency_grp *iolat;
+	u64 now = ktime_to_ns(ktime_get());
+	bool issue_as_root = bio_issue_as_root_blkg(bio);
+	bool enabled;
+
+	if (!cur)
+		return;
+
+	iolat = blkg_to_lat(cur);
+	if (!iolat)
+		return;
+
+	enabled = blk_iolatency_enabled(iolat->blkiolat);
+
+	for (; cur && cur->parent; cur = cur->parent) {
+		struct rq_wait *rqw;
+		u64 win_start;
+
+		iolat = blkg_to_lat(cur);
+		if (!iolat)
+			continue;
+
+		rqw = &iolat->rq_wait;
+		atomic_dec(&rqw->inflight);
+
+		if (enabled && iolat->min_lat_nsec != 0) {
+			iolatency_record_time(iolat, &bio->bi_issue, now,
+					      issue_as_root);
+
+			win_start = atomic64_read(&iolat->window_start);
+			if (now > win_start &&
+			    (now - win_start) >= iolat->cur_win_nsec &&
+			    atomic64_cmpxchg(&iolat->window_start,
+					     win_start, now) == win_start)
+				iolatency_check_latencies(iolat, now);
+		}
+
+		wake_up(&rqw->wait);
+	}
+}
+
+/*
+ * rq_qos ->cleanup hook: give back the inflight slot held for @bio in every
+ * non-root group on the path from the bio's blkg upwards, and wake anyone
+ * waiting on those groups.
+ */
+static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+	struct blkcg_gq *cur;
+
+	for (cur = bio->bi_blkg; cur && cur->parent; cur = cur->parent) {
+		struct iolatency_grp *iolat = blkg_to_lat(cur);
+
+		if (!iolat)
+			continue;
+
+		atomic_dec(&iolat->rq_wait.inflight);
+		wake_up(&iolat->rq_wait.wait);
+	}
+}
+
+/*
+ * rq_qos ->exit hook: stop the timer, deactivate the iolatency policy on the
+ * queue and free the per-queue state, in that order.
+ */
+static void blkcg_iolatency_exit(struct rq_qos *rqos)
+{
+	struct blk_iolatency *lat = BLKIOLATENCY(rqos);
+	struct request_queue *q = rqos->q;
+
+	del_timer_sync(&lat->timer);
+	blkcg_deactivate_policy(q, &blkcg_policy_iolatency);
+	kfree(lat);
+}
+
+/* rq_qos callbacks implemented by the iolatency controller. */
+static struct rq_qos_ops blkcg_iolatency_ops = {
+	.throttle	= blkcg_iolatency_throttle,
+	.done_bio	= blkcg_iolatency_done_bio,
+	.cleanup	= blkcg_iolatency_cleanup,
+	.exit		= blkcg_iolatency_exit,
+};
+
+/*
+ * Timer callback for the per-queue iolatency timer.
+ *
+ * Visits every blkg below the queue's root blkg and, for each one whose
+ * child scale cookie is below DEFAULT_SCALE_COOKIE, either scales back up
+ * when no scale_grp is recorded, or forgets a scale_grp that has not caused
+ * a scale event for 5 seconds.
+ */
+static void blkiolatency_timer_fn(struct timer_list *t)
+{
+ struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
+ struct blkcg_gq *blkg;
+ struct cgroup_subsys_state *pos_css;
+ u64 now = ktime_to_ns(ktime_get());
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(blkg, pos_css,
+ blkiolat->rqos.q->root_blkg) {
+ struct iolatency_grp *iolat;
+ struct child_latency_info *lat_info;
+ unsigned long flags;
+ u64 cookie;
+
+ /*
+ * We could be exiting, don't access the pd unless we have a
+ * ref on the blkg.
+ */
+ if (!blkg_try_get(blkg))
+ continue;
+
+ iolat = blkg_to_lat(blkg);
+ if (!iolat)
+ goto next;
+
+ lat_info = &iolat->child_lat;
+ cookie = atomic_read(&lat_info->scale_cookie);
+
+ /* Cookie is not below the default: nothing to undo here. */
+ if (cookie >= DEFAULT_SCALE_COOKIE)
+ goto next;
+
+ spin_lock_irqsave(&lat_info->lock, flags);
+ /* A scale event at or after our timestamp: leave it alone. */
+ if (lat_info->last_scale_event >= now)
+ goto next_lock;
+
+ /*
+ * We scaled down but don't have a scale_grp, scale up and carry
+ * on.
+ */
+ if (lat_info->scale_grp == NULL) {
+ scale_cookie_change(iolat->blkiolat, lat_info, true);
+ goto next_lock;
+ }
+
+ /*
+ * It's been 5 seconds since our last scale event, clear the
+ * scale grp in case the group that needed the scale down isn't
+ * doing any IO currently.
+ */
+ if (now - lat_info->last_scale_event >=
+ ((u64)NSEC_PER_SEC * 5))
+ lat_info->scale_grp = NULL;
+next_lock:
+ spin_unlock_irqrestore(&lat_info->lock, flags);
+next:
+ /* Drop the reference taken by blkg_try_get() above. */
+ blkg_put(blkg);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Set up the iolatency controller for @q: allocate the per-queue state,
+ * register it as an rq_qos policy and activate the blkcg policy.
+ *
+ * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
+ * error from blkcg_activate_policy() (after undoing the registration).
+ */
+int blk_iolatency_init(struct request_queue *q)
+{
+	struct blk_iolatency *lat = kzalloc(sizeof(*lat), GFP_KERNEL);
+	struct rq_qos *rqos;
+	int err;
+
+	if (!lat)
+		return -ENOMEM;
+
+	rqos = &lat->rqos;
+	rqos->q = q;
+	rqos->ops = &blkcg_iolatency_ops;
+	rqos->id = RQ_QOS_CGROUP;
+
+	rq_qos_add(q, rqos);
+
+	err = blkcg_activate_policy(q, &blkcg_policy_iolatency);
+	if (!err) {
+		timer_setup(&lat->timer, blkiolatency_timer_fn, 0);
+		return 0;
+	}
+
+	/* Activation failed: unregister and release what we set up. */
+	rq_qos_del(q, rqos);
+	kfree(lat);
+	return err;
+}
+
+/*
+ * Install @val (nanoseconds, 0 meaning "no target") as the latency target of
+ * @blkg.  The window is set to 16 times the target, bounded by
+ * BLKIOLATENCY_MIN_WIN_SIZE and BLKIOLATENCY_MAX_WIN_SIZE, and the per-queue
+ * "enabled" count is adjusted when the target changes between zero and
+ * non-zero.
+ */
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+{
+	struct iolatency_grp *iolat = blkg_to_lat(blkg);
+	struct blk_iolatency *blkiolat = iolat->blkiolat;
+	u64 prev = iolat->min_lat_nsec;
+	u64 win = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
+
+	iolat->min_lat_nsec = val;
+	iolat->cur_win_nsec = min_t(u64, win, BLKIOLATENCY_MAX_WIN_SIZE);
+
+	if (val && !prev)
+		atomic_inc(&blkiolat->enabled);
+	else if (prev && !val)
+		atomic_dec(&blkiolat->enabled);
+}
+
+/*
+ * Reset the scaling state kept in @blkg's parent: the scale cookie goes back
+ * to DEFAULT_SCALE_COOKIE and the recorded scale event, group and latency are
+ * cleared.  Does nothing if there is no parent or the parent has no
+ * iolatency data.
+ */
+static void iolatency_clear_scaling(struct blkcg_gq *blkg)
+{
+	struct iolatency_grp *parent_lat;
+	struct child_latency_info *info;
+
+	if (!blkg->parent)
+		return;
+
+	parent_lat = blkg_to_lat(blkg->parent);
+	if (!parent_lat)
+		return;
+
+	info = &parent_lat->child_lat;
+
+	spin_lock(&info->lock);
+	atomic_set(&info->scale_cookie, DEFAULT_SCALE_COOKIE);
+	info->last_scale_event = 0;
+	info->scale_grp = NULL;
+	info->scale_lat = 0;
+	spin_unlock(&info->lock);
+}
+
+/*
+ * Write handler for the "latency" cgroup file.
+ *
+ * The text after the device specification (ctx.body as filled in by
+ * blkg_conf_prep()) is a space separated list of key=value pairs.  The only
+ * accepted key is "target", whose value is either "max" (no target) or a
+ * latency in microseconds.
+ *
+ * Returns @nbytes on success, the error from blkg_conf_prep(), or -EINVAL
+ * for a malformed pair, an unknown key, or a target too large to be
+ * expressed in nanoseconds in a u64.
+ */
+static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkcg_gq *blkg;
+	struct blkg_conf_ctx ctx;
+	struct iolatency_grp *iolat;
+	char *p, *tok;
+	u64 lat_val = 0;
+	u64 oldval;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
+	if (ret)
+		return ret;
+
+	blkg = ctx.blkg;
+	iolat = blkg_to_lat(blkg);
+	p = ctx.body;
+
+	ret = -EINVAL;
+	while ((tok = strsep(&p, " "))) {
+		char key[16];
+		char val[21];	/* 20 digits of 18446744073709551615 + NUL */
+
+		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
+			goto out;
+
+		if (strcmp(key, "target"))
+			goto out;
+
+		if (!strcmp(val, "max")) {
+			lat_val = 0;
+		} else {
+			u64 v;
+
+			if (sscanf(val, "%llu", &v) != 1)
+				goto out;
+			/* Reject values whose conversion to ns would wrap. */
+			if (v > U64_MAX / NSEC_PER_USEC)
+				goto out;
+			lat_val = v * NSEC_PER_USEC;
+		}
+	}
+
+	/* Apply the new target; drop stale scaling state if it changed. */
+	oldval = iolat->min_lat_nsec;
+
+	iolatency_set_min_lat_nsec(blkg, lat_val);
+	if (oldval != iolat->min_lat_nsec)
+		iolatency_clear_scaling(blkg);
+
+	ret = 0;
+out:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
+/*
+ * Print one "<device> target=<usec>" line for @pd, provided its blkg has a
+ * device name and a non-zero latency target.  Always returns 0.
+ */
+static u64 iolatency_prfill_limit(struct seq_file *sf,
+				  struct blkg_policy_data *pd, int off)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	const char *dev = blkg_dev_name(pd->blkg);
+
+	if (dev && iolat->min_lat_nsec)
+		seq_printf(sf, "%s target=%llu\n",
+			   dev, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
+
+	return 0;
+}
+
+/* seq_show handler for the "latency" file: one line per configured blkg. */
+static int iolatency_print_limit(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+	blkcg_print_blkgs(sf, blkcg, iolatency_prfill_limit,
+			  &blkcg_policy_iolatency, seq_cft(sf)->private,
+			  false);
+	return 0;
+}
+
+/*
+ * Format this group's iolatency statistics into @buf (at most @size bytes):
+ * the current queue depth ("max" when it is UINT_MAX), the average latency
+ * in microseconds and the window length in milliseconds.  Returns the
+ * scnprintf() result.
+ */
+static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
+				size_t size)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+	unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+
+	if (iolat->rq_depth.max_depth != UINT_MAX)
+		return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
+				 iolat->rq_depth.max_depth, avg_lat, cur_win);
+
+	return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
+			 avg_lat, cur_win);
+}
+
+
+/*
+ * Allocate the policy data for one blkg: the iolatency_grp itself (zeroed,
+ * on @node) plus its per-cpu statistics.  Returns the embedded
+ * blkg_policy_data, or NULL if either allocation fails.
+ */
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+{
+	struct iolatency_grp *iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+
+	if (iolat) {
+		iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
+					__alignof__(struct blk_rq_stat), gfp);
+		if (iolat->stats)
+			return &iolat->pd;
+
+		kfree(iolat);
+	}
+
+	return NULL;
+}
+
+/*
+ * Initialise freshly allocated policy data: reset the per-cpu statistics of
+ * every possible CPU, set up the wait queue and lock, start with an
+ * unlimited depth and a 100ms window beginning now, and pick the initial
+ * scale cookie.
+ */
+static void iolatency_pd_init(struct blkg_policy_data *pd)
+{
+	struct iolatency_grp *iolat = pd_to_lat(pd);
+	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
+	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+	u64 now = ktime_to_ns(ktime_get());
+	int inherited = DEFAULT_SCALE_COOKIE;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		blk_rq_stat_init(per_cpu_ptr(iolat->stats, cpu));
+
+	rq_wait_init(&iolat->rq_wait);
+	spin_lock_init(&iolat->child_lat.lock);
+	iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+	iolat->rq_depth.max_depth = UINT_MAX;
+	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+	iolat->blkiolat = blkiolat;
+	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
+	atomic64_set(&iolat->window_start, now);
+
+	/*
+	 * We init things in list order, so the pd for the parent may not be
+	 * init'ed yet for whatever reason.  Inherit the parent's cookie only
+	 * when its pd is already there, otherwise start from the default.
+	 */
+	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
+		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
+
+		inherited = atomic_read(&parent->child_lat.scale_cookie);
+	}
+	atomic_set(&iolat->scale_cookie, inherited);
+
+	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
+}
+
+/*
+ * Policy data is going offline: drop the group's latency target and reset
+ * the scaling state held in its parent.
+ */
+static void iolatency_pd_offline(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = lat_to_blkg(pd_to_lat(pd));
+
+	iolatency_set_min_lat_nsec(blkg, 0);
+	iolatency_clear_scaling(blkg);
+}
+
+/* Release what iolatency_pd_alloc() allocated: per-cpu stats, then the group. */
+static void iolatency_pd_free(struct blkg_policy_data *pd)
+{
+	struct iolatency_grp *grp = pd_to_lat(pd);
+
+	free_percpu(grp->stats);
+	kfree(grp);
+}
+
+/* cgroup interface files: a single "latency" file, absent on the root. */
+static struct cftype iolatency_files[] = {
+	{
+		.name		= "latency",
+		.flags		= CFTYPE_NOT_ON_ROOT,
+		.write		= iolatency_set_limit,
+		.seq_show	= iolatency_print_limit,
+	},
+	{}	/* terminating empty entry */
+};
+
+/* blkcg policy descriptor tying the files and pd callbacks together. */
+static struct blkcg_policy blkcg_policy_iolatency = {
+	.dfl_cftypes	= iolatency_files,
+
+	.pd_alloc_fn	= iolatency_pd_alloc,
+	.pd_init_fn	= iolatency_pd_init,
+	.pd_offline_fn	= iolatency_pd_offline,
+	.pd_free_fn	= iolatency_pd_free,
+	.pd_stat_fn	= iolatency_pd_stat,
+};
+
+/* Register the iolatency blkcg policy; returns the registration result. */
+static int __init iolatency_init(void)
+{
+	return blkcg_policy_register(&blkcg_policy_iolatency);
+}
+
+/* Unregister the iolatency blkcg policy. */
+static void __exit iolatency_exit(void)
+{
+	/* A void function must not use "return <expression>;". */
+	blkcg_policy_unregister(&blkcg_policy_iolatency);
+}
+
+module_init(iolatency_init);
+module_exit(iolatency_exit);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8faa70f26fcd..d1b9dd03da25 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
*/
req_sects = min_t(sector_t, nr_sects,
q->limits.max_discard_sectors);
+ if (!req_sects)
+ goto fail;
if (req_sects > UINT_MAX >> 9)
req_sects = UINT_MAX >> 9;
@@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
*biop = bio;
return 0;
+
+fail:
+ if (bio) {
+ submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ *biop = NULL;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
new file mode 100644
index 000000000000..fb2c82c351e4
--- /dev/null
+++ b/block/blk-mq-debugfs-zoned.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include "blk-mq-debugfs.h"
+
+/*
+ * debugfs show handler: print, one per line, the index of every zone whose
+ * bit is set in the queue's seq_zones_wlock bitmap.  Prints nothing when the
+ * queue has no such bitmap.  Always returns 0.
+ */
+int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+	struct request_queue *q = data;
+	unsigned int zno = 0;
+
+	if (!q->seq_zones_wlock)
+		return 0;
+
+	while (zno < q->nr_zones) {
+		if (test_bit(zno, q->seq_zones_wlock))
+			seq_printf(m, "%u\n", zno);
+		zno++;
+	}
+
+	return 0;
+}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ffa622366922..cb1e6cf7ac48 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
return count;
}
-static int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < blk_queue_nr_zones(q); i++)
- if (test_bit(i, q->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
-
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
@@ -356,7 +341,7 @@ static const char *const blk_mq_rq_state_name_array[] = {
static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
{
- if (WARN_ON_ONCE((unsigned int)rq_state >
+ if (WARN_ON_ONCE((unsigned int)rq_state >=
ARRAY_SIZE(blk_mq_rq_state_name_array)))
return "(?)";
return blk_mq_rq_state_name_array[rq_state];
@@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
return 0;
}
+/* debugfs show handler: print the hardware queue's dispatch_busy value. */
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hw = data;
+
+	seq_printf(m, "%u\n", hw->dispatch_busy);
+
+	return 0;
+}
+}
+
static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
__acquires(&ctx->lock)
{
@@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
+ {"dispatch_busy", 0400, hctx_dispatch_busy_show},
{},
};
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index b9d366e57097..a9160be12be0 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
}
#endif
+/*
+ * queue_zone_wlock_show() is only declared as an external function when
+ * CONFIG_BLK_DEBUG_FS_ZONED is defined; otherwise an inline stub that prints
+ * nothing and returns 0 is used.
+ */
+#ifdef CONFIG_BLK_DEBUG_FS_ZONED
+int queue_zone_wlock_show(void *data, struct seq_file *m);
+#else
+static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index e233996bb76f..db644ec624f5 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -17,6 +17,8 @@
#include <linux/pci.h>
#include <linux/module.h>
+#include "blk-mq.h"
+
/**
* blk_mq_pci_map_queues - provide a default queue mapping for PCI device
* @set: tagset to provide the mapping for
@@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
fallback:
WARN_ON_ONCE(set->nr_hw_queues > 1);
- for_each_possible_cpu(cpu)
- set->mq_map[cpu] = 0;
+ blk_mq_clear_mq_map(set);
return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 56c493c6cd90..cf9c66c6d35a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
-
- if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
- } else
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return false;
-
- if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
- struct request_queue *q = hctx->queue;
-
- if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
- } else
- clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ return;
+ clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
- return blk_mq_run_hw_queue(hctx, true);
+ blk_mq_run_hw_queue(hctx, true);
}
/*
@@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
} else if (has_sched_dispatch) {
blk_mq_do_dispatch_sched(hctx);
- } else if (q->mq_ops->get_budget) {
- /*
- * If we need to get budget before queuing request, we
- * dequeue request one by one from sw queue for avoiding
- * to mess up I/O merge when dispatch runs out of resource.
- *
- * TODO: get more budgets, and dequeue more requests in
- * one time.
- */
+ } else if (hctx->dispatch_busy) {
+ /* dequeue request one by one from sw queue if queue is busy */
blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
return e->type->ops.mq.bio_merge(hctx, bio);
}
- if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
+ if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+ !list_empty_careful(&ctx->rq_list)) {
/* default per sw-queue merge */
spin_lock(&ctx->lock);
ret = blk_mq_attempt_merge(q, ctx, bio);
@@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
return false;
}
-/**
- * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
- * @pos: loop cursor.
- * @skip: the list element that will not be examined. Iteration starts at
- * @skip->next.
- * @head: head of the list to examine. This list must have at least one
- * element, namely @skip.
- * @member: name of the list_head structure within typeof(*pos).
- */
-#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
- for ((pos) = (skip); \
- (pos = (pos)->member.next != (head) ? list_entry_rcu( \
- (pos)->member.next, typeof(*pos), member) : \
- list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
- (pos) != (skip); )
-
-/*
- * Called after a driver tag has been freed to check whether a hctx needs to
- * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
- * queues in a round-robin fashion if the tag set of @hctx is shared with other
- * hardware queues.
- */
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
-{
- struct blk_mq_tags *const tags = hctx->tags;
- struct blk_mq_tag_set *const set = hctx->queue->tag_set;
- struct request_queue *const queue = hctx->queue, *q;
- struct blk_mq_hw_ctx *hctx2;
- unsigned int i, j;
-
- if (set->flags & BLK_MQ_F_TAG_SHARED) {
- /*
- * If this is 0, then we know that no hardware queues
- * have RESTART marked. We're done.
- */
- if (!atomic_read(&queue->shared_hctx_restart))
- return;
-
- rcu_read_lock();
- list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
- tag_set_list) {
- queue_for_each_hw_ctx(q, hctx2, i)
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- goto done;
- }
- j = hctx->queue_num + 1;
- for (i = 0; i < queue->nr_hw_queues; i++, j++) {
- if (j == queue->nr_hw_queues)
- j = 0;
- hctx2 = queue->queue_hw_ctx[j];
- if (hctx2->tags == tags &&
- blk_mq_sched_restart_hctx(hctx2))
- break;
- }
-done:
- rcu_read_unlock();
- } else {
- blk_mq_sched_restart_hctx(hctx);
- }
-}
-
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async)
{
@@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false);
- else
+ else {
+ /*
+ * try to issue requests directly if the hw queue isn't
+ * busy in case of 'none' scheduler, and this way may save
+ * us one extra enqueue & dequeue to sw queue.
+ */
+ if (!hctx->dispatch_busy && !e && !run_queue_async) {
+ blk_mq_try_issue_list_directly(hctx, list);
+ if (list_empty(list))
+ return;
+ }
blk_mq_insert_requests(hctx, ctx, list);
+ }
blk_mq_run_hw_queue(hctx, run_queue_async);
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 09b2ee6694fb..816923bf874d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
/*
* If a previously inactive queue goes active, bump the active user count.
+ * This must be done before trying to allocate a driver tag, so that even if
+ * the first allocation attempt fails, the other shared-tag users can reserve
+ * budget for it.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
@@ -271,7 +274,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* test and set the bit before assining ->rqs[].
*/
rq = tags->rqs[bitnr];
- if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+ if (rq && blk_mq_request_started(rq))
iter_data->fn(rq, iter_data->data, reserved);
return true;
@@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth <= tags->nr_reserved_tags)
return -EINVAL;
- tdepth -= tags->nr_reserved_tags;
-
/*
* If we are allowed to grow beyond the original size, allocate
* a new set of tags before freeing the old one.
@@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > 16 * BLKDEV_MAX_RQ)
return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+ new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+ tags->nr_reserved_tags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
@@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
- sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+ sbitmap_queue_resize(&tags->bitmap_tags,
+ tdepth - tags->nr_reserved_tags);
}
return 0;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70c65bb6c013..72a0033ccee9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -34,8 +34,8 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
-#include "blk-wbt.h"
#include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->tag = -1;
rq->internal_tag = tag;
} else {
- if (blk_mq_tag_busy(data->hctx)) {
+ if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
@@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.mq.limit_depth(op, data);
+ } else {
+ blk_mq_tag_busy(data->hctx);
}
tag = blk_mq_get_tag(data);
@@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq)
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
- wbt_done(q->rq_wb, rq);
+ rq_qos_done(q, rq);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
@@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_account_io_done(rq, now);
if (rq->end_io) {
- wbt_done(rq->q->rq_wb, rq);
+ rq_qos_done(rq->q, rq);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -558,10 +560,8 @@ static void __blk_mq_complete_request(struct request *rq)
bool shared = false;
int cpu;
- if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
- MQ_RQ_IN_FLIGHT)
+ if (!blk_mq_mark_complete(rq))
return;
-
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
@@ -641,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
rq->throtl_size = blk_rq_sectors(rq);
#endif
rq->rq_flags |= RQF_STATS;
- wbt_issue(q->rq_wb, rq);
+ rq_qos_issue(q, rq);
}
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@ -667,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(q, rq);
- wbt_requeue(q->rq_wb, rq);
+ rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -781,7 +781,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
}
- req->rq_flags &= ~RQF_TIMED_OUT;
blk_add_timer(req);
}
@@ -965,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
- bool wait)
+bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_alloc_data data = {
.q = rq->q,
.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
- .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+ .flags = BLK_MQ_REQ_NOWAIT,
};
-
- might_sleep_if(wait);
+ bool shared;
if (rq->tag != -1)
goto done;
@@ -982,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
data.flags |= BLK_MQ_REQ_RESERVED;
+ shared = blk_mq_tag_busy(data.hctx);
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
- if (blk_mq_tag_busy(data.hctx)) {
+ if (shared) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&data.hctx->nr_active);
}
@@ -992,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
}
done:
- if (hctx)
- *hctx = data.hctx;
return rq->tag != -1;
}
@@ -1004,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ spin_lock(&hctx->dispatch_wait_lock);
list_del_init(&wait->entry);
+ spin_unlock(&hctx->dispatch_wait_lock);
+
blk_mq_run_hw_queue(hctx, true);
return 1;
}
@@ -1015,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
* restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
-static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_mq_hw_ctx *this_hctx = *hctx;
- struct sbq_wait_state *ws;
+ struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
- if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
- set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+ if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
* It's possible that a tag was freed in the window between the
@@ -1035,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
- return blk_mq_get_driver_tag(rq, hctx, false);
+ return blk_mq_get_driver_tag(rq);
}
- wait = &this_hctx->dispatch_wait;
+ wait = &hctx->dispatch_wait;
if (!list_empty_careful(&wait->entry))
return false;
- spin_lock(&this_hctx->lock);
+ wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+
+ spin_lock_irq(&wq->lock);
+ spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
- ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
- add_wait_queue(&ws->wait, wait);
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue(wq, wait);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*/
- ret = blk_mq_get_driver_tag(rq, hctx, false);
+ ret = blk_mq_get_driver_tag(rq);
if (!ret) {
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return false;
}
@@ -1066,16 +1069,47 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
- spin_lock_irq(&ws->wait.lock);
list_del_init(&wait->entry);
- spin_unlock_irq(&ws->wait.lock);
- spin_unlock(&this_hctx->lock);
+ spin_unlock(&hctx->dispatch_wait_lock);
+ spin_unlock_irq(&wq->lock);
return true;
}
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
+/*
+ * Track how busy dispatch on @hctx is with an Exponential Weighted Moving
+ * Average (EWMA):
+ * - the previous value is weighted 7/8 and the new sample 1/8, so the
+ *   value decays exponentially once the queue stops being busy
+ * - a busy sample contributes 1 << 4 rather than 1, so that the integer
+ *   result does not immediately collapse to 0
+ * Nothing is tracked when the queue has an elevator attached.
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+	unsigned int avg;
+
+	if (hctx->queue->elevator)
+		return;
+
+	avg = hctx->dispatch_busy;
+
+	/* Already idle and still idle: the average stays at 0. */
+	if (avg == 0 && !busy)
+		return;
+
+	avg = avg * (BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1) +
+	      (busy ? 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR : 0);
+
+	hctx->dispatch_busy = avg / BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+}
+
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
+/*
+ * Returns true if we did some work AND can potentially do more.
+ */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget)
{
@@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
break;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
@@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
- if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
blk_mq_put_dispatch_budget(hctx);
/*
* For non-shared tags, the RESTART check
@@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
- bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+ bd.last = !blk_mq_get_driver_tag(nxt);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1206,7 +1240,18 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && (ret == BLK_STS_RESOURCE))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
- }
+
+ blk_mq_update_dispatch_busy(hctx, true);
+ return false;
+ } else
+ blk_mq_update_dispatch_busy(hctx, false);
+
+ /*
+ * If the host/device is unable to accept more work, inform the
+ * caller of that.
+ */
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+ return false;
return (queued + errors) != 0;
}
@@ -1533,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list)
{
+ struct request *rq;
+
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
- spin_lock(&ctx->lock);
- while (!list_empty(list)) {
- struct request *rq;
-
- rq = list_first_entry(list, struct request, queuelist);
+ list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
- list_del_init(&rq->queuelist);
- __blk_mq_insert_req_list(hctx, rq, false);
+ trace_block_rq_insert(hctx->queue, rq);
}
+
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(list, &ctx->rq_list);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
}
@@ -1648,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
+ blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
default:
+ blk_mq_update_dispatch_busy(hctx, false);
*cookie = BLK_QC_T_NONE;
break;
}
@@ -1689,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
- if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+ if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(hctx);
goto insert;
}
@@ -1737,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
return ret;
}
+/*
+ * Issue the requests on @list one at a time through
+ * blk_mq_request_issue_directly(), taking each off the front of the list.
+ *
+ * A request that comes back with BLK_STS_RESOURCE or
+ * BLK_STS_DEV_RESOURCE is put back at the head of @list and the loop
+ * stops, leaving it and everything behind it on @list for the caller.
+ * Any other non-OK status completes the request with that status.
+ * @hctx is not referenced in this function body.
+ */
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+		struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct request *rq = list_first_entry(list, struct request,
+				queuelist);
+		blk_status_t ret;
+
+		list_del_init(&rq->queuelist);
+		ret = blk_mq_request_issue_directly(rq);
+		if (ret == BLK_STS_OK)
+			continue;
+
+		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
+			/* Out of resources: hand the request back and stop. */
+			list_add(&rq->queuelist, list);
+			break;
+		}
+
+		blk_mq_end_request(rq, ret);
+	}
+}
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
@@ -1747,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
- unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1763,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
if (blk_mq_sched_bio_merge(q, bio))
return BLK_QC_T_NONE;
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+ rq_qos_throttle(q, bio, NULL);
trace_block_getrq(q, bio, bio->bi_opf);
rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
+ rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
- wbt_track(rq, wb_acct);
+ rq_qos_track(q, rq, bio);
cookie = request_to_qc_t(data.hctx, rq);
@@ -1838,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if (q->nr_hw_queues > 1 && is_sync) {
+ } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+ !data.hctx->dispatch_busy)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@ -2137,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->nr_ctx = 0;
+ spin_lock_init(&hctx->dispatch_wait_lock);
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
@@ -2322,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared) {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_inc(&q->shared_hctx_restart);
+ if (shared)
hctx->flags |= BLK_MQ_F_TAG_SHARED;
- } else {
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- atomic_dec(&q->shared_hctx_restart);
+ else
hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
- }
}
}
@@ -2361,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
blk_mq_update_tag_set_depth(set, false);
}
mutex_unlock(&set->tag_list_lock);
- synchronize_rcu();
INIT_LIST_HEAD(&q->tag_set_list);
}
@@ -2676,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
if (set->ops->map_queues) {
- int cpu;
/*
* transport .map_queues is usually done in the following
* way:
@@ -2691,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
- for_each_possible_cpu(cpu)
- set->mq_map[cpu] = 0;
+ blk_mq_clear_mq_map(set);
return set->ops->map_queues(set);
} else
@@ -2702,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
- * requested depth down, if if it too large. In that case, the set
+ * requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 89231e439b2f..9497b47e2526 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
- bool wait);
+bool blk_mq_get_driver_tag(struct request *rq);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq);
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list);
/*
* CPU -> queue mappings
@@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(hctx, rq);
}
+static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set->mq_map[cpu] = 0; /* reset the map entry of every possible CPU to 0 */
+}
+
#endif
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
new file mode 100644
index 000000000000..0005dfd568dd
--- /dev/null
+++ b/block/blk-rq-qos.c
@@ -0,0 +1,194 @@
+#include "blk-rq-qos.h"
+
+/*
+ * Atomically add one to 'v', but only while its value is still below
+ * 'below'. Returns true if the increment was performed, false if 'v' had
+ * already reached (or passed) 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, unsigned int below)
+{
+	unsigned int seen = atomic_read(v);
+	unsigned int expected;
+
+	do {
+		if (seen >= below)
+			return false;
+		/* On a lost race, retry from the value cmpxchg observed. */
+		expected = seen;
+		seen = atomic_cmpxchg(v, expected, expected + 1);
+	} while (seen != expected);
+
+	return true;
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
+{
+ return atomic_inc_below(&rq_wait->inflight, limit); /* true iff ->inflight was below limit and has been incremented */
+}
+
+/* Call the optional ->cleanup() hook of every policy on @q's rq_qos list. */
+void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->cleanup)
+			rqos->ops->cleanup(rqos, bio);
+		rqos = rqos->next;
+	}
+}
+
+/* Call the optional ->done() hook of every policy on @q's rq_qos list. */
+void rq_qos_done(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->done)
+			rqos->ops->done(rqos, rq);
+		rqos = rqos->next;
+	}
+}
+
+/* Call the optional ->issue() hook of every policy on @q's rq_qos list. */
+void rq_qos_issue(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->issue)
+			rqos->ops->issue(rqos, rq);
+		rqos = rqos->next;
+	}
+}
+
+/* Call the optional ->requeue() hook of every policy on @q's rq_qos list. */
+void rq_qos_requeue(struct request_queue *q, struct request *rq)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->requeue)
+			rqos->ops->requeue(rqos, rq);
+		rqos = rqos->next;
+	}
+}
+
+/*
+ * Call the optional ->throttle() hook of every policy on @q's rq_qos
+ * list, passing @bio and @lock through unchanged.
+ */
+void rq_qos_throttle(struct request_queue *q, struct bio *bio,
+		     spinlock_t *lock)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->throttle)
+			rqos->ops->throttle(rqos, bio, lock);
+		rqos = rqos->next;
+	}
+}
+
+/* Call the optional ->track() hook of every policy on @q's rq_qos list. */
+void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->track)
+			rqos->ops->track(rqos, rq, bio);
+		rqos = rqos->next;
+	}
+}
+
+/* Call the optional ->done_bio() hook of every policy on @q's rq_qos list. */
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos) {
+		if (rqos->ops->done_bio)
+			rqos->ops->done_bio(rqos, bio);
+		rqos = rqos->next;
+	}
+}
+
+/*
+ * Recompute rqd->max_depth from rqd->queue_depth, rqd->default_depth and
+ * rqd->scale_step.
+ *
+ * Returns true if the depth can't be increased any further by scaling.
+ */
+bool rq_depth_calc_max_depth(struct rq_depth *rqd)
+{
+	unsigned int depth;
+	bool at_max = false;
+
+	/*
+	 * QD=1 devices are a special case. It's important for them to have
+	 * one request ready when one completes, so a depth of 2 is forced.
+	 * On the backend it'll be a depth of 1 anyway, since the device
+	 * can't have more than that in flight. While scaled down, stay at 1.
+	 */
+	if (rqd->queue_depth == 1) {
+		if (rqd->scale_step > 0) {
+			rqd->max_depth = 1;
+			return false;
+		}
+		rqd->max_depth = 2;
+		return true;
+	}
+
+	/*
+	 * scale_step == 0 is the default state. After latency spikes the
+	 * step is > 0 and the allowed depth shrinks. With step < 0 a
+	 * temporarily higher depth is allowed, capped at 3/4 of the queue
+	 * depth.
+	 */
+	depth = min_t(unsigned int, rqd->default_depth, rqd->queue_depth);
+	if (rqd->scale_step > 0) {
+		depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
+	} else if (rqd->scale_step < 0) {
+		unsigned int maxd = 3 * rqd->queue_depth / 4;
+
+		depth = 1 + ((depth - 1) << -rqd->scale_step);
+		if (depth > maxd) {
+			depth = maxd;
+			at_max = true;
+		}
+	}
+
+	rqd->max_depth = depth;
+	return at_max;
+}
+
+/*
+ * Move @rqd one scale step towards a larger depth and recompute
+ * max_depth. Nothing happens if the previous round already hit the
+ * maximum (rqd->scaled_max set).
+ */
+void rq_depth_scale_up(struct rq_depth *rqd)
+{
+	if (!rqd->scaled_max) {
+		rqd->scale_step--;
+		rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+	}
+}
+
+/*
+ * Scale @rqd down and recompute max_depth. If 'hard_throttle' is set
+ * while the step is negative, jump straight back to step 0 instead of
+ * moving a single step.
+ */
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+{
+	/*
+	 * max_depth == 1 is the floor. Stopping here also keeps
+	 * ->scale_step from growing without bound if the device can't
+	 * keep up.
+	 */
+	if (rqd->max_depth == 1)
+		return;
+
+	if (hard_throttle && rqd->scale_step < 0)
+		rqd->scale_step = 0;
+	else
+		rqd->scale_step++;
+
+	rqd->scaled_max = false;
+	rq_depth_calc_max_depth(rqd);
+}
+
+/*
+ * Detach every policy from @q. Each entry is unlinked from the head of
+ * the list before its ->exit() hook is called; ->exit() is called
+ * without a NULL check.
+ */
+void rq_qos_exit(struct request_queue *q)
+{
+	struct rq_qos *rqos;
+
+	while ((rqos = q->rq_qos) != NULL) {
+		q->rq_qos = rqos->next;
+		rqos->ops->exit(rqos);
+	}
+}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
new file mode 100644
index 000000000000..32b02efbfa66
--- /dev/null
+++ b/block/blk-rq-qos.h
@@ -0,0 +1,109 @@
+#ifndef RQ_QOS_H
+#define RQ_QOS_H
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
+enum rq_qos_id { /* identifies one policy on a queue's rq_qos list */
+ RQ_QOS_WBT, /* the id wbt_rq_qos() searches for */
+ RQ_QOS_CGROUP, /* the id blkcg_rq_qos() searches for */
+};
+
+struct rq_wait {
+ wait_queue_head_t wait; /* set up by init_waitqueue_head() in rq_wait_init() */
+ atomic_t inflight; /* starts at 0; bumped by rq_wait_inc_below() while below the limit */
+};
+
+struct rq_qos { /* one entry of the singly linked list headed at q->rq_qos */
+ struct rq_qos_ops *ops; /* hooks invoked by the rq_qos_*() walkers */
+ struct request_queue *q; /* presumably the queue this entry is attached to; rq_qos_add() does not set it */
+ enum rq_qos_id id; /* key matched by rq_qos_id() */
+ struct rq_qos *next; /* next entry, NULL at the end of the list */
+};
+
+struct rq_qos_ops { /* every hook except ->exit is NULL-checked by its rq_qos_*() caller */
+ void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); /* via rq_qos_throttle() */
+ void (*track)(struct rq_qos *, struct request *, struct bio *); /* via rq_qos_track() */
+ void (*issue)(struct rq_qos *, struct request *); /* via rq_qos_issue() */
+ void (*requeue)(struct rq_qos *, struct request *); /* via rq_qos_requeue() */
+ void (*done)(struct rq_qos *, struct request *); /* via rq_qos_done() */
+ void (*done_bio)(struct rq_qos *, struct bio *); /* via rq_qos_done_bio() */
+ void (*cleanup)(struct rq_qos *, struct bio *); /* via rq_qos_cleanup() */
+ void (*exit)(struct rq_qos *); /* called unconditionally by rq_qos_exit(), so it must be set */
+};
+
+struct rq_depth { /* depth-scaling state used by the rq_depth_*() helpers */
+ unsigned int max_depth; /* result written by rq_depth_calc_max_depth() */
+
+ int scale_step; /* 0 is the default; > 0 shrinks the depth, < 0 raises it */
+ bool scaled_max; /* set from rq_depth_calc_max_depth()'s return in rq_depth_scale_up() */
+
+ unsigned int queue_depth; /* input; a value of 1 takes the QD=1 special case */
+ unsigned int default_depth; /* input; starting depth is min(default_depth, queue_depth) */
+};
+
+/*
+ * Return the first entry on @q's rq_qos list whose id equals @id, or
+ * NULL if there is none.
+ */
+static inline struct rq_qos *rq_qos_id(struct request_queue *q,
+				       enum rq_qos_id id)
+{
+	struct rq_qos *rqos = q->rq_qos;
+
+	while (rqos && rqos->id != id)
+		rqos = rqos->next;
+
+	return rqos;
+}
+
+static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_WBT); /* NULL when no RQ_QOS_WBT entry is on the list */
+}
+
+static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_CGROUP); /* NULL when no RQ_QOS_CGROUP entry is on the list */
+}
+
+/* Prepare @rq_wait for use: empty wait queue, zero requests in flight. */
+static inline void rq_wait_init(struct rq_wait *rq_wait)
+{
+	init_waitqueue_head(&rq_wait->wait);
+	atomic_set(&rq_wait->inflight, 0);
+}
+
+static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+{
+ rqos->next = q->rq_qos; /* new entry points at the current head ... */
+ q->rq_qos = rqos; /* ... and becomes the new head of the list */
+}
+
+/*
+ * Unlink @rqos from @q's singly linked rq_qos list. Does nothing if
+ * @rqos is not on the list. Only the list links are updated; @rqos
+ * itself is neither freed nor otherwise modified.
+ */
+static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+{
+	struct rq_qos **cur;
+
+	/*
+	 * Walk the link pointers instead of the nodes so the list head
+	 * (q->rq_qos) and an interior ->next are rewritten the same way.
+	 * Removing the first entry must make the head point at its
+	 * successor, not at the entry being removed.
+	 */
+	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+		if (*cur == rqos) {
+			*cur = rqos->next;
+			break;
+		}
+	}
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
+void rq_depth_scale_up(struct rq_depth *rqd);
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_calc_max_depth(struct rq_depth *rqd);
+
+void rq_qos_cleanup(struct request_queue *, struct bio *);
+void rq_qos_done(struct request_queue *, struct request *);
+void rq_qos_issue(struct request_queue *, struct request *);
+void rq_qos_requeue(struct request_queue *, struct request *);
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
+void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
+void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
+void rq_qos_exit(struct request_queue *);
+#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d1de71124656..ffd459969689 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
- lim->max_discard_segments = 1;
+ lim->max_discard_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
@@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
- wbt_set_queue_depth(q->rq_wb, depth);
+ wbt_set_queue_depth(q, depth);
}
EXPORT_SYMBOL(blk_set_queue_depth);
@@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
queue_flag_clear(QUEUE_FLAG_FUA, q);
spin_unlock_irq(q->queue_lock);
- wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 01e2b353a2b9..15c1f5e12eb8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -144,6 +144,7 @@ do_local:
local_irq_restore(flags);
}
+EXPORT_SYMBOL(__blk_complete_request);
/**
* blk_complete_request - end I/O on a request
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 175c143ac5b9..7587b1c3caaf 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -17,7 +17,7 @@ struct blk_queue_stats {
bool enable_accounting;
};
-static void blk_stat_init(struct blk_rq_stat *stat)
+void blk_rq_stat_init(struct blk_rq_stat *stat)
{
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
@@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat)
}
/* src is a per-cpu stat, mean isn't initialized */
-static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
if (!src->nr_samples)
return;
@@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
dst->nr_samples += src->nr_samples;
}
-static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
+void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
{
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
@@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now)
continue;
stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
- __blk_stat_add(stat, value);
+ blk_rq_stat_add(stat, value);
put_cpu_ptr(cb->cpu_stat);
}
rcu_read_unlock();
@@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t)
int cpu;
for (bucket = 0; bucket < cb->buckets; bucket++)
- blk_stat_init(&cb->stat[bucket]);
+ blk_rq_stat_init(&cb->stat[bucket]);
for_each_online_cpu(cpu) {
struct blk_rq_stat *cpu_stat;
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
for (bucket = 0; bucket < cb->buckets; bucket++) {
- blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
- blk_stat_init(&cpu_stat[bucket]);
+ blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+ blk_rq_stat_init(&cpu_stat[bucket]);
}
}
@@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q,
cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
for (bucket = 0; bucket < cb->buckets; bucket++)
- blk_stat_init(&cpu_stat[bucket]);
+ blk_rq_stat_init(&cpu_stat[bucket]);
}
spin_lock(&q->stats->lock);
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 78399cdde9c9..f4a1568e81a4 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
}
+void blk_rq_stat_add(struct blk_rq_stat *, u64);
+void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+void blk_rq_stat_init(struct blk_rq_stat *);
+
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 94987b1f69e1..bb109bb0a055 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
{
- if (!q->rq_wb)
+ if (!wbt_rq_qos(q))
return -EINVAL;
- return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+ return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
size_t count)
{
- struct rq_wb *rwb;
+ struct rq_qos *rqos;
ssize_t ret;
s64 val;
@@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
if (val < -1)
return -EINVAL;
- rwb = q->rq_wb;
- if (!rwb) {
+ rqos = wbt_rq_qos(q);
+ if (!rqos) {
ret = wbt_init(q);
if (ret)
return ret;
}
- rwb = q->rq_wb;
if (val == -1)
- rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ val = wbt_default_latency_nsec(q);
else if (val >= 0)
- rwb->min_lat_nsec = val * 1000ULL;
+ val *= 1000ULL;
- if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
- rwb->enable_state = WBT_STATE_ON_MANUAL;
+ wbt_set_min_lat(q, val);
- wbt_update_limits(rwb);
+ wbt_update_limits(q);
return count;
}
@@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
+ if (!blk_queue_dead(q)) {
+ /*
+ * Last reference was dropped without having called
+ * blk_cleanup_queue().
+ */
+ WARN_ONCE(blk_queue_init_done(q),
+ "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n",
+ q);
+ blk_exit_queue(q);
+ }
+
+ WARN(blk_queue_root_blkg(q),
+ "request queue %p is being released but it has not yet been removed from the blkcg controller\n",
+ q);
+
blk_free_queue_stats(q->stats);
blk_exit_rl(q, &q->root_rl);
@@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
- wbt_exit(q);
+ rq_qos_exit(q);
mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 82282e6fdcf8..a3eede00d302 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
- tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
low_valid = true;
+ break;
+ }
}
rcu_read_unlock();
@@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
/* Calc approx time to dispatch */
- jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
-
- if (jiffy_wait > jiffy_elapsed)
- jiffy_wait = jiffy_wait - jiffy_elapsed;
- else
- jiffy_wait = 1;
+ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
if (wait)
*wait = jiffy_wait;
@@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (bio->bi_css) {
- if (bio->bi_cg_private)
- blkg_put(tg_to_blkg(bio->bi_cg_private));
- bio->bi_cg_private = tg;
- blkg_get(tg_to_blkg(tg));
- }
+ if (bio->bi_css)
+ bio_associate_blkg(bio, tg_to_blkg(tg));
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
#endif
}
@@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
void blk_throtl_bio_endio(struct bio *bio)
{
+ struct blkcg_gq *blkg;
struct throtl_grp *tg;
u64 finish_time_ns;
unsigned long finish_time;
@@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long lat;
int rw = bio_data_dir(bio);
- tg = bio->bi_cg_private;
- if (!tg)
+ blkg = bio->bi_blkg;
+ if (!blkg)
return;
- bio->bi_cg_private = NULL;
+ tg = blkg_to_tg(blkg);
finish_time_ns = ktime_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10;
finish_time = __bio_issue_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time) {
- blkg_put(tg_to_blkg(tg));
+ if (!start_time || finish_time <= start_time)
return;
- }
lat = finish_time - start_time;
/* this is only for bio based driver */
@@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio)
tg->bio_cnt /= 2;
tg->bad_bio_cnt /= 2;
}
-
- blkg_put(tg_to_blkg(tg));
}
#endif
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4b8a48d48ba1..f2cfd56e1606 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -210,6 +210,7 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
+ req->rq_flags &= ~RQF_TIMED_OUT;
blk_rq_set_deadline(req, jiffies + req->timeout);
/*
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4f89b28fa652..1d94a20374fc 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include "blk-wbt.h"
+#include "blk-rq-qos.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
return rwb && rwb->wb_normal != 0;
}
-/*
- * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- * false if 'v' + 1 would be bigger than 'below'.
- */
-static bool atomic_inc_below(atomic_t *v, int below)
-{
- int cur = atomic_read(v);
-
- for (;;) {
- int old;
-
- if (cur >= below)
- return false;
- old = atomic_cmpxchg(v, cur, cur + 1);
- if (old == cur)
- break;
- cur = old;
- }
-
- return true;
-}
-
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
if (rwb_enabled(rwb)) {
@@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
+ struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
return time_before(jiffies, wb->dirty_sleep + HZ);
}
@@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
}
}
-void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
+ struct rq_wb *rwb = RQWB(rqos);
struct rq_wait *rqw;
int inflight, limit;
@@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
int diff = limit - inflight;
if (!inflight || diff >= rwb->wb_background / 2)
- wake_up_all(&rqw->wait);
+ wake_up(&rqw->wait);
}
}
@@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
* Called on completion of a request. Note that it's also called when
* a request is merged, when the request gets freed.
*/
-void wbt_done(struct rq_wb *rwb, struct request *rq)
+static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
- if (!rwb)
- return;
+ struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
if (rwb->sync_cookie == rq) {
@@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
wb_timestamp(rwb, &rwb->last_comp);
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
- __wbt_done(rwb, wbt_flags(rq));
+ __wbt_done(rqos, wbt_flags(rq));
}
wbt_clear_state(rq);
}
-/*
- * Return true, if we can't increase the depth further by scaling
- */
-static bool calc_wb_limits(struct rq_wb *rwb)
-{
- unsigned int depth;
- bool ret = false;
-
- if (!rwb->min_lat_nsec) {
- rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- return false;
- }
-
- /*
- * For QD=1 devices, this is a special case. It's important for those
- * to have one request ready when one completes, so force a depth of
- * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
- * since the device can't have more than that in flight. If we're
- * scaling down, then keep a setting of 1/1/1.
- */
- if (rwb->queue_depth == 1) {
- if (rwb->scale_step > 0)
- rwb->wb_max = rwb->wb_normal = 1;
- else {
- rwb->wb_max = rwb->wb_normal = 2;
- ret = true;
- }
- rwb->wb_background = 1;
- } else {
- /*
- * scale_step == 0 is our default state. If we have suffered
- * latency spikes, step will be > 0, and we shrink the
- * allowed write depths. If step is < 0, we're only doing
- * writes, and we allow a temporarily higher depth to
- * increase performance.
- */
- depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
- if (rwb->scale_step > 0)
- depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
- else if (rwb->scale_step < 0) {
- unsigned int maxd = 3 * rwb->queue_depth / 4;
-
- depth = 1 + ((depth - 1) << -rwb->scale_step);
- if (depth > maxd) {
- depth = maxd;
- ret = true;
- }
- }
-
- /*
- * Set our max/normal/bg queue depths based on how far
- * we have scaled down (->scale_step).
- */
- rwb->wb_max = depth;
- rwb->wb_normal = (rwb->wb_max + 1) / 2;
- rwb->wb_background = (rwb->wb_max + 3) / 4;
- }
-
- return ret;
-}
-
static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
/*
@@ -307,7 +225,8 @@ enum {
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
- struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct rq_depth *rqd = &rwb->rq_depth;
u64 thislat;
/*
@@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_EXCEEDED;
}
- if (rwb->scale_step)
+ if (rqd->scale_step)
trace_wbt_stat(bdi, stat);
return LAT_OK;
@@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
- struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct rq_depth *rqd = &rwb->rq_depth;
- trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
- rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+ trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
+ rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}
-static void scale_up(struct rq_wb *rwb)
+static void calc_wb_limits(struct rq_wb *rwb)
{
- /*
- * Hit max in previous round, stop here
- */
- if (rwb->scaled_max)
- return;
+ if (rwb->min_lat_nsec == 0) {
+ rwb->wb_normal = rwb->wb_background = 0;
+ } else if (rwb->rq_depth.max_depth <= 2) {
+ rwb->wb_normal = rwb->rq_depth.max_depth;
+ rwb->wb_background = 1;
+ } else {
+ rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
+ rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
+ }
+}
- rwb->scale_step--;
+/*
+ * Scale the depth limits up one step and recompute the derived
+ * wb_normal/wb_background values. The rwb_wake_all() call that
+ * scale_up() made before this refactor is kept: once the limits have
+ * been raised, submitters sleeping on the old limits should get to
+ * re-check them.
+ */
+static void scale_up(struct rq_wb *rwb)
+{
+ rq_depth_scale_up(&rwb->rq_depth);
+ calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
-
- rwb->scaled_max = calc_wb_limits(rwb);
-
rwb_wake_all(rwb);
-
- rwb_trace_step(rwb, "step up");
+ rwb_trace_step(rwb, "scale up");
}
-/*
- * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
- */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
- /*
- * Stop scaling down when we've hit the limit. This also prevents
- * ->scale_step from going to crazy values, if the device can't
- * keep up.
- */
- if (rwb->wb_max == 1)
- return;
-
- if (rwb->scale_step < 0 && hard_throttle)
- rwb->scale_step = 0;
- else
- rwb->scale_step++;
-
- rwb->scaled_max = false;
- rwb->unknown_cnt = 0;
+ rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
calc_wb_limits(rwb);
- rwb_trace_step(rwb, "step down");
+ rwb->unknown_cnt = 0;
+ rwb_wake_all(rwb);
+ rwb_trace_step(rwb, "scale down");
}
static void rwb_arm_timer(struct rq_wb *rwb)
{
- if (rwb->scale_step > 0) {
+ struct rq_depth *rqd = &rwb->rq_depth;
+
+ if (rqd->scale_step > 0) {
/*
* We should speed this up, using some variant of a fast
* integer inverse square root calculation. Since we only do
@@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
* though.
*/
rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
- int_sqrt((rwb->scale_step + 1) << 8));
+ int_sqrt((rqd->scale_step + 1) << 8));
} else {
/*
* For step < 0, we don't want to increase/decrease the
@@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
static void wb_timer_fn(struct blk_stat_callback *cb)
{
struct rq_wb *rwb = cb->data;
+ struct rq_depth *rqd = &rwb->rq_depth;
unsigned int inflight = wbt_inflight(rwb);
int status;
status = latency_exceeded(rwb, cb->stat);
- trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
+ trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
inflight);
/*
@@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
* currently don't have a valid read/write sample. For that
* case, slowly return to center state (step == 0).
*/
- if (rwb->scale_step > 0)
+ if (rqd->scale_step > 0)
scale_up(rwb);
- else if (rwb->scale_step < 0)
+ else if (rqd->scale_step < 0)
scale_down(rwb, false);
break;
default:
@@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
/*
* Re-arm timer, if we have IO in flight
*/
- if (rwb->scale_step || inflight)
+ if (rqd->scale_step || inflight)
rwb_arm_timer(rwb);
}
-void wbt_update_limits(struct rq_wb *rwb)
+static void __wbt_update_limits(struct rq_wb *rwb)
{
- rwb->scale_step = 0;
- rwb->scaled_max = false;
+ struct rq_depth *rqd = &rwb->rq_depth;
+
+ rqd->scale_step = 0;
+ rqd->scaled_max = false;
+
+ rq_depth_calc_max_depth(rqd);
calc_wb_limits(rwb);
rwb_wake_all(rwb);
}
+/*
+ * Recompute the limits of @q's writeback-throttling policy via
+ * __wbt_update_limits(). Does nothing if @q has no RQ_QOS_WBT entry.
+ */
+void wbt_update_limits(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+
+	if (rqos)
+		__wbt_update_limits(RQWB(rqos));
+}
+
+/*
+ * Return min_lat_nsec of @q's writeback-throttling policy, or 0 if @q
+ * has no RQ_QOS_WBT entry.
+ */
+u64 wbt_get_min_lat(struct request_queue *q)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+
+	return rqos ? RQWB(rqos)->min_lat_nsec : 0;
+}
+
+/*
+ * Store @val as min_lat_nsec of @q's writeback-throttling policy, mark
+ * the policy WBT_STATE_ON_MANUAL and recompute its limits. Does nothing
+ * if @q has no RQ_QOS_WBT entry.
+ */
+void wbt_set_min_lat(struct request_queue *q, u64 val)
+{
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	struct rq_wb *rwb;
+
+	if (!rqos)
+		return;
+
+	rwb = RQWB(rqos);
+	rwb->min_lat_nsec = val;
+	rwb->enable_state = WBT_STATE_ON_MANUAL;
+	__wbt_update_limits(rwb);
+}
+
+
static bool close_io(struct rq_wb *rwb)
{
const unsigned long now = jiffies;
@@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
* IO for a bit.
*/
if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
- limit = rwb->wb_max;
+ limit = rwb->rq_depth.max_depth;
else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
/*
* If less than 100ms since we completed unrelated IO,
@@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
return limit;
}
-static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
- wait_queue_entry_t *wait, unsigned long rw)
-{
- /*
- * inc it here even if disabled, since we'll dec it at completion.
- * this only happens if the task was sleeping in __wbt_wait(),
- * and someone turned it off at the same time.
- */
- if (!rwb_enabled(rwb)) {
- atomic_inc(&rqw->inflight);
- return true;
- }
-
- /*
- * If the waitqueue is already active and we are not the next
- * in line to be woken up, wait for our turn.
- */
- if (waitqueue_active(&rqw->wait) &&
- rqw->wait.head.next != &wait->entry)
- return false;
-
- return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
-}
-
/*
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
@@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
__acquires(lock)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
- DEFINE_WAIT(wait);
+ DECLARE_WAITQUEUE(wait, current);
- if (may_queue(rwb, rqw, &wait, rw))
+ /*
+ * inc it here even if disabled, since we'll dec it at completion.
+ * this only happens if the task was sleeping in __wbt_wait(),
+ * and someone turned it off at the same time.
+ */
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
return;
+ }
+ if (!waitqueue_active(&rqw->wait)
+ && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+ return;
+
+ add_wait_queue_exclusive(&rqw->wait, &wait);
do {
- prepare_to_wait_exclusive(&rqw->wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
- if (may_queue(rwb, rqw, &wait, rw))
+ if (!rwb_enabled(rwb)) {
+ atomic_inc(&rqw->inflight);
+ break;
+ }
+
+ if (rq_wait_inc_below(rqw, get_limit(rwb, rw)))
break;
if (lock) {
@@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
io_schedule();
} while (1);
- finish_wait(&rqw->wait, &wait);
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&rqw->wait, &wait);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
}
}
+static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
+{
+ enum wbt_flags flags = 0;
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ flags = WBT_READ;
+ } else if (wbt_should_throttle(rwb, bio)) {
+ if (current_is_kswapd())
+ flags |= WBT_KSWAPD;
+ if (bio_op(bio) == REQ_OP_DISCARD)
+ flags |= WBT_DISCARD;
+ flags |= WBT_TRACKED;
+ }
+ return flags;
+}
+
+static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
+ __wbt_done(rqos, flags);
+}
+
/*
* Returns true if the IO request should be accounted, false if not.
* May sleep, if we have exceeded the writeback limits. Caller can pass
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
*/
-enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
{
- enum wbt_flags ret = 0;
+ struct rq_wb *rwb = RQWB(rqos);
+ enum wbt_flags flags;
if (!rwb_enabled(rwb))
- return 0;
+ return;
- if (bio_op(bio) == REQ_OP_READ)
- ret = WBT_READ;
+ flags = bio_to_wbt_flags(rwb, bio);
if (!wbt_should_throttle(rwb, bio)) {
- if (ret & WBT_READ)
+ if (flags & WBT_READ)
wb_timestamp(rwb, &rwb->last_issue);
- return ret;
+ return;
}
if (current_is_kswapd())
- ret |= WBT_KSWAPD;
+ flags |= WBT_KSWAPD;
if (bio_op(bio) == REQ_OP_DISCARD)
- ret |= WBT_DISCARD;
+ flags |= WBT_DISCARD;
- __wbt_wait(rwb, ret, bio->bi_opf, lock);
+ __wbt_wait(rwb, flags, bio->bi_opf, lock);
if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
+}
- return ret | WBT_TRACKED;
+static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}
-void wbt_issue(struct rq_wb *rwb, struct request *rq)
+void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
+ struct rq_wb *rwb = RQWB(rqos);
+
if (!rwb_enabled(rwb))
return;
@@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
}
}
-void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
+ struct rq_wb *rwb = RQWB(rqos);
if (!rwb_enabled(rwb))
return;
if (rq == rwb->sync_cookie) {
@@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
}
}
-void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
- if (rwb) {
- rwb->queue_depth = depth;
- wbt_update_limits(rwb);
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (rqos) {
+ RQWB(rqos)->rq_depth.queue_depth = depth;
+ __wbt_update_limits(RQWB(rqos));
}
}
-void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
- if (rwb)
- rwb->wc = write_cache_on;
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ if (rqos)
+ RQWB(rqos)->wc = write_cache_on;
}
/*
- * Disable wbt, if enabled by default.
- */
-void wbt_disable_default(struct request_queue *q)
-{
- struct rq_wb *rwb = q->rq_wb;
-
- if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
- wbt_exit(q);
-}
-EXPORT_SYMBOL_GPL(wbt_disable_default);
-
-/*
* Enable wbt if defaults are configured that way
*/
void wbt_enable_default(struct request_queue *q)
{
+ struct rq_qos *rqos = wbt_rq_qos(q);
/* Throttling already enabled? */
- if (q->rq_wb)
+ if (rqos)
return;
/* Queue not registered? Maybe shutting down... */
@@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq)
return -1;
}
+static void wbt_exit(struct rq_qos *rqos)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ struct request_queue *q = rqos->q;
+
+ blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_free_callback(rwb->cb);
+ kfree(rwb);
+}
+
+/*
+ * Disable wbt, if enabled by default.
+ */
+void wbt_disable_default(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_wb *rwb;
+ if (!rqos)
+ return;
+ rwb = RQWB(rqos);
+ if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+ rwb->wb_normal = 0;
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+
+static struct rq_qos_ops wbt_rqos_ops = {
+ .throttle = wbt_wait,
+ .issue = wbt_issue,
+ .track = wbt_track,
+ .requeue = wbt_requeue,
+ .done = wbt_done,
+ .cleanup = wbt_cleanup,
+ .exit = wbt_exit,
+};
+
int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
@@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q)
return -ENOMEM;
}
- for (i = 0; i < WBT_NUM_RWQ; i++) {
- atomic_set(&rwb->rq_wait[i].inflight, 0);
- init_waitqueue_head(&rwb->rq_wait[i].wait);
- }
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ rq_wait_init(&rwb->rq_wait[i]);
+ rwb->rqos.id = RQ_QOS_WBT;
+ rwb->rqos.ops = &wbt_rqos_ops;
+ rwb->rqos.q = q;
rwb->last_comp = rwb->last_issue = jiffies;
- rwb->queue = q;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- wbt_update_limits(rwb);
+ rwb->wc = 1;
+ rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+ __wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
*/
- q->rq_wb = rwb;
+ rq_qos_add(q, &rwb->rqos);
blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
- wbt_set_queue_depth(rwb, blk_queue_depth(q));
- wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ wbt_set_queue_depth(q, blk_queue_depth(q));
+ wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
}
-
-void wbt_exit(struct request_queue *q)
-{
- struct rq_wb *rwb = q->rq_wb;
-
- if (rwb) {
- blk_stat_remove_callback(q, rwb->cb);
- blk_stat_free_callback(rwb->cb);
- q->rq_wb = NULL;
- kfree(rwb);
- }
-}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 300df531d0a6..f47218d5b3b2 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -9,6 +9,7 @@
#include <linux/ktime.h>
#include "blk-stat.h"
+#include "blk-rq-qos.h"
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
@@ -35,20 +36,12 @@ enum {
WBT_STATE_ON_MANUAL = 2,
};
-struct rq_wait {
- wait_queue_head_t wait;
- atomic_t inflight;
-};
-
struct rq_wb {
/*
* Settings that govern how we throttle
*/
unsigned int wb_background; /* background writeback */
unsigned int wb_normal; /* normal writeback */
- unsigned int wb_max; /* max throughput writeback */
- int scale_step;
- bool scaled_max;
short enable_state; /* WBT_STATE_* */
@@ -67,15 +60,20 @@ struct rq_wb {
void *sync_cookie;
unsigned int wc;
- unsigned int queue_depth;
unsigned long last_issue; /* last non-throttled issue */
unsigned long last_comp; /* last non-throttled comp */
unsigned long min_lat_nsec;
- struct request_queue *queue;
+ struct rq_qos rqos;
struct rq_wait rq_wait[WBT_NUM_RWQ];
+ struct rq_depth rq_depth;
};
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_wb, rqos);
+}
+
static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
unsigned int i, ret = 0;
@@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
return ret;
}
-#ifdef CONFIG_BLK_WBT
-static inline void wbt_track(struct request *rq, enum wbt_flags flags)
-{
- rq->wbt_flags |= flags;
-}
+#ifdef CONFIG_BLK_WBT
-void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct request *);
-enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
int wbt_init(struct request_queue *);
-void wbt_exit(struct request_queue *);
-void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct request *);
-void wbt_issue(struct rq_wb *, struct request *);
+void wbt_update_limits(struct request_queue *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
-void wbt_set_queue_depth(struct rq_wb *, unsigned int);
-void wbt_set_write_cache(struct rq_wb *, bool);
+u64 wbt_get_min_lat(struct request_queue *q);
+void wbt_set_min_lat(struct request_queue *q, u64 val);
+
+void wbt_set_queue_depth(struct request_queue *, unsigned int);
+void wbt_set_write_cache(struct request_queue *, bool);
u64 wbt_default_latency_nsec(struct request_queue *);
@@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *);
static inline void wbt_track(struct request *rq, enum wbt_flags flags)
{
}
-static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
-{
-}
-static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
-{
-}
-static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
- spinlock_t *lock)
-{
- return 0;
-}
static inline int wbt_init(struct request_queue *q)
{
return -EINVAL;
}
-static inline void wbt_exit(struct request_queue *q)
-{
-}
-static inline void wbt_update_limits(struct rq_wb *rwb)
+static inline void wbt_update_limits(struct request_queue *q)
{
}
-static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_disable_default(struct request_queue *q)
{
}
-static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_enable_default(struct request_queue *q)
{
}
-static inline void wbt_disable_default(struct request_queue *q)
+static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
}
-static inline void wbt_enable_default(struct request_queue *q)
+static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
{
}
-static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+static inline u64 wbt_get_min_lat(struct request_queue *q)
{
+ return 0;
}
-static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
{
}
static inline u64 wbt_default_latency_nsec(struct request_queue *q)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 51000914e23f..c461cf63f1f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev,
/* Get header in the first page */
ofst = 0;
if (!nr_rep) {
- hdr = (struct blk_zone_report_hdr *) addr;
+ hdr = addr;
nr_rep = hdr->nr_zones;
ofst = sizeof(struct blk_zone_report_hdr);
}
diff --git a/block/blk.h b/block/blk.h
index 8d23aea96ce9..d4d67e948920 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_queue *q, struct request_list *rl);
+void blk_exit_queue(struct request_queue *q);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
@@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
extern void blk_drain_queue(struct request_queue *q);
+#ifdef CONFIG_BLK_CGROUP_IOLATENCY
+extern int blk_iolatency_init(struct request_queue *q);
+#else
+static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
+#endif
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index fd31347b7836..bc63b3a2d18c 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
__bounce_end_io_read(bio, &isa_page_pool);
}
+static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct bio *bio;
+
+ /*
+ * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
+ * bio_src->bi_io_vec to bio->bi_io_vec.
+ *
+ * We can't do that anymore, because:
+ *
+ * - The point of cloning the biovec is to produce a bio with a biovec
+ * the caller can modify: bi_idx and bi_bvec_done should be 0.
+ *
+ * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+ * we tried to clone the whole thing bio_alloc_bioset() would fail.
+ * But the clone should succeed as long as the number of biovecs we
+ * actually need to allocate is fewer than BIO_MAX_PAGES.
+ *
+ * - Lastly, bi_vcnt should not be looked at or relied upon by code
+ * that does not own the bio - reason being drivers don't use it for
+ * iterating over the biovec anymore, so expecting it to be kept up
+ * to date (i.e. for clones that share the parent biovec) is just
+ * asking for trouble and would force extra work on
+ * __bio_clone_fast() anyways.
+ */
+
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+ if (!bio)
+ return NULL;
+ bio->bi_disk = bio_src->bi_disk;
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
+ bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ break;
+ case REQ_OP_WRITE_SAME:
+ bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+ break;
+ default:
+ bio_for_each_segment(bv, bio_src, iter)
+ bio->bi_io_vec[bio->bi_vcnt++] = bv;
+ break;
+ }
+
+ if (bio_integrity(bio_src)) {
+ int ret;
+
+ ret = bio_integrity_clone(bio, bio_src, gfp_mask);
+ if (ret < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ }
+
+ bio_clone_blkcg_association(bio, bio_src);
+
+ return bio;
+}
+
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
mempool_t *pool)
{
@@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
generic_make_request(*bio_orig);
*bio_orig = bio;
}
- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+ bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 9419def8c017..f3501cdaf1a6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
- if (IS_ERR(job->request))
- return PTR_ERR(job->request);
- return 0;
+
+ return PTR_ERR_OR_ZERO(job->request);
}
static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
diff --git a/block/bsg.c b/block/bsg.c
index 66602c489956..db588add6ba6 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -13,11 +13,9 @@
#include <linux/init.h>
#include <linux/file.h>
#include <linux/blkdev.h>
-#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
-#include <linux/uio.h>
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/slab.h>
@@ -38,21 +36,10 @@
struct bsg_device {
struct request_queue *queue;
spinlock_t lock;
- struct list_head busy_list;
- struct list_head done_list;
struct hlist_node dev_list;
atomic_t ref_count;
- int queued_cmds;
- int done_cmds;
- wait_queue_head_t wq_done;
- wait_queue_head_t wq_free;
char name[20];
int max_queue;
- unsigned long flags;
-};
-
-enum {
- BSG_F_BLOCK = 1,
};
#define BSG_DEFAULT_CMDS 64
@@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
static struct class *bsg_class;
static int bsg_major;
-static struct kmem_cache *bsg_cmd_cachep;
-
-/*
- * our internal command type
- */
-struct bsg_command {
- struct bsg_device *bd;
- struct list_head list;
- struct request *rq;
- struct bio *bio;
- struct bio *bidi_bio;
- int err;
- struct sg_io_v4 hdr;
-};
-
-static void bsg_free_command(struct bsg_command *bc)
-{
- struct bsg_device *bd = bc->bd;
- unsigned long flags;
-
- kmem_cache_free(bsg_cmd_cachep, bc);
-
- spin_lock_irqsave(&bd->lock, flags);
- bd->queued_cmds--;
- spin_unlock_irqrestore(&bd->lock, flags);
-
- wake_up(&bd->wq_free);
-}
-
-static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
-{
- struct bsg_command *bc = ERR_PTR(-EINVAL);
-
- spin_lock_irq(&bd->lock);
-
- if (bd->queued_cmds >= bd->max_queue)
- goto out;
-
- bd->queued_cmds++;
- spin_unlock_irq(&bd->lock);
-
- bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
- if (unlikely(!bc)) {
- spin_lock_irq(&bd->lock);
- bd->queued_cmds--;
- bc = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- bc->bd = bd;
- INIT_LIST_HEAD(&bc->list);
- bsg_dbg(bd, "returning free cmd %p\n", bc);
- return bc;
-out:
- spin_unlock_irq(&bd->lock);
- return bc;
-}
-
static inline struct hlist_head *bsg_dev_idx_hash(int index)
{
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
@@ -267,8 +196,6 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
} else if (hdr->din_xfer_len) {
ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr->din_xferp),
hdr->din_xfer_len, GFP_KERNEL);
- } else {
- ret = blk_rq_map_user(q, rq, NULL, NULL, 0, GFP_KERNEL);
}
if (ret)
@@ -287,101 +214,6 @@ out:
return ERR_PTR(ret);
}
-/*
- * async completion call-back from the block layer, when scsi/ide/whatever
- * calls end_that_request_last() on a request
- */
-static void bsg_rq_end_io(struct request *rq, blk_status_t status)
-{
- struct bsg_command *bc = rq->end_io_data;
- struct bsg_device *bd = bc->bd;
- unsigned long flags;
-
- bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
- rq, bc, bc->bio);
-
- bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
-
- spin_lock_irqsave(&bd->lock, flags);
- list_move_tail(&bc->list, &bd->done_list);
- bd->done_cmds++;
- spin_unlock_irqrestore(&bd->lock, flags);
-
- wake_up(&bd->wq_done);
-}
-
-/*
- * do final setup of a 'bc' and submit the matching 'rq' to the block
- * layer for io
- */
-static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
- struct bsg_command *bc, struct request *rq)
-{
- int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
-
- /*
- * add bc command to busy queue and submit rq for io
- */
- bc->rq = rq;
- bc->bio = rq->bio;
- if (rq->next_rq)
- bc->bidi_bio = rq->next_rq->bio;
- bc->hdr.duration = jiffies;
- spin_lock_irq(&bd->lock);
- list_add_tail(&bc->list, &bd->busy_list);
- spin_unlock_irq(&bd->lock);
-
- bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
-
- rq->end_io_data = bc;
- blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
-}
-
-static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
-{
- struct bsg_command *bc = NULL;
-
- spin_lock_irq(&bd->lock);
- if (bd->done_cmds) {
- bc = list_first_entry(&bd->done_list, struct bsg_command, list);
- list_del(&bc->list);
- bd->done_cmds--;
- }
- spin_unlock_irq(&bd->lock);
-
- return bc;
-}
-
-/*
- * Get a finished command from the done list
- */
-static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
-{
- struct bsg_command *bc;
- int ret;
-
- do {
- bc = bsg_next_done_cmd(bd);
- if (bc)
- break;
-
- if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
- bc = ERR_PTR(-EAGAIN);
- break;
- }
-
- ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
- if (ret) {
- bc = ERR_PTR(-ERESTARTSYS);
- break;
- }
- } while (1);
-
- bsg_dbg(bd, "returning done %p\n", bc);
-
- return bc;
-}
-
static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct bio *bio, struct bio *bidi_bio)
{
@@ -400,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
return ret;
}
-static bool bsg_complete(struct bsg_device *bd)
-{
- bool ret = false;
- bool spin;
-
- do {
- spin_lock_irq(&bd->lock);
-
- BUG_ON(bd->done_cmds > bd->queued_cmds);
-
- /*
- * All commands consumed.
- */
- if (bd->done_cmds == bd->queued_cmds)
- ret = true;
-
- spin = !test_bit(BSG_F_BLOCK, &bd->flags);
-
- spin_unlock_irq(&bd->lock);
- } while (!ret && spin);
-
- return ret;
-}
-
-static int bsg_complete_all_commands(struct bsg_device *bd)
-{
- struct bsg_command *bc;
- int ret, tret;
-
- bsg_dbg(bd, "entered\n");
-
- /*
- * wait for all commands to complete
- */
- io_wait_event(bd->wq_done, bsg_complete(bd));
-
- /*
- * discard done commands
- */
- ret = 0;
- do {
- spin_lock_irq(&bd->lock);
- if (!bd->queued_cmds) {
- spin_unlock_irq(&bd->lock);
- break;
- }
- spin_unlock_irq(&bd->lock);
-
- bc = bsg_get_done_cmd(bd);
- if (IS_ERR(bc))
- break;
-
- tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
- bc->bidi_bio);
- if (!ret)
- ret = tret;
-
- bsg_free_command(bc);
- } while (1);
-
- return ret;
-}
-
-static int
-__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
- const struct iovec *iov, ssize_t *bytes_read)
-{
- struct bsg_command *bc;
- int nr_commands, ret;
-
- if (count % sizeof(struct sg_io_v4))
- return -EINVAL;
-
- ret = 0;
- nr_commands = count / sizeof(struct sg_io_v4);
- while (nr_commands) {
- bc = bsg_get_done_cmd(bd);
- if (IS_ERR(bc)) {
- ret = PTR_ERR(bc);
- break;
- }
-
- /*
- * this is the only case where we need to copy data back
- * after completing the request. so do that here,
- * bsg_complete_work() cannot do that for us
- */
- ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
- bc->bidi_bio);
-
- if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
- ret = -EFAULT;
-
- bsg_free_command(bc);
-
- if (ret)
- break;
-
- buf += sizeof(struct sg_io_v4);
- *bytes_read += sizeof(struct sg_io_v4);
- nr_commands--;
- }
-
- return ret;
-}
-
-static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
-{
- if (file->f_flags & O_NONBLOCK)
- clear_bit(BSG_F_BLOCK, &bd->flags);
- else
- set_bit(BSG_F_BLOCK, &bd->flags);
-}
-
-/*
- * Check if the error is a "real" error that we should return.
- */
-static inline int err_block_err(int ret)
-{
- if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
- return 1;
-
- return 0;
-}
-
-static ssize_t
-bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
- struct bsg_device *bd = file->private_data;
- int ret;
- ssize_t bytes_read;
-
- bsg_dbg(bd, "read %zd bytes\n", count);
-
- bsg_set_block(bd, file);
-
- bytes_read = 0;
- ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
- *ppos = bytes_read;
-
- if (!bytes_read || err_block_err(ret))
- bytes_read = ret;
-
- return bytes_read;
-}
-
-static int __bsg_write(struct bsg_device *bd, const char __user *buf,
- size_t count, ssize_t *bytes_written, fmode_t mode)
-{
- struct bsg_command *bc;
- struct request *rq;
- int ret, nr_commands;
-
- if (count % sizeof(struct sg_io_v4))
- return -EINVAL;
-
- nr_commands = count / sizeof(struct sg_io_v4);
- rq = NULL;
- bc = NULL;
- ret = 0;
- while (nr_commands) {
- struct request_queue *q = bd->queue;
-
- bc = bsg_alloc_command(bd);
- if (IS_ERR(bc)) {
- ret = PTR_ERR(bc);
- bc = NULL;
- break;
- }
-
- if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
- ret = -EFAULT;
- break;
- }
-
- /*
- * get a request, fill in the blanks, and add to request queue
- */
- rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
- if (IS_ERR(rq)) {
- ret = PTR_ERR(rq);
- rq = NULL;
- break;
- }
-
- bsg_add_command(bd, q, bc, rq);
- bc = NULL;
- rq = NULL;
- nr_commands--;
- buf += sizeof(struct sg_io_v4);
- *bytes_written += sizeof(struct sg_io_v4);
- }
-
- if (bc)
- bsg_free_command(bc);
-
- return ret;
-}
-
-static ssize_t
-bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-{
- struct bsg_device *bd = file->private_data;
- ssize_t bytes_written;
- int ret;
-
- bsg_dbg(bd, "write %zd bytes\n", count);
-
- if (unlikely(uaccess_kernel()))
- return -EINVAL;
-
- bsg_set_block(bd, file);
-
- bytes_written = 0;
- ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
-
- *ppos = bytes_written;
-
- /*
- * return bytes written on non-fatal errors
- */
- if (!bytes_written || err_block_err(ret))
- bytes_written = ret;
-
- bsg_dbg(bd, "returning %zd\n", bytes_written);
- return bytes_written;
-}
-
static struct bsg_device *bsg_alloc_device(void)
{
struct bsg_device *bd;
@@ -637,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void)
return NULL;
spin_lock_init(&bd->lock);
-
bd->max_queue = BSG_DEFAULT_CMDS;
-
- INIT_LIST_HEAD(&bd->busy_list);
- INIT_LIST_HEAD(&bd->done_list);
INIT_HLIST_NODE(&bd->dev_list);
-
- init_waitqueue_head(&bd->wq_free);
- init_waitqueue_head(&bd->wq_done);
return bd;
}
static int bsg_put_device(struct bsg_device *bd)
{
- int ret = 0, do_free;
struct request_queue *q = bd->queue;
mutex_lock(&bsg_mutex);
- do_free = atomic_dec_and_test(&bd->ref_count);
- if (!do_free) {
+ if (!atomic_dec_and_test(&bd->ref_count)) {
mutex_unlock(&bsg_mutex);
- goto out;
+ return 0;
}
hlist_del(&bd->dev_list);
@@ -670,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd)
/*
* close can always block
*/
- set_bit(BSG_F_BLOCK, &bd->flags);
-
- /*
- * correct error detection baddies here again. it's the responsibility
- * of the app to properly reap commands before close() if it wants
- * fool-proof error detection
- */
- ret = bsg_complete_all_commands(bd);
-
kfree(bd);
-out:
- if (do_free)
- blk_put_queue(q);
- return ret;
+ blk_put_queue(q);
+ return 0;
}
static struct bsg_device *bsg_add_device(struct inode *inode,
@@ -706,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
bd->queue = rq;
- bsg_set_block(bd, file);
-
atomic_set(&bd->ref_count, 1);
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
@@ -781,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file)
return bsg_put_device(bd);
}
-static __poll_t bsg_poll(struct file *file, poll_table *wait)
-{
- struct bsg_device *bd = file->private_data;
- __poll_t mask = 0;
-
- poll_wait(file, &bd->wq_done, wait);
- poll_wait(file, &bd->wq_free, wait);
-
- spin_lock_irq(&bd->lock);
- if (!list_empty(&bd->done_list))
- mask |= EPOLLIN | EPOLLRDNORM;
- if (bd->queued_cmds < bd->max_queue)
- mask |= EPOLLOUT;
- spin_unlock_irq(&bd->lock);
-
- return mask;
-}
-
static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct bsg_device *bd = file->private_data;
@@ -872,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
static const struct file_operations bsg_fops = {
- .read = bsg_read,
- .write = bsg_write,
- .poll = bsg_poll,
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
@@ -979,21 +540,12 @@ static int __init bsg_init(void)
int ret, i;
dev_t devid;
- bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
- sizeof(struct bsg_command), 0, 0, NULL);
- if (!bsg_cmd_cachep) {
- printk(KERN_ERR "bsg: failed creating slab cache\n");
- return -ENOMEM;
- }
-
for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
INIT_HLIST_HEAD(&bsg_device_list[i]);
bsg_class = class_create(THIS_MODULE, "bsg");
- if (IS_ERR(bsg_class)) {
- ret = PTR_ERR(bsg_class);
- goto destroy_kmemcache;
- }
+ if (IS_ERR(bsg_class))
+ return PTR_ERR(bsg_class);
bsg_class->devnode = bsg_devnode;
ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
@@ -1014,8 +566,6 @@ unregister_chrdev:
unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
destroy_bsg_class:
class_destroy(bsg_class);
-destroy_kmemcache:
- kmem_cache_destroy(bsg_cmd_cachep);
return ret;
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 82b6c27b3245..2eb87444b157 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
switch (ioprio_class) {
default:
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
+ /* fall through */
case IOPRIO_CLASS_NONE:
/*
* no prio set, inherit CPU scheduling settings
@@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct cfq_data *cfqd = e->elevator_data; \
- unsigned int __data; \
+ unsigned int __data, __min = (MIN), __max = (MAX); \
+ \
cfq_var_store(&__data, (page)); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
+ if (__data < __min) \
+ __data = __min; \
+ else if (__data > __max) \
+ __data = __max; \
if (__CONV) \
*(__PTR) = (u64)__data * NSEC_PER_MSEC; \
else \
@@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX,
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct cfq_data *cfqd = e->elevator_data; \
- unsigned int __data; \
+ unsigned int __data, __min = (MIN), __max = (MAX); \
+ \
cfq_var_store(&__data, (page)); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
+ if (__data < __min) \
+ __data = __min; \
+ else if (__data > __max) \
+ __data = __max; \
*(__PTR) = (u64)__data * NSEC_PER_USEC; \
return count; \
}
diff --git a/block/genhd.c b/block/genhd.c
index f1543a45e73b..8cc719a37b32 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
part_round_stats(gp->queue, cpu, hd);
part_stat_unlock();
part_in_flight(gp->queue, hd, inflight);
- seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
- "%u %lu %lu %lu %u %u %u %u\n",
+ seq_printf(seqf, "%4d %7d %s "
+ "%lu %lu %lu %u "
+ "%lu %lu %lu %u "
+ "%u %u %u "
+ "%lu %lu %lu %u\n",
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
disk_name(gp, hd->partno, buf),
- part_stat_read(hd, ios[READ]),
- part_stat_read(hd, merges[READ]),
- part_stat_read(hd, sectors[READ]),
- jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
- part_stat_read(hd, ios[WRITE]),
- part_stat_read(hd, merges[WRITE]),
- part_stat_read(hd, sectors[WRITE]),
- jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
+ part_stat_read(hd, ios[STAT_READ]),
+ part_stat_read(hd, merges[STAT_READ]),
+ part_stat_read(hd, sectors[STAT_READ]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
+ part_stat_read(hd, ios[STAT_WRITE]),
+ part_stat_read(hd, merges[STAT_WRITE]),
+ part_stat_read(hd, sectors[STAT_WRITE]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
inflight[0],
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
- jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+ jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
+ part_stat_read(hd, ios[STAT_DISCARD]),
+ part_stat_read(hd, merges[STAT_DISCARD]),
+ part_stat_read(hd, sectors[STAT_DISCARD]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
);
}
disk_part_iter_exit(&piter);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3dcfd4ec0e11..5a8975a1201c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev,
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
- "%8u %8u %8u"
+ "%8u %8u %8u "
+ "%8lu %8lu %8llu %8u"
"\n",
- part_stat_read(p, ios[READ]),
- part_stat_read(p, merges[READ]),
- (unsigned long long)part_stat_read(p, sectors[READ]),
- jiffies_to_msecs(part_stat_read(p, ticks[READ])),
- part_stat_read(p, ios[WRITE]),
- part_stat_read(p, merges[WRITE]),
- (unsigned long long)part_stat_read(p, sectors[WRITE]),
- jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
+ part_stat_read(p, ios[STAT_READ]),
+ part_stat_read(p, merges[STAT_READ]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_READ]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
+ part_stat_read(p, ios[STAT_WRITE]),
+ part_stat_read(p, merges[STAT_WRITE]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
inflight[0],
jiffies_to_msecs(part_stat_read(p, io_ticks)),
- jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+ jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+ part_stat_read(p, ios[STAT_DISCARD]),
+ part_stat_read(p, merges[STAT_DISCARD]),
+ (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
+ jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
}
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 007f95eea0e1..903f3ed175d0 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state)
u32 vgda_sector = 0;
u32 vgda_len = 0;
int numlvs = 0;
- struct pvd *pvd;
+ struct pvd *pvd = NULL;
struct lv_info {
unsigned short pps_per_lv;
unsigned short pps_found;
@@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state)
if (lvip[i].pps_per_lv)
foundlvs += 1;
}
+ /* pvd loops depend on n[].name and lvip[].pps_per_lv */
+ pvd = alloc_pvd(state, vgda_sector + 17);
}
put_dev_sector(sect);
}
- pvd = alloc_pvd(state, vgda_sector + 17);
if (pvd) {
int numpps = be16_to_cpu(pvd->pp_count);
int psn_part1 = be32_to_cpu(pvd->psn_part1);
@@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state)
next_lp_ix += 1;
}
for (i = 0; i < state->limit; i += 1)
- if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
+ if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) {
+ char tmp[sizeof(n[i].name) + 1]; // null char
+
+ snprintf(tmp, sizeof(tmp), "%s", n[i].name);
pr_warn("partition %s (%u pp's found) is "
"not contiguous\n",
- n[i].name, lvip[i].pps_found);
+ tmp, lvip[i].pps_found);
+ }
kfree(pvd);
}
kfree(n);
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 0417937dfe99..16766f267559 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
{
char buf[64];
int r_objid, r_name, r_id1, r_id2, len;
- struct vblk_dgrp *dgrp;
BUG_ON (!buffer || !vb);
@@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
if (len != get_unaligned_be32(buffer + 0x14))
return false;
- dgrp = &vb->vblk.dgrp;
-
ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
return true;
}
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 945f4b8610e0..e0de4dd448b3 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -877,7 +877,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
return 0;
}
- if (n > resp->num) {
+ if (n >= resp->num) {
pr_debug("Response has %d tokens. Can't access %d\n",
resp->num, n);
return 0;
@@ -916,7 +916,7 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n)
return 0;
}
- if (n > resp->num) {
+ if (n >= resp->num) {
pr_debug("Response has %d tokens. Can't access %d\n",
resp->num, n);
return 0;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index a98db384048f..62aed77d0bb9 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
.verify_fn = t10_pi_type3_verify_ip,
};
EXPORT_SYMBOL(t10_pi_type3_ip);
+
+/**
+ * t10_pi_prepare - prepare PI prior to submitting request to device
+ * @rq: request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Remap each ref_tag that still holds its expected
+ * virtual value to the matching physical value; other tuples are untouched.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u32 ref_tag = t10_pi_ref_tag(rq);
+ struct bio *bio;
+
+ if (protection_type == T10_PI_TYPE3_PROTECTION)
+ return;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u32 virt = bip_get_seed(bip) & 0xffffffff;
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ /* Already remapped? break skips the remaining bios too. */
+ if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+ break;
+
+ bip_for_each_vec(iv, bip, iter) {
+ void *p, *pmap;
+ unsigned int j;
+
+ pmap = kmap_atomic(iv.bv_page);
+ p = pmap + iv.bv_offset;
+ for (j = 0; j < iv.bv_len; j += tuple_sz) {
+ struct t10_pi_tuple *pi = p;
+
+ if (be32_to_cpu(pi->ref_tag) == virt)
+ pi->ref_tag = cpu_to_be32(ref_tag);
+ virt++;
+ ref_tag++;
+ p += tuple_sz;
+ }
+
+ kunmap_atomic(pmap);
+ }
+
+ bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+ }
+}
+EXPORT_SYMBOL(t10_pi_prepare);
+
+/**
+ * t10_pi_complete - prepare PI prior to returning request to the block layer
+ * @rq: request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ * @intervals: total number of PI tuples to remap
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc. the actual physical start sector is
+ * likely to be different. Since the physical start sector was submitted
+ * to the device, we should remap it back to virtual values expected by the
+ * block layer.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_complete(struct request *rq, u8 protection_type,
+ unsigned int intervals)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u32 ref_tag = t10_pi_ref_tag(rq);
+ struct bio *bio;
+
+ if (protection_type == T10_PI_TYPE3_PROTECTION)
+ return;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u32 virt = bip_get_seed(bip) & 0xffffffff;
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ bip_for_each_vec(iv, bip, iter) {
+ void *p, *pmap;
+ unsigned int j;
+
+ pmap = kmap_atomic(iv.bv_page);
+ p = pmap + iv.bv_offset;
+ for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+ struct t10_pi_tuple *pi = p;
+
+ if (be32_to_cpu(pi->ref_tag) == ref_tag)
+ pi->ref_tag = cpu_to_be32(virt);
+ virt++;
+ ref_tag++;
+ intervals--;
+ p += tuple_sz;
+ }
+
+ kunmap_atomic(pmap);
+ }
+ }
+}
+EXPORT_SYMBOL(t10_pi_complete);