summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2022-03-07 22:44:37 +0300
committerJens Axboe <axboe@kernel.dk>2022-03-07 22:44:37 +0300
commit13400b145426e2a13294fc42c5686dff30f19677 (patch)
treea8cb05b12d92a592037e75f8879f70833a0395a5 /block
parentffb217a13a2eaf6d5bd974fc83036a53ca69f1e2 (diff)
parent97939610b893de068c82c347d06319cd231a4602 (diff)
downloadlinux-13400b145426e2a13294fc42c5686dff30f19677.tar.xz
Merge branch 'for-5.18/block' into for-5.18/write-streams
* for-5.18/block: (96 commits) block: remove bio_devname ext4: stop using bio_devname raid5-ppl: stop using bio_devname raid1: stop using bio_devname md-multipath: stop using bio_devname dm-integrity: stop using bio_devname dm-crypt: stop using bio_devname pktcdvd: remove a pointless debug check in pkt_submit_bio block: remove handle_bad_sector block: fix and cleanup bio_check_ro bfq: fix use-after-free in bfq_dispatch_request blk-crypto: show crypto capabilities in sysfs block: don't delete queue kobject before its children block: simplify calling convention of elv_unregister_queue() block: remove redundant semicolon block: default BLOCK_LEGACY_AUTOLOAD to y block: update io_ticks when io hang block, bfq: don't move oom_bfqq block, bfq: avoid moving bfqq to it's parent bfqg block, bfq: cleanup bfq_bfqq_to_bfqg() ...
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig13
-rw-r--r--block/Makefile3
-rw-r--r--block/bdev.c11
-rw-r--r--block/bfq-cgroup.c16
-rw-r--r--block/bfq-iosched.c19
-rw-r--r--block/bfq-iosched.h2
-rw-r--r--block/bfq-wf2q.c17
-rw-r--r--block/bio-integrity.c1
-rw-r--r--block/bio.c187
-rw-r--r--block/blk-cgroup-rwstat.h2
-rw-r--r--block/blk-cgroup.c15
-rw-r--r--block/blk-cgroup.h477
-rw-r--r--block/blk-core.c286
-rw-r--r--block/blk-crypto-fallback.c2
-rw-r--r--block/blk-crypto-internal.h12
-rw-r--r--block/blk-crypto-sysfs.c172
-rw-r--r--block/blk-crypto.c4
-rw-r--r--block/blk-flush.c4
-rw-r--r--block/blk-iocost.c2
-rw-r--r--block/blk-iolatency.c2
-rw-r--r--block/blk-ioprio.c2
-rw-r--r--block/blk-lib.c46
-rw-r--r--block/blk-merge.c2
-rw-r--r--block/blk-mq-tag.c2
-rw-r--r--block/blk-mq.c64
-rw-r--r--block/blk-sysfs.c19
-rw-r--r--block/blk-throttle.c62
-rw-r--r--block/blk-throttle.h16
-rw-r--r--block/blk-zoned.c14
-rw-r--r--block/blk.h8
-rw-r--r--block/bounce.c11
-rw-r--r--block/disk-events.c2
-rw-r--r--block/elevator.c10
-rw-r--r--block/fops.c35
-rw-r--r--block/genhd.c26
-rw-r--r--block/holder.c2
-rw-r--r--block/partitions/check.h1
-rw-r--r--block/partitions/core.c1
-rw-r--r--block/partitions/efi.h1
-rw-r--r--block/partitions/ldm.h1
-rw-r--r--block/sed-opal.c2
41 files changed, 1101 insertions, 473 deletions
diff --git a/block/Kconfig b/block/Kconfig
index d5d4197b7ed2..7eb5d6d53b3f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,16 @@ menuconfig BLOCK
if BLOCK
+config BLOCK_LEGACY_AUTOLOAD
+ bool "Legacy autoloading support"
+ default y
+ help
+ Enable loading modules and creating block device instances based on
+ accesses through their device special file. This is a historic Linux
+ feature and makes no sense in a udev world where device files are
+ created on demand, but scripts that manually create device nodes and
+ then call losetup might rely on this behavior.
+
config BLK_RQ_ALLOC_TIME
bool
@@ -218,6 +228,9 @@ config BLK_PM
config BLOCK_HOLDER_DEPRECATED
bool
+config BLK_MQ_STACKING
+ bool
+
source "block/Kconfig.iosched"
endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index f38eaa612929..3950ecbc5c26 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
+ blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bdev.c b/block/bdev.c
index 102837a37051..ce8de42a89be 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
- return 0;;
+ return 0;
}
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
@@ -733,12 +733,15 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode) {
+ if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
+ if (inode)
+ pr_warn_ratelimited(
+"block device autoloading is deprecated and will be removed.\n");
}
+ if (!inode)
+ return NULL;
/* switch from the inode reference to a device mode one: */
bdev = &BDEV_I(inode)->bdev;
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..420eda2589c0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
{
struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_group *old_parent = bfqq_group(bfqq);
/*
+ * No point to move bfqq to the same group, which can happen when
+ * root group is offlined
+ */
+ if (old_parent == bfqg)
+ return;
+
+ /*
+ * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+ * until elevator exit.
+ */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+ /*
* Get extra reference to prevent bfqq from being freed in
* next possible expire or deactivate.
*/
@@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_and_blkg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(old_parent);
if (entity->parent &&
entity->parent->last_bfqq_created == bfqq)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 36a66e97e3c2..d23b5112c682 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
@@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -5181,7 +5181,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
- bool waiting_rq, idle_timer_disabled;
+ bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -5189,14 +5189,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx);
-
- idle_timer_disabled =
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ if (in_serv_queue == bfqd->in_service_queue) {
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ }
spin_unlock_irq(&bfqd->lock);
-
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
- idle_timer_disabled);
+ bfq_update_dispatch_stats(hctx->queue, rq,
+ idle_timer_disabled ? in_serv_queue : NULL,
+ idle_timer_disabled);
return rq;
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 07288b9da389..3b83e3d1c2e5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -8,7 +8,6 @@
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
-#include <linux/blk-cgroup.h>
#include "blk-cgroup-rwstat.h"
@@ -1051,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b74cc0da118e..f8eb340381cf 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
@@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
return false;
@@ -519,7 +504,7 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
static unsigned short bfq_weight_to_ioprio(int weight)
{
return max_t(int, 0,
- IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+ IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF);
}
static void bfq_get_entity(struct bfq_entity *entity)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 0827b19820c5..6996e7bd66e9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -420,7 +420,6 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
diff --git a/block/bio.c b/block/bio.c
index 4312a8085396..151cace2dbe1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -15,7 +15,6 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
-#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
struct bio_alloc_cache {
struct bio *free_list;
@@ -249,12 +249,12 @@ static void bio_free(struct bio *bio)
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
-void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
+ unsigned short max_vecs, unsigned int opf)
{
bio->bi_next = NULL;
- bio->bi_bdev = NULL;
- bio->bi_opf = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
@@ -268,6 +268,8 @@ void bio_init(struct bio *bio, struct bio_vec *table,
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
bio->bi_issue.value = 0;
+ if (bdev)
+ bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
bio->bi_iocost_cost = 0;
#endif
@@ -293,6 +295,8 @@ EXPORT_SYMBOL(bio_init);
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
+ * @bdev: block device to use the bio for
+ * @opf: operation and flags for bio
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
@@ -300,11 +304,15 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
atomic_set(&bio->__bi_remaining, 1);
+ bio->bi_bdev = bdev;
+ if (bio->bi_bdev)
+ bio_associate_blkg(bio);
+ bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
@@ -344,6 +352,20 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+{
+ struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
+
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(blk_next_bio);
+
static void bio_alloc_rescue(struct work_struct *work)
{
struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -400,8 +422,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
+ * @bdev: block device to allocate the bio for (can be %NULL)
+ * @nr_vecs: number of bvecs to pre-allocate
+ * @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
@@ -430,15 +454,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
+struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
+ unsigned int opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
void *p;
- /* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
+ /* should not use nobvec bioset for nr_vecs > 0 */
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
return NULL;
/*
@@ -475,23 +500,23 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
return NULL;
bio = p + bs->front_pad;
- if (nr_iovecs > BIO_INLINE_VECS) {
+ if (nr_vecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
- bio_init(bio, bvl, nr_iovecs);
- } else if (nr_iovecs) {
- bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
+ bio_init(bio, bdev, bvl, nr_vecs, opf);
+ } else if (nr_vecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
} else {
- bio_init(bio, NULL, 0);
+ bio_init(bio, bdev, NULL, 0, opf);
}
bio->bi_pool = bs;
@@ -522,7 +547,8 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
if (unlikely(!bio))
return NULL;
- bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
+ bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs,
+ 0);
bio->bi_pool = NULL;
return bio;
}
@@ -702,80 +728,84 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-/**
- * __bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- *
- * Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
- if (bio_flagged(bio_src, BIO_REMAPPED))
+ if (bio->bi_bdev == bio_src->bi_bdev &&
+ bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
bio_clone_blkg_association(bio, bio_src);
blkcg_bio_issue_init(bio);
+
+ if (bio_crypt_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ if (bio_integrity(bio_src) &&
+ bio_integrity_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL(__bio_clone_fast);
/**
- * bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
+ * bio_alloc_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
+ * bio, but not the actual data it points to.
*
- * Like __bio_clone_fast, only also allocates the returned bio
+ * The caller must ensure that the return bio is not freed before @bio_src.
*/
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
return NULL;
- __bio_clone_fast(b, bio);
-
- if (bio_crypt_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- if (bio_integrity(bio) &&
- bio_integrity_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- return b;
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
-err_put:
- bio_put(b);
- return NULL;
+ return bio;
}
-EXPORT_SYMBOL(bio_clone_fast);
+EXPORT_SYMBOL(bio_alloc_clone);
-const char *bio_devname(struct bio *bio, char *buf)
+/**
+ * bio_init_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio: bio to clone into
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ *
+ * Initialize a new bio in caller provided memory that is a clone of @bio_src.
+ * The caller owns the returned bio, but not the actual data it points to.
+ *
+ * The caller must ensure that @bio_src is not freed before @bio.
+ */
+int bio_init_clone(struct block_device *bdev, struct bio *bio,
+ struct bio *bio_src, gfp_t gfp)
{
- return bdevname(bio->bi_bdev, buf);
+ int ret;
+
+ bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ ret = __bio_clone(bio, bio_src, gfp);
+ if (ret)
+ bio_uninit(bio);
+ return ret;
}
-EXPORT_SYMBOL(bio_devname);
+EXPORT_SYMBOL(bio_init_clone);
/**
* bio_full - check if the bio is full
@@ -1054,7 +1084,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
- return 0;
+ return false;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
@@ -1541,7 +1571,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
return NULL;
- split = bio_clone_fast(bio, gfp, bs);
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
@@ -1636,9 +1666,9 @@ EXPORT_SYMBOL(bioset_exit);
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
- * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
- * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
- * dispatch queued requests when the mempool runs out of space.
+ * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
+ * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
+ * to dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
@@ -1708,7 +1738,9 @@ EXPORT_SYMBOL(bioset_init_from_src);
/**
* bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
* @kiocb: kiocb describing the IO
+ * @bdev: block device to allocate the bio for (can be %NULL)
* @nr_vecs: number of iovecs to pre-allocate
+ * @opf: operation and flags for bio
* @bs: bio_set to allocate from
*
* Description:
@@ -1719,14 +1751,14 @@ EXPORT_SYMBOL(bioset_init_from_src);
* MUST be done from process context, not hard/soft IRQ.
*
*/
-struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
- struct bio_set *bs)
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev,
+ unsigned short nr_vecs, unsigned int opf, struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
- return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
cache = per_cpu_ptr(bs->cache, get_cpu());
if (cache->free_list) {
@@ -1734,13 +1766,14 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
- bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL,
+ nr_vecs, opf);
bio->bi_pool = bs;
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
put_cpu();
- bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index ee746919c41f..9f2723b34b75 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -6,7 +6,7 @@
#ifndef _BLK_CGROUP_RWSTAT_H
#define _BLK_CGROUP_RWSTAT_H
-#include <linux/blk-cgroup.h>
+#include "blk-cgroup.h"
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 650f7e27989f..fa063c6c0338 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -23,15 +23,14 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
-#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
@@ -857,11 +856,11 @@ static void blkcg_fill_root_iostats(void)
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
+ unsigned long flags;
memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
struct disk_stats *cpu_dkstats;
- unsigned long flags;
cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
tmp.ios[BLKG_IOSTAT_READ] +=
@@ -877,11 +876,11 @@ static void blkcg_fill_root_iostats(void)
cpu_dkstats->sectors[STAT_WRITE] << 9;
tmp.bytes[BLKG_IOSTAT_DISCARD] +=
cpu_dkstats->sectors[STAT_DISCARD] << 9;
-
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
- blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}
@@ -1176,6 +1175,8 @@ int blkcg_init_queue(struct request_queue *q)
bool preloaded;
int ret;
+ INIT_LIST_HEAD(&q->blkg_list);
+
new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..3e91803c4a55
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_CGROUP_PRIVATE_H
+#define _BLK_CGROUP_PRIVATE_H
+/*
+ * block cgroup private header
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/blk-cgroup.h>
+
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
+#ifdef CONFIG_BLK_CGROUP
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q). This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
+ */
+struct blkg_policy_data {
+ /* the blkg and policy id this per-policy data belongs to */
+ struct blkcg_gq *blkg;
+ int plid;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
+ */
+struct blkcg_policy_data {
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
+ int plid;
+};
+
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
+ struct request_queue *q, struct blkcg *blkcg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
+ struct seq_file *s);
+
+struct blkcg_policy {
+ int plid;
+ /* cgroup files for the policy */
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
+
+ /* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
+ blkcg_pol_init_cpd_fn *cpd_init_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+ blkcg_pol_bind_cpd_fn *cpd_bind_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
+ blkcg_pol_init_pd_fn *pd_init_fn;
+ blkcg_pol_online_pd_fn *pd_online_fn;
+ blkcg_pol_offline_pd_fn *pd_offline_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
+ blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
+ blkcg_pol_stat_pd_fn *pd_stat_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern bool blkcg_debug_stats;
+
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+
+const char *blkg_dev_name(struct blkcg_gq *blkg);
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+ u64 (*prfill)(struct seq_file *,
+ struct blkg_policy_data *, int),
+ const struct blkcg_policy *pol, int data,
+ bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+
+struct blkg_conf_ctx {
+ struct block_device *bdev;
+ struct blkcg_gq *blkg;
+ char *body;
+};
+
+struct block_device *blkcg_conf_open_bdev(char **inputp);
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+ char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kthread_blkcg();
+ if (css)
+ return css;
+ return task_css(current, io_cgrp_id);
+}
+
+/**
+ * __bio_blkcg - internal, inconsistent version to get blkcg
+ *
+ * DO NOT USE.
+ * This function is inconsistent and consequently is dangerous to use. The
+ * first part of the function returns a blkcg where a reference is owned by the
+ * bio. This means it does not need to be rcu protected as it cannot go away
+ * with the bio owning a reference to it. However, the latter potentially gets
+ * it from task_css(). This can race against task migration and the cgroup
+ * dying. It is also semantically different as it must be called rcu protected
+ * and is susceptible to failure when trying to get a reference to it.
+ * Therefore, it is not ok to assume that *_get() will always succeed on the
+ * blkcg returned here.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
+{
+ if (bio && bio->bi_blkg)
+ return bio->bi_blkg->blkcg;
+ return css_to_blkcg(blkcg_css());
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg. The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio. Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+ return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state. If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q,
+ bool update_hint)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference(blkcg->blkg_hint);
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair. This function should be called
+ * under RCU read lock.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __blkg_lookup(blkcg, q, false);
+}
+
+/**
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{
+ return q->root_blkg;
+}
+
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol)
+{
+ return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+ struct blkcg_policy *pol)
+{
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data. Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+ return pd ? pd->blkg : NULL;
+}
+
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+ return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+ percpu_ref_get(&blkg->refcnt);
+}
+
+/**
+ * blkg_tryget - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
+{
+ return blkg && percpu_ref_tryget(&blkg->refcnt);
+}
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+ percpu_ref_put(&blkg->refcnt);
+}
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_CGROUP_PUNT)
+ return __blkcg_punt_bio_submit(bio);
+ else
+ return false;
+}
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
+ if (atomic_add_return(1, &blkg->use_delay) == 1)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ if (WARN_ON_ONCE(old < 0))
+ return 0;
+ if (old == 0)
+ return 0;
+
+ /*
+ * We do this song and dance because we can race with somebody else
+ * adding or removing delay. If we just did an atomic_dec we'd end up
+ * negative and we'd already be in trouble. We need to subtract 1 and
+ * then check to see if we were the last delay so we can drop the
+ * congestion count on the cgroup.
+ */
+ while (old) {
+ int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+ if (cur == old)
+ break;
+ old = cur;
+ }
+
+ if (old == 0)
+ return 0;
+ if (old == 1)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ return 1;
+}
+
+/**
+ * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
+ * @blkg: target blkg
+ * @delay: delay duration in nsecs
+ *
+ * When enabled with this function, the delay is not decayed and must be
+ * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
+ * blkcg_[un]use_delay() and blkcg_add_delay() usages.
+ */
+static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person setting the congestion count for this blkg. */
+ if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+
+ atomic64_set(&blkg->delay_nsec, delay);
+}
+
+/**
+ * blkcg_clear_delay - Disable allocator delay mechanism
+ * @blkg: target blkg
+ *
+ * Disable use_delay mechanism. See blkcg_set_delay().
+ */
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person clearing the congestion count for this blkg. */
+ if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+void blk_cgroup_bio_start(struct bio *bio);
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+#else /* CONFIG_BLK_CGROUP */
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_policy {
+};
+
+#ifdef CONFIG_BLOCK
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
+static inline void blk_cgroup_bio_start(struct bio *bio) { }
+
+#define blk_queue_for_each_rl(rl, q) \
+ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif /* _BLK_CGROUP_PRIVATE_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 1039515c99d6..3bb5a551bb90 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,7 +34,6 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
@@ -49,6 +48,7 @@
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
struct dentry *blk_debugfs_root;
@@ -164,6 +164,7 @@ static const struct {
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
+ [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
@@ -469,9 +470,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
-#ifdef CONFIG_BLK_CGROUP
- INIT_LIST_HEAD(&q->blkg_list);
-#endif
kobject_init(&q->kobj, &blk_queue_ktype);
@@ -536,17 +534,6 @@ bool blk_get_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_get_queue);
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
- char b[BDEVNAME_SIZE];
-
- pr_info_ratelimited("%s: attempt to access beyond end of device\n"
- "%s: rw=%d, want=%llu, limit=%llu\n",
- current->comm,
- bio_devname(bio, b), bio->bi_opf,
- bio_end_sector(bio), maxsector);
-}
-
#ifdef CONFIG_FAIL_MAKE_REQUEST
static DECLARE_FAULT_ATTR(fail_make_request);
@@ -576,14 +563,10 @@ late_initcall(fail_make_request_debugfs);
static inline bool bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
- char b[BDEVNAME_SIZE];
-
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
-
- WARN_ONCE(1,
- "Trying to write to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), bio->bi_bdev->bd_partno);
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
/* Older lvm-tools actually trigger this */
return false;
}
@@ -612,7 +595,11 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors && maxsector &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
- handle_bad_sector(bio, maxsector);
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+ "%pg: rw=%d, want=%llu, limit=%llu\n",
+ current->comm,
+ bio->bi_bdev, bio->bi_opf,
+ bio_end_sector(bio), maxsector);
return -EIO;
}
return 0;
@@ -672,7 +659,123 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_OK;
}
-noinline_for_stack bool submit_bio_checks(struct bio *bio)
+static void __submit_bio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ if (unlikely(!blk_crypto_bio_prep(&bio)))
+ return;
+
+ if (!disk->fops->submit_bio) {
+ blk_mq_submit_bio(bio);
+ } else if (likely(bio_queue_enter(bio) == 0)) {
+ disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ }
+}
+
+/*
+ * The loop in this function may be a bit non-obvious, and so deserves some
+ * explanation:
+ *
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
+ * that), so we have a list with a single bio.
+ * - We pretend that we have just taken it off a longer list, so we assign
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
+ * non-NULL value in bio_list and re-enter the loop from the top.
+ * - In this case we really did just take the bio of the top of the list (no
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
+ * again.
+ *
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
+ * bio_list_on_stack[1] contains bios that were submitted before the current
+ * ->submit_bio_bio, but that haven't been processed yet.
+ */
+static void __submit_bio_noacct(struct bio *bio)
+{
+ struct bio_list bio_list_on_stack[2];
+
+ BUG_ON(bio->bi_next);
+
+ bio_list_init(&bio_list_on_stack[0]);
+ current->bio_list = bio_list_on_stack;
+
+ do {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct bio_list lower, same;
+
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ bio_list_on_stack[1] = bio_list_on_stack[0];
+ bio_list_init(&bio_list_on_stack[0]);
+
+ __submit_bio(bio);
+
+ /*
+ * Sort new bios into those for a lower level and those for the
+ * same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+ if (q == bdev_get_queue(bio->bi_bdev))
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);
+
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&bio_list_on_stack[0], &lower);
+ bio_list_merge(&bio_list_on_stack[0], &same);
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+}
+
+static void __submit_bio_noacct_mq(struct bio *bio)
+{
+ struct bio_list bio_list[2] = { };
+
+ current->bio_list = bio_list;
+
+ do {
+ __submit_bio(bio);
+ } while ((bio = bio_list_pop(&bio_list[0])));
+
+ current->bio_list = NULL;
+}
+
+void submit_bio_noacct_nocheck(struct bio *bio)
+{
+ /*
+ * We only want one ->submit_bio to be active at a time, else stack
+ * usage with stacked devices could be a problem. Use current->bio_list
+ * to collect a list of requests submited by a ->submit_bio method while
+ * it is active, and then process them after it returned.
+ */
+ if (current->bio_list)
+ bio_list_add(&current->bio_list[0], bio);
+ else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
+ __submit_bio_noacct_mq(bio);
+ else
+ __submit_bio_noacct(bio);
+}
+
+/**
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
+ */
+void submit_bio_noacct(struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
struct request_queue *q = bdev_get_queue(bdev);
@@ -757,7 +860,7 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
}
if (blk_throtl_bio(bio))
- return false;
+ return;
blk_cgroup_bio_start(bio);
blkcg_bio_issue_init(bio);
@@ -769,138 +872,14 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
*/
bio_set_flag(bio, BIO_TRACE_COMPLETION);
}
- return true;
+ submit_bio_noacct_nocheck(bio);
+ return;
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
bio->bi_status = status;
bio_endio(bio);
- return false;
-}
-
-static void __submit_bio_fops(struct gendisk *disk, struct bio *bio)
-{
- if (blk_crypto_bio_prep(&bio)) {
- if (likely(bio_queue_enter(bio) == 0)) {
- disk->fops->submit_bio(bio);
- blk_queue_exit(disk->queue);
- }
- }
-}
-
-static void __submit_bio(struct bio *bio)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- if (unlikely(!submit_bio_checks(bio)))
- return;
-
- if (!disk->fops->submit_bio)
- blk_mq_submit_bio(bio);
- else
- __submit_bio_fops(disk, bio);
-}
-
-/*
- * The loop in this function may be a bit non-obvious, and so deserves some
- * explanation:
- *
- * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
- * that), so we have a list with a single bio.
- * - We pretend that we have just taken it off a longer list, so we assign
- * bio_list to a pointer to the bio_list_on_stack, thus initialising the
- * bio_list of new bios to be added. ->submit_bio() may indeed add some more
- * bios through a recursive call to submit_bio_noacct. If it did, we find a
- * non-NULL value in bio_list and re-enter the loop from the top.
- * - In this case we really did just take the bio of the top of the list (no
- * pretending) and so remove it from bio_list, and call into ->submit_bio()
- * again.
- *
- * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
- * bio_list_on_stack[1] contains bios that were submitted before the current
- * ->submit_bio_bio, but that haven't been processed yet.
- */
-static void __submit_bio_noacct(struct bio *bio)
-{
- struct bio_list bio_list_on_stack[2];
-
- BUG_ON(bio->bi_next);
-
- bio_list_init(&bio_list_on_stack[0]);
- current->bio_list = bio_list_on_stack;
-
- do {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct bio_list lower, same;
-
- /*
- * Create a fresh bio_list for all subordinate requests.
- */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
-
- __submit_bio(bio);
-
- /*
- * Sort new bios into those for a lower level and those for the
- * same level.
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bdev_get_queue(bio->bi_bdev))
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
-
- /*
- * Now assemble so we handle the lowest level first.
- */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
-
- current->bio_list = NULL;
-}
-
-static void __submit_bio_noacct_mq(struct bio *bio)
-{
- struct bio_list bio_list[2] = { };
-
- current->bio_list = bio_list;
-
- do {
- __submit_bio(bio);
- } while ((bio = bio_list_pop(&bio_list[0])));
-
- current->bio_list = NULL;
-}
-
-/**
- * submit_bio_noacct - re-submit a bio to the block device layer for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * This is a version of submit_bio() that shall only be used for I/O that is
- * resubmitted to lower level drivers by stacking block drivers. All file
- * systems and other upper level users of the block layer should use
- * submit_bio() instead.
- */
-void submit_bio_noacct(struct bio *bio)
-{
- /*
- * We only want one ->submit_bio to be active at a time, else stack
- * usage with stacked devices could be a problem. Use current->bio_list
- * to collect a list of requests submited by a ->submit_bio method while
- * it is active, and then process them after it returned.
- */
- if (current->bio_list)
- bio_list_add(&current->bio_list[0], bio);
- else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
- __submit_bio_noacct_mq(bio);
- else
- __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
@@ -985,8 +964,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
return 0;
@@ -1268,7 +1246,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
}
EXPORT_SYMBOL(blk_check_plugged);
-void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
+void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
if (!list_empty(&plug->cb_list))
flush_plug_callbacks(plug, from_schedule);
@@ -1297,7 +1275,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
void blk_finish_plug(struct blk_plug *plug)
{
if (plug == current->plug) {
- blk_flush_plug(plug, false);
+ __blk_flush_plug(plug, false);
current->plug = NULL;
}
}
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c87aba8584c6..18c8eafe20b9 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -10,7 +10,6 @@
#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
#include <crypto/skcipher.h>
-#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
@@ -20,6 +19,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
+#include "blk-cgroup.h"
#include "blk-crypto-internal.h"
static unsigned int num_prealloc_bounce_pg = 32;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 2fb0d65a464c..e6818ffaddbf 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -11,6 +11,7 @@
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
+ const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
unsigned int ivsize; /* iv size in bytes */
@@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+int blk_crypto_sysfs_register(struct request_queue *q);
+
+void blk_crypto_sysfs_unregister(struct request_queue *q);
+
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
{
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..fd93bd2f33b7
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto. This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+ struct kobject kobj;
+ struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+ return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+ return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+ &max_dun_bits_attr.attr,
+ &num_keyslots_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+ .attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes. To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */
+static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
+static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
+
+static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+ int mode_num = a - __blk_crypto_mode_attrs;
+
+ if (profile->modes_supported[mode_num])
+ return 0444;
+ return 0;
+}
+
+static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ int mode_num = attr - __blk_crypto_mode_attrs;
+
+ return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]);
+}
+
+static const struct attribute_group blk_crypto_modes_attr_group = {
+ .name = "modes",
+ .attrs = blk_crypto_mode_attrs,
+ .is_visible = blk_crypto_mode_is_visible,
+};
+
+static const struct attribute_group *blk_crypto_attr_groups[] = {
+ &blk_crypto_attr_group,
+ &blk_crypto_modes_attr_group,
+ NULL,
+};
+
+static ssize_t blk_crypto_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ return a->show(profile, a, page);
+}
+
+static const struct sysfs_ops blk_crypto_attr_ops = {
+ .show = blk_crypto_attr_show,
+};
+
+static void blk_crypto_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct blk_crypto_kobj, kobj));
+}
+
+static struct kobj_type blk_crypto_ktype = {
+ .default_groups = blk_crypto_attr_groups,
+ .sysfs_ops = &blk_crypto_attr_ops,
+ .release = blk_crypto_release,
+};
+
+/*
+ * If the request_queue has a blk_crypto_profile, create the "crypto"
+ * subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
+ */
+int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ struct blk_crypto_kobj *obj;
+ int err;
+
+ if (!q->crypto_profile)
+ return 0;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ obj->profile = q->crypto_profile;
+
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
+ "crypto");
+ if (err) {
+ kobject_put(&obj->kobj);
+ return err;
+ }
+ q->crypto_kobject = &obj->kobj;
+ return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct request_queue *q)
+{
+ kobject_put(q->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+ for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+ attr->attr.name = blk_crypto_modes[i].name;
+ attr->attr.mode = 0444;
+ attr->show = blk_crypto_mode_show;
+ blk_crypto_mode_attrs[i - 1] = &attr->attr;
+ }
+ return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index ec9efeeeca91..a496aaef85ba 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -19,16 +19,19 @@
const struct blk_crypto_mode blk_crypto_modes[] = {
[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
@@ -111,7 +114,6 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
*dst->bi_crypt_context = *src->bi_crypt_context;
return 0;
}
-EXPORT_SYMBOL_GPL(__bio_crypt_clone);
/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e4df894189ce..c68968724870 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -460,9 +460,7 @@ int blkdev_issue_flush(struct block_device *bdev)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 769b64394298..70a0a3d680a3 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -178,12 +178,12 @@
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
-#include <linux/blk-cgroup.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#ifdef CONFIG_TRACEPOINTS
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 6593c7123b97..010e658d44a8 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -74,9 +74,9 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
+#include "blk-cgroup.h"
#include "blk.h"
#define DEFAULT_SCALE_COOKIE 1000000U
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 2e7f10e1c03f..79e797f5d194 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -12,11 +12,11 @@
* Documentation/admin-guide/cgroup-v2.rst.
*/
-#include <linux/blk-cgroup.h>
#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-rq-qos.h"
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..fc6ea52e7482 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,19 +10,6 @@
#include "blk.h"
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
-{
- struct bio *new = bio_alloc(gfp, nr_pages);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
- }
-
- return new;
-}
-EXPORT_SYMBOL_GPL(blk_next_bio);
-
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop)
@@ -32,9 +19,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
unsigned int op;
sector_t bs_mask, part_offset = 0;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
@@ -95,11 +79,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, op, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, op, 0);
-
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
@@ -172,9 +153,6 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
struct bio *bio = *biop;
sector_t bs_mask;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
@@ -189,14 +167,12 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
max_write_same_sectors = bio_allowed_max_sectors(q);
while (nr_sects) {
- bio = blk_next_bio(bio, 1, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 1, REQ_OP_WRITE_SAME, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
if (nr_sects > max_write_same_sectors) {
bio->bi_iter.bi_size = max_write_same_sectors << 9;
@@ -250,10 +226,6 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
- struct request_queue *q = bdev_get_queue(bdev);
-
- if (!q)
- return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
@@ -265,10 +237,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return -EOPNOTSUPP;
while (nr_sects) {
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
@@ -304,23 +274,17 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
- struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
- bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
- gfp_mask);
+ bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects),
+ REQ_OP_WRITE, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4de34a332c9f..f5255991b773 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -368,8 +368,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
-
- blk_throtl_charge_bio_split(*bio);
}
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 845f74e8dd7b..0fd409b8e86e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -107,7 +107,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return BLK_MQ_NO_TAG;
if (data->shallow_depth)
- return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ return sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
return __sbitmap_queue_get(bt);
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d69ca91fbc8b..f1b067d06ab5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -793,8 +793,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)))
+ !(req->rq_flags & RQF_QUIET))) {
blk_print_req_error(req, error);
+ trace_block_rq_error(req, error, nr_bytes);
+ }
blk_account_io_completion(req, nr_bytes);
@@ -2182,6 +2184,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
if (blk_mq_hctx_stopped(hctx))
continue;
/*
+ * If there is already a run_work pending, leave the
+ * pending delay untouched. Otherwise, a hctx can stall
+ * if another hctx is re-delaying the other's work
+ * before the work executes.
+ */
+ if (delayed_work_pending(&hctx->run_work))
+ continue;
+ /*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
@@ -2790,9 +2800,6 @@ void blk_mq_submit_bio(struct bio *bio)
unsigned int nr_segs = 1;
blk_status_t ret;
- if (unlikely(!blk_crypto_bio_prep(&bio)))
- return;
-
blk_queue_bounce(q, &bio);
if (blk_may_split(q, bio))
__blk_queue_split(q, &bio, &nr_segs);
@@ -2842,27 +2849,16 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_try_issue_directly(rq->mq_hctx, rq));
}
+#ifdef CONFIG_BLK_MQ_STACKING
/**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
- *
- * Request stacking drivers like request-based dm may change the queue
- * limits when retrying requests on other queues. Those requests need
- * to be checked against the new queue limits again during dispatch.
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @rq: the request being queued
*/
-static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
- struct request *rq)
+blk_status_t blk_insert_cloned_request(struct request *rq)
{
+ struct request_queue *q = rq->q;
unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ blk_status_t ret;
if (blk_rq_sectors(rq) > max_sectors) {
/*
@@ -2894,24 +2890,7 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
return BLK_STS_IOERR;
}
- return BLK_STS_OK;
-}
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
- */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
-{
- blk_status_t ret;
-
- ret = blk_cloned_rq_check_limits(q, rq);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (rq->q->disk &&
- should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq)))
+ if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
if (blk_crypto_insert_cloned_request(rq))
@@ -2924,7 +2903,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_run_dispatch_ops(rq->q,
+ blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
blk_account_io_done(rq, ktime_get_ns());
@@ -2979,10 +2958,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_fast(bio_src, gfp_mask, bs);
+ bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
+ bs);
if (!bio)
goto free_and_out;
- bio->bi_bdev = rq->q->disk->part0;
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
@@ -3019,6 +2998,7 @@ free_and_out:
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+#endif /* CONFIG_BLK_MQ_STACKING */
/*
* Steal bios from a request and add them to a bio list.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9f32882ceb2f..241ded62f458 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -10,7 +10,6 @@
#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
#include "blk.h"
@@ -18,6 +17,7 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
struct queue_sysfs_entry {
@@ -880,6 +880,10 @@ int blk_register_queue(struct gendisk *disk)
goto put_dev;
}
+ ret = blk_crypto_sysfs_register(q);
+ if (ret)
+ goto put_dev;
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
@@ -910,6 +914,7 @@ unlock:
return ret;
put_dev:
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
@@ -954,16 +959,18 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ blk_crypto_sysfs_unregister(q);
blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- elv_unregister_queue(q);
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
+
+ /* Now that we've deleted all child objects, we can delete the queue. */
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+
mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7c462c006b26..a3b3ebc72dd4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,7 +10,6 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
@@ -42,11 +41,6 @@
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
-};
-
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
/* We measure latency for request size from <= 4k to >= 1M */
@@ -426,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
struct throtl_data *td = tg->td;
int rw;
+ int has_iops_limit = 0;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ unsigned int iops_limit = tg_iops_limit(tg, rw);
- for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
(td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX ||
- tg_iops_limit(tg, rw) != UINT_MAX));
+ iops_limit != UINT_MAX));
+
+ if (iops_limit != UINT_MAX)
+ has_iops_limit = 1;
+ }
+
+ if (has_iops_limit)
+ tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
+ else
+ tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
}
static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -634,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -659,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -808,7 +810,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- if (bps_limit == U64_MAX) {
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
if (wait)
*wait = 0;
return true;
@@ -893,9 +896,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (iops_limit != UINT_MAX)
- tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
-
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
@@ -920,9 +920,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio_size;
+ if (!bio_flagged(bio, BIO_THROTTLED)) {
+ tg->bytes_disp[rw] += bio_size;
+ tg->last_bytes_disp[rw] += bio_size;
+ }
+
tg->io_disp[rw]++;
- tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
/*
@@ -1219,7 +1222,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bio_list_on_stack)))
- submit_bio_noacct(bio);
+ submit_bio_noacct_nocheck(bio);
blk_finish_plug(&plug);
}
}
@@ -1917,14 +1920,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
}
if (tg->iops[READ][LIMIT_LOW]) {
- tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
- tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2043,25 +2044,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
-void blk_throtl_charge_bio_split(struct bio *bio)
-{
- struct blkcg_gq *blkg = bio->bi_blkg;
- struct throtl_grp *parent = blkg_to_tg(blkg);
- struct throtl_service_queue *parent_sq;
- bool rw = bio_data_dir(bio);
-
- do {
- if (!parent->has_rules[rw])
- break;
-
- atomic_inc(&parent->io_split_cnt[rw]);
- atomic_inc(&parent->last_io_split_cnt[rw]);
-
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- } while (parent);
-}
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 175f03abd9e4..b23a9f3abb82 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -52,6 +52,12 @@ struct throtl_service_queue {
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */
+};
+
enum {
LIMIT_LOW,
LIMIT_MAX,
@@ -132,9 +138,6 @@ struct throtl_grp {
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
- atomic_t io_split_cnt[2];
- atomic_t last_io_split_cnt[2];
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -158,20 +161,21 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
-void blk_throtl_charge_bio_split(struct bio *bio);
bool __blk_throtl_bio(struct bio *bio);
static inline bool blk_throtl_bio(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
- if (bio_flagged(bio, BIO_THROTTLED))
+ /* no need to throttle bps any more if the bio has been throttled */
+ if (bio_flagged(bio, BIO_THROTTLED) &&
+ !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
return false;
+
if (!tg->has_rules[bio_data_dir(bio)])
return false;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 774ecc598bee..602bef54c813 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -215,9 +215,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
continue;
}
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
+ gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -239,10 +238,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
-
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
return submit_bio_wait(&bio);
}
@@ -306,9 +302,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
}
while (sector < end_sector) {
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = op | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
diff --git a/block/blk.h b/block/blk.h
index 8bd43b3ad33d..ebaa59ca46ca 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -46,7 +46,7 @@ void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
-bool submit_bio_checks(struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -406,8 +406,6 @@ extern int blk_iolatency_init(struct request_queue *q);
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
-
#ifdef CONFIG_BLK_DEV_ZONED
void blk_queue_free_zone_bitmaps(struct request_queue *q);
void blk_queue_clear_zone_settings(struct request_queue *q);
@@ -426,6 +424,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
+void blk_drop_partitions(struct gendisk *disk);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
@@ -445,6 +444,9 @@ int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
+void disk_block_events(struct gendisk *disk);
+void disk_unblock_events(struct gendisk *disk);
+void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
diff --git a/block/bounce.c b/block/bounce.c
index 7af1a72835b9..3d50d19cde72 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -14,7 +14,6 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
-#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
@@ -162,15 +162,12 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
* that does not own the bio - reason being drivers don't use it for
* iterating over the biovec anymore, so expecting it to be kept up
* to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
+ * asking for trouble and would force extra work.
*/
- bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
- &bounce_bio_set);
- bio->bi_bdev = bio_src->bi_bdev;
+ bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
+ bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
diff --git a/block/disk-events.c b/block/disk-events.c
index 8d5496e7592a..aee25a7e1ab7 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -4,7 +4,7 @@
*/
#include <linux/export.h>
#include <linux/moduleparam.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "blk.h"
struct disk_events {
diff --git a/block/elevator.c b/block/elevator.c
index 482df2a350fc..9a9e52374e27 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,7 +35,6 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -44,6 +43,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -516,9 +516,11 @@ int elv_register_queue(struct request_queue *q, bool uevent)
void elv_unregister_queue(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
lockdep_assert_held(&q->sysfs_lock);
- if (q) {
+ if (e && e->registered) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -591,9 +593,7 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
- if (q->elevator->registered)
- elv_unregister_queue(q);
-
+ elv_unregister_queue(q);
ioc_clear_queue(q);
blk_mq_sched_free_rqs(q);
elevator_exit(q);
diff --git a/block/fops.c b/block/fops.c
index a18e7fbd97b8..7ccc4ff109ce 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -75,8 +75,13 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return -ENOMEM;
}
- bio_init(&bio, vecs, nr_pages);
- bio_set_dev(&bio, bdev);
+ if (iov_iter_rw(iter) == READ) {
+ bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
+ if (iter_is_iovec(iter))
+ should_dirty = true;
+ } else {
+ bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
+ }
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
@@ -88,14 +93,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
goto out;
ret = bio.bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio.bi_opf = REQ_OP_READ;
- if (iter_is_iovec(iter))
- should_dirty = true;
- } else {
- bio.bi_opf = dio_bio_write_op(iocb);
+ if (iov_iter_rw(iter) == WRITE)
task_io_account_write(ret);
- }
+
if (iocb->ki_flags & IOCB_NOWAIT)
bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
@@ -190,6 +190,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
int ret = 0;
@@ -197,7 +198,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
atomic_set(&dio->ref, 1);
@@ -223,7 +224,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
for (;;) {
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
@@ -238,11 +238,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (is_read) {
- bio->bi_opf = REQ_OP_READ;
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -258,7 +256,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
atomic_inc(&dio->ref);
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
}
blk_finish_plug(&plug);
@@ -313,6 +311,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
+ bool is_read = iov_iter_rw(iter) == READ;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
@@ -322,11 +322,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
@@ -349,14 +348,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
}
dio->size = bio->bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio->bi_opf = REQ_OP_READ;
+ if (is_read) {
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
diff --git a/block/genhd.c b/block/genhd.c
index 9eca1f7d35c9..11c761afd64f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,7 +8,6 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -185,7 +184,9 @@ static struct blk_major_name {
struct blk_major_name *next;
int major;
char name[16];
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void (*probe)(dev_t devt);
+#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
static DEFINE_MUTEX(major_names_lock);
static DEFINE_SPINLOCK(major_names_spinlock);
@@ -275,7 +276,9 @@ int __register_blkdev(unsigned int major, const char *name,
}
p->major = major;
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
p->probe = probe;
+#endif
strlcpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
@@ -523,6 +526,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
disk_update_readahead(disk);
disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -693,6 +697,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
return badblocks_store(disk->bb, page, len, 0);
}
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void blk_request_module(dev_t devt)
{
unsigned int major = MAJOR(devt);
@@ -712,6 +717,7 @@ void blk_request_module(dev_t devt)
/* Make old-style 2.4 aliases work */
request_module("block-major-%d", MAJOR(devt));
}
+#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
/*
* print a full list of all partitions - intended for places where the root
@@ -927,12 +933,17 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- part_stat_read_all(bdev, &stat);
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
else
inflight = part_in_flight(bdev);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(bdev, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(bdev, &stat);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -1128,6 +1139,10 @@ static void disk_release(struct device *dev)
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
+
+ if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
+ disk->fops->free_disk(disk);
+
iput(disk->part0->bd_inode); /* frees the disk */
}
@@ -1188,12 +1203,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- part_stat_read_all(hd, &stat);
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
inflight = part_in_flight(hd);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(hd, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(hd, &stat);
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
diff --git a/block/holder.c b/block/holder.c
index 27cddce1b446..8d750281a1cd 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
struct bd_holder_disk {
diff --git a/block/partitions/check.h b/block/partitions/check.h
index d5b28e309d64..4ffa2359b1a3 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
-#include <linux/genhd.h>
#include "../blk.h"
/*
diff --git a/block/partitions/core.c b/block/partitions/core.c
index c2a1635922b1..2ef8dfa1e5c8 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -8,7 +8,6 @@
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/vmalloc.h>
#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index 8cc2b88d0aa8..84b9f36b9e47 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -13,7 +13,6 @@
#include <linux/types.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index 8693704dcf5e..0a747a0c782d 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -14,7 +14,6 @@
#include <linux/types.h>
#include <linux/list.h>
-#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
diff --git a/block/sed-opal.c b/block/sed-opal.c
index daafadbb88ca..9700197000f2 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -13,7 +13,7 @@
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>