Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                |  10
-rw-r--r--  block/Makefile               |   8
-rw-r--r--  block/badblocks.c            |  53
-rw-r--r--  block/bdev.c                 |  86
-rw-r--r--  block/bfq-iosched.c          |  59
-rw-r--r--  block/bfq-iosched.h          |   3
-rw-r--r--  block/bio-integrity-auto.c   | 223
-rw-r--r--  block/bio-integrity.c        | 287
-rw-r--r--  block/bio.c                  | 166
-rw-r--r--  block/blk-cgroup.c           |  93
-rw-r--r--  block/blk-cgroup.h           |   2
-rw-r--r--  block/blk-core.c             |  13
-rw-r--r--  block/blk-crypto-fallback.c  |   8
-rw-r--r--  block/blk-crypto-internal.h  |  10
-rw-r--r--  block/blk-crypto-profile.c   | 105
-rw-r--r--  block/blk-crypto-sysfs.c     |  35
-rw-r--r--  block/blk-crypto.c           | 204
-rw-r--r--  block/blk-flush.c            |  10
-rw-r--r--  block/blk-integrity.c        |   7
-rw-r--r--  block/blk-iocost.c           |  23
-rw-r--r--  block/blk-iolatency.c        |   3
-rw-r--r--  block/blk-ioprio.c           |  23
-rw-r--r--  block/blk-map.c              |  93
-rw-r--r--  block/blk-merge.c            | 163
-rw-r--r--  block/blk-mq-debugfs.c       |  55
-rw-r--r--  block/blk-mq-dma.c           | 116
-rw-r--r--  block/blk-mq-sched.c         | 248
-rw-r--r--  block/blk-mq-sched.h         |  12
-rw-r--r--  block/blk-mq-sysfs.c         |   4
-rw-r--r--  block/blk-mq-tag.c           |   3
-rw-r--r--  block/blk-mq.c               | 358
-rw-r--r--  block/blk-mq.h               |  11
-rw-r--r--  block/blk-rq-qos.c           |  86
-rw-r--r--  block/blk-rq-qos.h           |  48
-rw-r--r--  block/blk-settings.c         |  31
-rw-r--r--  block/blk-stat.c             |   4
-rw-r--r--  block/blk-stat.h             |   2
-rw-r--r--  block/blk-sysfs.c            | 311
-rw-r--r--  block/blk-throttle.c         | 467
-rw-r--r--  block/blk-throttle.h         |  40
-rw-r--r--  block/blk-wbt.c              |  28
-rw-r--r--  block/blk-zoned.c            |  44
-rw-r--r--  block/blk.h                  |  61
-rw-r--r--  block/bounce.c               | 269
-rw-r--r--  block/bsg-lib.c              |   2
-rw-r--r--  block/elevator.c             | 377
-rw-r--r--  block/elevator.h             |  24
-rw-r--r--  block/fops.c                 |  59
-rw-r--r--  block/genhd.c                | 269
-rw-r--r--  block/ioctl.c                |  11
-rw-r--r--  block/ioprio.c               |   6
-rw-r--r--  block/kyber-iosched.c        |  24
-rw-r--r--  block/mq-deadline.c          |  32
-rw-r--r--  block/partitions/sgi.c       |   2
-rw-r--r--  block/partitions/sun.c       |   2
-rw-r--r--  block/t10-pi.c               |   8
56 files changed, 2724 insertions, 1977 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 5b623b876d3b..15027963472d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -63,7 +63,7 @@ config BLK_DEV_BSGLIB
config BLK_DEV_INTEGRITY
bool "Block layer data integrity support"
select CRC_T10DIF
- select CRC64_ROCKSOFT
+ select CRC64
help
Some storage devices allow extra information to be
stored/retrieved to help protect the data. The block layer
@@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
source "block/partitions/Kconfig"
-config BLK_MQ_PCI
- def_bool PCI
-
-config BLK_MQ_VIRTIO
- bool
- depends on VIRTIO
- default y
-
config BLK_PM
def_bool PM
diff --git a/block/Makefile b/block/Makefile
index 33748123710b..c65f4da93702 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,13 +5,12 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-merge.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+ blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
+ blk-mq-tag.o blk-mq-dma.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
-obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
@@ -26,7 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \
+ bio-integrity-auto.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
diff --git a/block/badblocks.c b/block/badblocks.c
index dc147c017961..ece64e76fe8f 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -905,39 +905,35 @@ re_insert:
goto update_sectors;
}
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ hint = prev;
+ goto update_sectors;
+ }
+
if (overlap_front(bb, prev, &bad)) {
- if (can_merge_front(bb, prev, &bad)) {
- len = front_merge(bb, prev, &bad);
- added++;
- } else {
- int extra = 0;
+ int extra = 0;
- if (!can_front_overwrite(bb, prev, &bad, &extra)) {
- if (extra > 0)
- goto out;
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ if (extra > 0)
+ goto out;
- len = min_t(sector_t,
- BB_END(p[prev]) - s, sectors);
- hint = prev;
- goto update_sectors;
- }
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
+ goto update_sectors;
+ }
- len = front_overwrite(bb, prev, &bad, extra);
- added++;
- bb->count += extra;
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
- if (can_combine_front(bb, prev, &bad)) {
- front_combine(bb, prev);
- bb->count--;
- }
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
}
- hint = prev;
- goto update_sectors;
- }
- if (can_merge_front(bb, prev, &bad)) {
- len = front_merge(bb, prev, &bad);
- added++;
hint = prev;
goto update_sectors;
}
@@ -1246,14 +1242,15 @@ re_check:
len = sectors;
update_sectors:
+ /* This situation should never happen */
+ WARN_ON(sectors < len);
+
s += len;
sectors -= len;
if (sectors > 0)
goto re_check;
- WARN_ON(sectors < 0);
-
if (unacked_badblocks > 0)
rv = -1;
else if (acked_badblocks > 0)
diff --git a/block/bdev.c b/block/bdev.c
index 9d73a8fbf7f9..b77ddd12dc06 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -148,28 +148,69 @@ static void set_init_blocksize(struct block_device *bdev)
bsize <<= 1;
}
BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
+ mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
+ get_order(bsize));
}
-int set_blocksize(struct file *file, int size)
+/**
+ * bdev_validate_blocksize - check that this block size is acceptable
+ * @bdev: blockdevice to check
+ * @block_size: block size to check
+ *
+ * For block device users that do not use buffer heads or the block device
+ * page cache, make sure that this block size can be used with the device.
+ *
+ * Return: On success zero is returned, negative error code on failure.
+ */
+int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
- struct inode *inode = file->f_mapping->host;
- struct block_device *bdev = I_BDEV(inode);
-
- if (blk_validate_block_size(size))
+ if (blk_validate_block_size(block_size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
- if (size < bdev_logical_block_size(bdev))
+ if (block_size < bdev_logical_block_size(bdev))
return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_validate_blocksize);
+
+int set_blocksize(struct file *file, int size)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct block_device *bdev = I_BDEV(inode);
+ int ret;
+
+ ret = bdev_validate_blocksize(bdev, size);
+ if (ret)
+ return ret;
+
if (!file->private_data)
return -EINVAL;
/* Don't change the size if it is same as current */
if (inode->i_blkbits != blksize_bits(size)) {
+ /*
+ * Flush and truncate the pagecache before we reconfigure the
+ * mapping geometry because folio sizes are variable now. If a
+ * reader has already allocated a folio whose size is smaller
+ * than the new min_order but invokes readahead after the new
+ * min_order becomes visible, readahead will think there are
+ * "zero" blocks per folio and crash. Take the inode and
+ * invalidation locks to avoid racing with
+ * read/write/fallocate.
+ */
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
sync_blockdev(bdev);
+ kill_bdev(bdev);
+
inode->i_blkbits = blksize_bits(size);
+ mapping_set_folio_min_order(inode->i_mapping, get_order(size));
kill_bdev(bdev);
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
}
return 0;
}
@@ -178,10 +219,11 @@ EXPORT_SYMBOL(set_blocksize);
int sb_set_blocksize(struct super_block *sb, int size)
{
+ if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
+ return 0;
if (set_blocksize(sb->s_bdev_file, size))
return 0;
- /* If we get here, we know size is power of two
- * and it's value is between 512 and PAGE_SIZE */
+ /* If we get here, we know size is validated */
sb->s_blocksize = size;
sb->s_blocksize_bits = blksize_bits(size);
return sb->s_blocksize;
@@ -773,13 +815,13 @@ static void blkdev_put_part(struct block_device *part)
blkdev_put_whole(whole);
}
-struct block_device *blkdev_get_no_open(dev_t dev)
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
+ if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (inode)
@@ -1001,7 +1043,7 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
return ERR_PTR(ret);
- bdev = blkdev_get_no_open(dev);
+ bdev = blkdev_get_no_open(dev, true);
if (!bdev)
return ERR_PTR(-ENXIO);
@@ -1268,24 +1310,17 @@ void sync_bdevs(bool wait)
/*
* Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
*/
-void bdev_statx(struct path *path, struct kstat *stat,
- u32 request_mask)
+void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
- struct inode *backing_inode;
struct block_device *bdev;
- if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC)))
- return;
-
- backing_inode = d_backing_inode(path->dentry);
-
/*
- * Note that backing_inode is the inode of a block device node file,
- * not the block device's internal inode. Therefore it is *not* valid
- * to use I_BDEV() here; the block device has to be looked up by i_rdev
+ * Note that d_backing_inode() returns the block device node inode, not
+ * the block device's internal inode. Therefore it is *not* valid to
+ * use I_BDEV() here; the block device has to be looked up by i_rdev
* instead.
*/
- bdev = blkdev_get_no_open(backing_inode->i_rdev);
+ bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
if (!bdev)
return;
@@ -1300,9 +1335,12 @@ void bdev_statx(struct path *path, struct kstat *stat,
generic_fill_statx_atomic_writes(stat,
queue_atomic_write_unit_min_bytes(bd_queue),
- queue_atomic_write_unit_max_bytes(bd_queue));
+ queue_atomic_write_unit_max_bytes(bd_queue),
+ 0);
}
+ stat->blksize = bdev_io_min(bdev);
+
blkdev_put_no_open(bdev);
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 167542201603..9483c529c958 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -701,17 +701,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- int depth;
- unsigned limit = data->q->nr_requests;
- unsigned int act_idx;
+ unsigned int limit, act_idx;
/* Sync reads have full depth available */
- if (op_is_sync(opf) && !op_is_write(opf)) {
- depth = 0;
- } else {
- depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
- limit = (limit * depth) >> bfqd->full_depth_shift;
- }
+ if (op_is_sync(opf) && !op_is_write(opf))
+ limit = data->q->nr_requests;
+ else
+ limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
/* Fast path to check if bfqq is already allocated. */
@@ -725,14 +721,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* available requests and thus starve other entities.
*/
if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
- depth = 1;
+ limit = 1;
break;
}
}
+
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
- if (depth)
- data->shallow_depth = depth;
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
+
+ if (limit < data->q->nr_requests)
+ data->shallow_depth = limit;
}
static struct bfq_queue *
@@ -7128,9 +7126,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
*/
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int depth = 1U << bt->sb.shift;
+ unsigned int nr_requests = bfqd->queue->nr_requests;
- bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -7142,13 +7139,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max(depth >> 1, 1U);
+ bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
+ bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -7158,9 +7155,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
+ bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
+ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
}
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
@@ -7210,8 +7207,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
- clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
- wbt_enable_default(bfqd->queue->disk);
+ blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue);
+ set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags);
kfree(bfqd);
}
@@ -7232,22 +7229,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
-static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
{
struct bfq_data *bfqd;
- struct elevator_queue *eq;
unsigned int i;
struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
-
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
- if (!bfqd) {
- kobject_put(&eq->kobj);
+ if (!bfqd)
return -ENOMEM;
- }
+
eq->elevator_data = bfqd;
spin_lock_irq(&q->queue_lock);
@@ -7315,9 +7306,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
INIT_LIST_HEAD(&bfqd->dispatch);
- hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+ hrtimer_setup(&bfqd->idle_slice_timer, bfq_idle_slice_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
bfqd->queue_weights_tree = RB_ROOT_CACHED;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -7398,7 +7388,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
- set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
+ blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
wbt_disable_default(q->disk);
blk_stat_enable_accounting(q);
@@ -7406,7 +7396,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
out_free:
kfree(bfqd);
- kobject_put(&eq->kobj);
return -ENOMEM;
}
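
bfq_update_depths() now derives its limits from the queue's nr_requests instead of a single sbitmap word, so the resulting async_depths scale with the configured queue depth. For instance, with nr_requests = 256 the formulas above yield:

	async_depths[0][0] = 256 >> 1       = 128   (50% of tags for async I/O)
	async_depths[0][1] = (256 * 3) >> 2 = 192   (75% for sync writes)
	async_depths[1][0] = (256 * 3) >> 4 =  48   (~18% while weight-raising)
	async_depths[1][1] = (256 * 6) >> 4 =  96   (~37% while weight-raising)

Sync reads are still allowed the full nr_requests, and a queue found over its per-actuator allocation in bfq_limit_depth() is clamped to a depth of 1.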
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 687a3a7ba784..31217f196f4f 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -813,8 +813,7 @@ struct bfq_data {
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
- unsigned int word_depths[2][2];
- unsigned int full_depth_shift;
+ unsigned int async_depths[2][2];
/*
* Number of independent actuators. This is equal to 1 in
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
new file mode 100644
index 000000000000..9c6657664792
--- /dev/null
+++ b/block/bio-integrity-auto.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * Automatically generate and verify integrity data on PI capable devices if the
+ * bio submitter didn't provide PI itself. This ensures that kernel verifies
+ * data integrity even if the file system (or other user of the block device) is
+ * not aware of PI.
+ */
+#include <linux/blk-integrity.h>
+#include <linux/t10-pi.h>
+#include <linux/workqueue.h>
+#include "blk.h"
+
+struct bio_integrity_data {
+ struct bio *bio;
+ struct bvec_iter saved_bio_iter;
+ struct work_struct work;
+ struct bio_integrity_payload bip;
+ struct bio_vec bvec;
+};
+
+static struct kmem_cache *bid_slab;
+static mempool_t bid_pool;
+static struct workqueue_struct *kintegrityd_wq;
+
+static void bio_integrity_finish(struct bio_integrity_data *bid)
+{
+ bid->bio->bi_integrity = NULL;
+ bid->bio->bi_opf &= ~REQ_INTEGRITY;
+ kfree(bvec_virt(bid->bip.bip_vec));
+ mempool_free(bid, &bid_pool);
+}
+
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_data *bid =
+ container_of(work, struct bio_integrity_data, work);
+ struct bio *bio = bid->bio;
+
+ blk_integrity_verify_iter(bio, &bid->saved_bio_iter);
+ bio_integrity_finish(bid);
+ bio_endio(bio);
+}
+
+#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+static bool bip_should_check(struct bio_integrity_payload *bip)
+{
+ return bip->bip_flags & BIP_CHECK_FLAGS;
+}
+
+static bool bi_offload_capable(struct blk_integrity *bi)
+{
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ default:
+ pr_warn_once("%s: unknown integrity checksum type:%d\n",
+ __func__, bi->csum_type);
+ fallthrough;
+ case BLK_INTEGRITY_CSUM_NONE:
+ return false;
+ }
+}
+
+/**
+ * __bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ *
+ * Normally I/O completion is done in interrupt context. However, verifying I/O
+ * integrity is a time-consuming task which must be run in process context.
+ *
+ * This function postpones completion accordingly.
+ */
+bool __bio_integrity_endio(struct bio *bio)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct bio_integrity_data *bid =
+ container_of(bip, struct bio_integrity_data, bip);
+
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
+ bip_should_check(bip)) {
+ INIT_WORK(&bid->work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bid->work);
+ return false;
+ }
+
+ bio_integrity_finish(bid);
+ return true;
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Checks if the bio already has an integrity payload attached. If it does, the
+ * payload has been generated by another kernel subsystem, and we just pass it
+ * through.
+ * Otherwise allocates integrity payload and for writes the integrity metadata
+ * will be generated. For reads, the completion handler will verify the
+ * metadata.
+ */
+bool bio_integrity_prep(struct bio *bio)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_data *bid;
+ bool set_flags = true;
+ gfp_t gfp = GFP_NOIO;
+ unsigned int len;
+ void *buf;
+
+ if (!bi)
+ return true;
+
+ if (!bio_sectors(bio))
+ return true;
+
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return true;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ if (bi->flags & BLK_INTEGRITY_NOVERIFY) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ }
+ break;
+ case REQ_OP_WRITE:
+ /*
+ * Zero the memory allocated to not leak uninitialized kernel
+ * memory to disk for non-integrity metadata where nothing else
+ * initializes the memory.
+ */
+ if (bi->flags & BLK_INTEGRITY_NOGENERATE) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ gfp |= __GFP_ZERO;
+ } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+ gfp |= __GFP_ZERO;
+ break;
+ default:
+ return true;
+ }
+
+ if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
+ return true;
+
+ /* Allocate kernel buffer for protection data */
+ len = bio_integrity_bytes(bi, bio_sectors(bio));
+ buf = kmalloc(len, gfp);
+ if (!buf)
+ goto err_end_io;
+ bid = mempool_alloc(&bid_pool, GFP_NOIO);
+ if (!bid)
+ goto err_free_buf;
+ bio_integrity_init(bio, &bid->bip, &bid->bvec, 1);
+
+ bid->bio = bio;
+
+ bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
+ bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
+
+ if (set_flags) {
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
+ bid->bip.bip_flags |= BIP_IP_CHECKSUM;
+ if (bi->csum_type)
+ bid->bip.bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ }
+
+ if (bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf)) < len)
+ goto err_end_io;
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
+ blk_integrity_generate(bio);
+ else
+ bid->saved_bio_iter = bio->bi_iter;
+ return true;
+
+err_free_buf:
+ kfree(buf);
+err_end_io:
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ return false;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+void blk_flush_integrity(void)
+{
+ flush_workqueue(kintegrityd_wq);
+}
+
+static int __init blk_integrity_auto_init(void)
+{
+ bid_slab = kmem_cache_create("bio_integrity_data",
+ sizeof(struct bio_integrity_data), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+ if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab))
+ panic("bio: can't create integrity pool\n");
+
+ /*
+ * kintegrityd won't block much but may burn a lot of CPU cycles.
+ * Make it highpri CPU intensive wq with max concurrency of 1.
+ */
+ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+ return 0;
+}
+subsys_initcall(blk_integrity_auto_init);
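
bio_integrity_prep() sizes the protection buffer with bio_integrity_bytes(), one tuple per protection interval covered by the bio. For instance, assuming a 512-byte protection interval and the 8-byte struct t10_pi_tuple, a 4 KiB bio (eight sectors) gets a 64-byte kmalloc() buffer attached as its single integrity vector; with a 4096-byte interval the same bio needs only 8 bytes, and the 16-byte struct crc64_pi_tuple doubles both figures.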
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 03e8fae17393..10912988c8f5 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -7,20 +7,12 @@
*/
#include <linux/blk-integrity.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
#include "blk.h"
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
-
-void blk_flush_integrity(void)
-{
- flush_workqueue(kintegrityd_wq);
-}
+struct bio_integrity_alloc {
+ struct bio_integrity_payload bip;
+ struct bio_vec bvecs[];
+};
/**
* bio_integrity_free - Free bio integrity payload
@@ -30,21 +22,23 @@ void blk_flush_integrity(void)
*/
void bio_integrity_free(struct bio *bio)
{
- struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_set *bs = bio->bi_pool;
-
- if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
- if (bip->bip_vec)
- bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_max_vcnt);
- mempool_free(bip, &bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
+ kfree(bio_integrity(bio));
bio->bi_integrity = NULL;
bio->bi_opf &= ~REQ_INTEGRITY;
}
+void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip,
+ struct bio_vec *bvecs, unsigned int nr_vecs)
+{
+ memset(bip, 0, sizeof(*bip));
+ bip->bip_max_vcnt = nr_vecs;
+ if (nr_vecs)
+ bip->bip_vec = bvecs;
+
+ bio->bi_integrity = bip;
+ bio->bi_opf |= REQ_INTEGRITY;
+}
+
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
@@ -59,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
gfp_t gfp_mask,
unsigned int nr_vecs)
{
- struct bio_integrity_payload *bip;
- struct bio_set *bs = bio->bi_pool;
- unsigned inline_vecs;
+ struct bio_integrity_alloc *bia;
if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
return ERR_PTR(-EOPNOTSUPP);
- if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
- bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
- inline_vecs = nr_vecs;
- } else {
- bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIO_INLINE_VECS;
- }
-
- if (unlikely(!bip))
+ bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask);
+ if (unlikely(!bia))
return ERR_PTR(-ENOMEM);
-
- memset(bip, 0, sizeof(*bip));
-
- /* always report as many vecs as asked explicitly, not inline vecs */
- bip->bip_max_vcnt = nr_vecs;
- if (nr_vecs > inline_vecs) {
- bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
- &bip->bip_max_vcnt, gfp_mask);
- if (!bip->bip_vec)
- goto err;
- } else if (nr_vecs) {
- bip->bip_vec = bip->bip_inline_vecs;
- }
-
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
- bio->bi_opf |= REQ_INTEGRITY;
-
- return bip;
-err:
- if (bs && mempool_initialized(&bs->bio_integrity_pool))
- mempool_free(bip, &bs->bio_integrity_pool);
- else
- kfree(bip);
- return ERR_PTR(-ENOMEM);
+ bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs);
+ return &bia->bip;
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -165,10 +127,8 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
- bool same_page = false;
- if (bvec_try_merge_hw_page(q, bv, page, len, offset,
- &same_page)) {
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
}
@@ -194,10 +154,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL(bio_integrity_add_page);
static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
- int nr_vecs, unsigned int len,
- unsigned int direction)
+ int nr_vecs, unsigned int len)
{
- bool write = direction == ITER_SOURCE;
+ bool write = op_is_write(bio_op(bio));
struct bio_integrity_payload *bip;
struct iov_iter iter;
void *buf;
@@ -208,7 +167,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
return -ENOMEM;
if (write) {
- iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, nr_vecs, len);
if (!copy_from_iter_full(buf, len, &iter)) {
ret = -EFAULT;
goto free_buf;
@@ -304,7 +263,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
size_t offset, bytes = iter->count;
- unsigned int direction, nr_bvecs;
+ unsigned int nr_bvecs;
int ret, nr_vecs;
bool copy;
@@ -313,11 +272,6 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
return -E2BIG;
- if (bio_data_dir(bio) == READ)
- direction = ITER_DEST;
- else
- direction = ITER_SOURCE;
-
nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
@@ -340,8 +294,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
copy = true;
if (copy)
- ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
- direction);
+ ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes);
else
ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
if (ret)
@@ -409,149 +362,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
}
/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Checks if the bio already has an integrity payload attached.
- * If it does, the payload has been generated by another kernel subsystem,
- * and we just pass it through. Otherwise allocates integrity payload.
- * The bio must have data direction, target device and start sector set priot
- * to calling. In the WRITE case, integrity metadata will be generated using
- * the block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-bool bio_integrity_prep(struct bio *bio)
-{
- struct bio_integrity_payload *bip;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- unsigned int len;
- void *buf;
- gfp_t gfp = GFP_NOIO;
-
- if (!bi)
- return true;
-
- if (!bio_sectors(bio))
- return true;
-
- /* Already protected? */
- if (bio_integrity(bio))
- return true;
-
- switch (bio_op(bio)) {
- case REQ_OP_READ:
- if (bi->flags & BLK_INTEGRITY_NOVERIFY)
- return true;
- break;
- case REQ_OP_WRITE:
- if (bi->flags & BLK_INTEGRITY_NOGENERATE)
- return true;
-
- /*
- * Zero the memory allocated to not leak uninitialized kernel
- * memory to disk for non-integrity metadata where nothing else
- * initializes the memory.
- */
- if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
- gfp |= __GFP_ZERO;
- break;
- default:
- return true;
- }
-
- /* Allocate kernel buffer for protection data */
- len = bio_integrity_bytes(bi, bio_sectors(bio));
- buf = kmalloc(len, gfp);
- if (unlikely(buf == NULL)) {
- goto err_end_io;
- }
-
- bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
- if (IS_ERR(bip)) {
- kfree(buf);
- goto err_end_io;
- }
-
- bip->bip_flags |= BIP_BLOCK_INTEGRITY;
- bip_set_seed(bip, bio->bi_iter.bi_sector);
-
- if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
- bip->bip_flags |= BIP_IP_CHECKSUM;
-
- /* describe what tags to check in payload */
- if (bi->csum_type)
- bip->bip_flags |= BIP_CHECK_GUARD;
- if (bi->flags & BLK_INTEGRITY_REF_TAG)
- bip->bip_flags |= BIP_CHECK_REFTAG;
- if (bio_integrity_add_page(bio, virt_to_page(buf), len,
- offset_in_page(buf)) < len) {
- printk(KERN_ERR "could not attach integrity payload\n");
- goto err_end_io;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- blk_integrity_generate(bio);
- else
- bip->bio_iter = bio->bi_iter;
- return true;
-
-err_end_io:
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return false;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
-
- blk_integrity_verify(bio);
-
- kfree(bvec_virt(bip->bip_vec));
- bio_integrity_free(bio);
- bio_endio(bio);
-}
-
-/**
- * __bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-bool __bio_integrity_endio(struct bio *bio)
-{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- struct bio_integrity_payload *bip = bio_integrity(bio);
-
- if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
- return false;
- }
-
- kfree(bvec_virt(bip->bip_vec));
- bio_integrity_free(bio);
- return true;
-}
-
-/**
* bio_integrity_advance - Advance integrity vector
* @bio: bio whose integrity vector to update
* @bytes_done: number of data bytes that have been completed
@@ -612,44 +422,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- if (mempool_initialized(&bs->bio_integrity_pool))
- return 0;
-
- if (mempool_init_slab_pool(&bs->bio_integrity_pool,
- pool_size, bip_slab))
- return -1;
-
- if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
- mempool_exit(&bs->bio_integrity_pool);
- return -1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- mempool_exit(&bs->bio_integrity_pool);
- mempool_exit(&bs->bvec_integrity_pool);
-}
-
-void __init bio_integrity_init(void)
-{
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- bip_slab = kmem_cache_create("bio_integrity_payload",
- sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIO_INLINE_VECS,
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-}
diff --git a/block/bio.c b/block/bio.c
index 6deea10b2cd3..3c0a558c90f5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -611,7 +612,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
struct bio *bio;
- if (nr_vecs > UIO_MAXIOV)
+ if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
@@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
@@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
}
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
+ unsigned int len, unsigned int off)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
- *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
- PAGE_MASK));
- if (!*same_page) {
+ if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
@@ -953,8 +953,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
* helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page)
+ struct page *page, unsigned len, unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = bvec_phys(bv);
@@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
return false;
if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
- return bvec_try_merge_page(bv, page, len, offset, same_page);
+ return bvec_try_merge_page(bv, page, len, offset);
}
/**
@@ -990,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
+ * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
+ * @bio: destination bio
+ * @vaddr: data to add
+ * @len: length of the data to add, may cross pages
+ *
+ * Add the data at @vaddr to @bio. The caller must have ensure a segment
+ * is available for the added data. No merging into an existing segment
+ * will be performed.
+ */
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
+{
+ __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
+}
+EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
+
+/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
* @page: start page to add
@@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- bool same_page = false;
-
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_iter.bi_size > UINT_MAX - len)
@@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
+ page, len, offset)) {
bio->bi_iter.bi_size += len;
return len;
}
@@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
}
EXPORT_SYMBOL(bio_add_folio);
+/**
+ * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio and return how many bytes were added.
+ * This may be less than the amount originally asked. Returns 0 if no data
+ * could be added to @bio.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
+{
+ unsigned int offset = offset_in_page(vaddr);
+
+ len = min(len, PAGE_SIZE - offset);
+ if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
+ return 0;
+ if (op_is_write(bio_op(bio)))
+ flush_kernel_vmap_range(vaddr, len);
+ return len;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
+
+/**
+ * bio_add_vmalloc - add a vmalloc region to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio. Return %true on success or %false if
+ * @bio does not have enough space for the payload.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
+{
+ do {
+ unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
+
+ if (!added)
+ return false;
+ vaddr += added;
+ len -= added;
+ } while (len);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc);
+
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct folio_iter fi;
@@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
- size_t offset)
-{
- bool same_page = false;
-
- if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
- return -EIO;
-
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- folio_page(folio, 0), len, offset,
- &same_page)) {
- bio->bi_iter.bi_size += len;
- if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
- unpin_user_folio(folio, 1);
- return 0;
- }
- bio_add_folio_nofail(bio, folio, len, offset);
- return 0;
-}
-
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
@@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
+ unsigned int old_vcnt = bio->bi_vcnt;
folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;
@@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
- bio_iov_add_folio(bio, folio, len, folio_offset);
+ if (!bio_add_folio(bio, folio, len, folio_offset)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+ /*
+ * We're adding another fragment of a page that already
+ * was part of the last segment. Undo our pin as the
+ * page was pinned when an earlier fragment of it was
+ * added to the bio and __bio_release_pages expects a
+ * single pin per page.
+ */
+ if (offset && bio->bi_vcnt == old_vcnt)
+ unpin_user_folio(folio, 1);
+ }
offset = 0;
}
@@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+/**
+ * bdev_rw_virt - synchronously read into / write from kernel mapping
+ * @bdev: block device to access
+ * @sector: sector to access
+ * @data: data to read/write
+ * @len: length in byte to read/write
+ * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
+ *
+ * Performs synchronous I/O to @bdev for @data/@len. @data must be in
+ * the kernel direct mapping and not a vmalloc address.
+ */
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+ size_t len, enum req_op op)
+{
+ struct bio_vec bv;
+ struct bio bio;
+ int error;
+
+ if (WARN_ON_ONCE(is_vmalloc_addr(data)))
+ return -EIO;
+
+ bio_init(&bio, bdev, &bv, 1, op);
+ bio.bi_iter.bi_sector = sector;
+ bio_add_virt_nofail(&bio, data, len);
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+ return error;
+}
+EXPORT_SYMBOL_GPL(bdev_rw_virt);
+
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -1660,7 +1754,6 @@ void bioset_exit(struct bio_set *bs)
mempool_exit(&bs->bio_pool);
mempool_exit(&bs->bvec_pool);
- bioset_integrity_free(bs);
if (bs->bio_slab)
bio_put_slab(bs);
bs->bio_slab = NULL;
@@ -1740,8 +1833,6 @@ static int __init init_bio(void)
BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
- bio_integrity_init();
-
for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
struct biovec_slab *bvs = bvec_slabs + i;
@@ -1757,9 +1848,6 @@ static int __init init_bio(void)
BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
- if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
- panic("bio: can't create integrity pool\n");
-
return 0;
}
subsys_initcall(init_bio);
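
bdev_rw_virt() gives drivers a one-call replacement for the open-coded on-stack bio pattern when reading or writing a small amount of metadata that lives in the kernel direct mapping. A minimal sketch, assuming a hypothetical my_read_meta() caller and a 4 KiB buffer; only bdev_rw_virt() and its signature come from this patch:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Sketch: synchronously read 4 KiB of on-disk metadata at @sector. */
static void *my_read_meta(struct block_device *bdev, sector_t sector)
{
	void *buf = kmalloc(4096, GFP_KERNEL);	/* must not be vmalloc'ed */
	int error;

	if (!buf)
		return ERR_PTR(-ENOMEM);

	error = bdev_rw_virt(bdev, sector, buf, 4096, REQ_OP_READ);
	if (error) {
		kfree(buf);
		return ERR_PTR(error);
	}
	return buf;
}

For vmalloc addresses, which bdev_rw_virt() explicitly rejects, the new bio_add_vmalloc()/bio_add_vmalloc_chunk() helpers perform the per-page splitting and flush_kernel_vmap_range() handling instead.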
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9ed93d91d754..5936db7f8475 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -659,6 +659,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
struct blkcg_gq *blkg;
int i;
+ pr_info_once("blkio.%s is deprecated\n", cftype->name);
mutex_lock(&blkcg_pol_mutex);
spin_lock_irq(&blkcg->lock);
@@ -796,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -EINVAL;
input = skip_spaces(input);
- bdev = blkdev_get_no_open(MKDEV(major, minor));
+ bdev = blkdev_get_no_open(MKDEV(major, minor), false);
if (!bdev)
return -ENODEV;
if (bdev_is_partition(bdev)) {
@@ -815,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
+/*
+ * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
+ * acquires q->elevator_lock, and ensures the correct locking order
+ * between q->elevator_lock and q->rq_qos_mutex.
+ *
+ * This function returns negative error on failure. On success it returns
+ * memflags which must be saved and later passed to blkg_conf_exit_frozen
+ * for restoring the memalloc scope.
+ */
+unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
+{
+ int ret;
+ unsigned long memflags;
+
+ if (ctx->bdev)
+ return -EINVAL;
+
+ ret = blkg_conf_open_bdev(ctx);
+ if (ret < 0)
+ return ret;
+ /*
+ * At this point, we haven’t started protecting anything related to QoS,
+ * so we release q->rq_qos_mutex here, which was first acquired in blkg_
+ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
+ * the queue and acquiring q->elevator_lock to maintain the correct
+ * locking order.
+ */
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
+ mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
+ mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ return memflags;
+}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
@@ -971,6 +1007,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
+/*
+ * Similar to blkg_conf_exit, but also unfreezes the queue and releases
+ * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
+ * is used to open the bdev.
+ */
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
+{
+ if (ctx->bdev) {
+ struct request_queue *q = ctx->bdev->bd_queue;
+
+ blkg_conf_exit(ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ }
+}
+
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
int i;
@@ -1022,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
/*
* For covering concurrent parent blkg update from blkg_release().
*
- * When flushing from cgroup, cgroup_rstat_lock is always held, so
- * this lock won't cause contention most of time.
+ * When flushing from cgroup, the subsystem rstat lock is always held,
+ * so this lock won't cause contention most of time.
*/
raw_spin_lock_irqsave(&blkg_stat_lock, flags);
@@ -1092,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
- * cgroups are defined. For that reason, cgroup_rstat_flush in
+ * cgroups are defined. For that reason, css_rstat_flush in
* blkcg_print_stat does not actually fill out the iostat in the root
* cgroup's blkcg_gq.
*
@@ -1201,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
if (!seq_css(sf)->parent)
blkcg_fill_root_iostats();
else
- cgroup_rstat_flush(blkcg->css.cgroup);
+ css_rstat_flush(&blkcg->css);
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -1727,27 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
+ /*
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
+ * without pd_alloc_fn/pd_free_fn can't be activated.
+ */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ return -EINVAL;
+
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
/* find an empty slot */
- ret = -ENOSPC;
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (!blkcg_policy[i])
break;
if (i >= BLKCG_MAX_POLS) {
pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
+ ret = -ENOSPC;
goto err_unlock;
}
- /*
- * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
- * without pd_alloc_fn/pd_free_fn can't be activated.
- */
- if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
- (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
- goto err_unlock;
-
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1758,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg_policy_data *cpd;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd)
+ if (!cpd) {
+ ret = -ENOMEM;
goto err_free_cpds;
+ }
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
@@ -1770,12 +1824,15 @@ int blkcg_policy_register(struct blkcg_policy *pol)
mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */
- if (pol->dfl_cftypes)
+ if (pol->dfl_cftypes == pol->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys,
+ pol->dfl_cftypes));
+ } else {
WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
pol->dfl_cftypes));
- if (pol->legacy_cftypes)
WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
pol->legacy_cftypes));
+ }
mutex_unlock(&blkcg_pol_register_mutex);
return 0;
@@ -2186,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio)
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
- cgroup_rstat_updated(blkcg->css.cgroup, cpu);
+ css_rstat_updated(&blkcg->css, cpu);
put_cpu();
}
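
The intended call pattern pairs the two new helpers around the configuration update, with the returned memflags carried from open to exit. A rough sketch of a cgroup file write handler; the handler name and error handling are illustrative, only the blkg_conf_* calls and their documented semantics come from this patch:

#include "blk-cgroup.h"

/* Sketch: apply a per-device setting with the queue frozen. */
static ssize_t my_policy_write(struct kernfs_open_file *of, char *buf,
			       size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	unsigned long memflags;

	blkg_conf_init(&ctx, buf);

	memflags = blkg_conf_open_bdev_frozen(&ctx);
	if (IS_ERR_VALUE(memflags)) {
		blkg_conf_exit(&ctx);
		return memflags;
	}

	/*
	 * The queue is frozen and elevator_lock/rq_qos_mutex are held here:
	 * apply the configuration to ctx.bdev->bd_queue.
	 */

	blkg_conf_exit_frozen(&ctx, memflags);
	return nbytes;
}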
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2c4663bd993a..81868ad86330 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -219,9 +219,11 @@ struct blkg_conf_ctx {
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
+unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-core.c b/block/blk-core.c
index d6c4fa3943b5..fdac48aec5ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL_GPL(blk_status_to_str);
*/
void blk_sync_queue(struct request_queue *q)
{
- del_timer_sync(&q->timeout);
+ timer_delete_sync(&q->timeout);
cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);
@@ -381,7 +381,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
static void blk_rq_timed_out_timer(struct timer_list *t)
{
- struct request_queue *q = from_timer(q, t, timeout);
+ struct request_queue *q = timer_container_of(q, t, timeout);
kblockd_schedule_work(&q->timeout_work);
}
@@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
+ mutex_init(&q->elevator_lock);
mutex_init(&q->sysfs_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
@@ -455,6 +456,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
&q->q_lock_cls_key, 0);
+ /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */
+ fs_reclaim_acquire(GFP_KERNEL);
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+ fs_reclaim_release(GFP_KERNEL);
+
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
@@ -1011,7 +1018,7 @@ again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp)) &&
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
- (end || part_in_flight(part)))
+ (end || bdev_count_inflight(part)))
__part_stat_add(part, io_ticks, now - stamp);
if (bdev_is_partition(part)) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 29a205482617..005c9157ffb3 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split;
* This is the key we set when evicting a keyslot. This *should* be the all 0's
* key, but AES-XTS rejects that key, so we use some random bytes instead.
*/
-static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
+static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
@@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
- err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes,
key->size);
if (err) {
blk_crypto_fallback_evict_keyslot(slot);
@@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -539,7 +540,7 @@ static int blk_crypto_fallback_init(void)
if (blk_crypto_fallback_inited)
return 0;
- get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+ get_random_bytes(blank_key, sizeof(blank_key));
err = bioset_init(&crypto_bio_split, 64, 0, 0);
if (err)
@@ -561,6 +562,7 @@ static int blk_crypto_fallback_init(void)
blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops;
blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+ blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 93a141979694..ccf6dff6ff6b 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -14,6 +14,7 @@ struct blk_crypto_mode {
const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
+ unsigned int security_strength; /* security strength in bytes */
unsigned int ivsize; /* iv size in bytes */
};
@@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
const struct blk_crypto_config *cfg);
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
@@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
return false;
}
+static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ return -ENOTTY;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 7fabc883e39f..81918f6e0cae 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
return false;
if (profile->max_dun_bytes_supported < cfg->dun_bytes)
return false;
+ if (!(profile->key_types_supported & cfg->key_type))
+ return false;
return true;
}
@@ -463,6 +465,103 @@ bool blk_crypto_register(struct blk_crypto_profile *profile,
EXPORT_SYMBOL_GPL(blk_crypto_register);
/**
+ * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key
+ * @bdev: a block device that supports hardware-wrapped keys
+ * @eph_key: a hardware-wrapped key in ephemerally-wrapped form
+ * @eph_key_size: size of @eph_key in bytes
+ * @sw_secret: (output) the software secret
+ *
+ * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that
+ * it is used for I/O), ask the hardware to derive the secret which software can
+ * use for cryptographic tasks other than inline encryption. This secret is
+ * guaranteed to be cryptographically isolated from the inline encryption key,
+ * i.e. derived with a different KDF context.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support
+ * hardware-wrapped keys, -EBADMSG if the key isn't a valid
+ * ephemerally-wrapped key, or another -errno code.
+ */
+int blk_crypto_derive_sw_secret(struct block_device *bdev,
+ const u8 *eph_key, size_t eph_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+ int err;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.derive_sw_secret)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size,
+ sw_secret);
+ blk_crypto_hw_exit(profile);
+ return err;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret);
+
+int blk_crypto_import_key(struct blk_crypto_profile *profile,
+ const u8 *raw_key, size_t raw_key_size,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.import_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size,
+ lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_import_key);
+
+int blk_crypto_generate_key(struct blk_crypto_profile *profile,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.generate_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.generate_key(profile, lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_generate_key);
+
+int blk_crypto_prepare_key(struct blk_crypto_profile *profile,
+ const u8 *lt_key, size_t lt_key_size,
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.prepare_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size,
+ eph_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_prepare_key);
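Taken together, import/generate and prepare form a two-step lifecycle: a long-term wrapped key is created once, and an ephemerally-wrapped key is derived from it for actual use. A minimal sketch of that flow, assuming the buffers and error handling shown here:

	u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
	u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
	int lt_len, eph_len;

	/* Once: wrap a raw key (blk_crypto_generate_key() is the alternative). */
	lt_len = blk_crypto_import_key(profile, raw_key, raw_key_size, lt_key);
	if (lt_len < 0)
		return lt_len;

	/* Per use: convert the long-term key into its ephemerally-wrapped form. */
	eph_len = blk_crypto_prepare_key(profile, lt_key, lt_len, eph_key);
	if (eph_len < 0)
		return eph_len;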
+
+/**
* blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
* by child device
* @parent: the crypto profile for the parent device
@@ -485,10 +584,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
child->max_dun_bytes_supported);
for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
parent->modes_supported[i] &= child->modes_supported[i];
+ parent->key_types_supported &= child->key_types_supported;
} else {
parent->max_dun_bytes_supported = 0;
memset(parent->modes_supported, 0,
sizeof(parent->modes_supported));
+ parent->key_types_supported = 0;
}
}
EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
@@ -521,6 +622,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
target->max_dun_bytes_supported)
return false;
+ if (reference->key_types_supported & ~target->key_types_supported)
+ return false;
+
return true;
}
EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
@@ -555,5 +659,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
sizeof(dst->modes_supported));
dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+ dst->key_types_supported = src->key_types_supported;
}
EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
index a304434489ba..e832f403f200 100644
--- a/block/blk-crypto-sysfs.c
+++ b/block/blk-crypto-sysfs.c
@@ -31,6 +31,13 @@ static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
return container_of(attr, struct blk_crypto_attr, attr);
}
+static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
{
@@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
return sysfs_emit(page, "%u\n", profile->num_slots);
}
+static ssize_t raw_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
#define BLK_CRYPTO_RO_ATTR(_name) \
static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+BLK_CRYPTO_RO_ATTR(hw_wrapped_keys);
BLK_CRYPTO_RO_ATTR(max_dun_bits);
BLK_CRYPTO_RO_ATTR(num_keyslots);
+BLK_CRYPTO_RO_ATTR(raw_keys);
+
+static umode_t blk_crypto_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ if (a == &hw_wrapped_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return 0;
+ if (a == &raw_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW))
+ return 0;
+
+ return 0444;
+}
static struct attribute *blk_crypto_attrs[] = {
+ &hw_wrapped_keys_attr.attr,
&max_dun_bits_attr.attr,
&num_keyslots_attr.attr,
+ &raw_keys_attr.attr,
NULL,
};
static const struct attribute_group blk_crypto_attr_group = {
.attrs = blk_crypto_attrs,
+ .is_visible = blk_crypto_is_visible,
};
/*
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 4d760b092deb..4b1ad84d1b5a 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = {
.name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
.name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
.name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
},
[BLK_ENCRYPTION_MODE_SM4_XTS] = {
.name = "SM4-XTS",
.cipher_str = "xts(sm4)",
.keysize = 32,
+ .security_strength = 16,
.ivsize = 16,
},
};
@@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void)
/* This is assumed in various places. */
BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
- /* Sanity check that no algorithm exceeds the defined limits. */
+ /*
+ * Validate the crypto mode properties. This ideally would be done with
+ * static assertions, but boot-time checks are the next best thing.
+ */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) {
- BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].keysize >
+ BLK_CRYPTO_MAX_RAW_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].security_strength >
+ blk_crypto_modes[i].keysize);
BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE);
}
@@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
/**
* blk_crypto_init_key() - Prepare a key for use with blk-crypto
* @blk_key: Pointer to the blk_crypto_key to initialize.
- * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
- * @crypto_mode; see blk_crypto_modes[].
+ * @key_bytes: the bytes of the key
+ * @key_size: size of the key in bytes
+ * @key_type: type of the key -- either raw or hardware-wrapped
* @crypto_mode: identifier for the encryption algorithm to use
* @dun_bytes: number of bytes that will be used to specify the DUN when this
* key is used
* @data_unit_size: the data unit size to use for en/decryption
*
* Return: 0 on success, -errno on failure. The caller is responsible for
- * zeroizing both blk_key and raw_key when done with them.
+ * zeroizing both blk_key and key_bytes when done with them.
*/
-int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+int blk_crypto_init_key(struct blk_crypto_key *blk_key,
+ const u8 *key_bytes, size_t key_size,
+ enum blk_crypto_key_type key_type,
enum blk_crypto_mode_num crypto_mode,
unsigned int dun_bytes,
unsigned int data_unit_size)
@@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return -EINVAL;
mode = &blk_crypto_modes[crypto_mode];
- if (mode->keysize == 0)
+ switch (key_type) {
+ case BLK_CRYPTO_KEY_TYPE_RAW:
+ if (key_size != mode->keysize)
+ return -EINVAL;
+ break;
+ case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED:
+ if (key_size < mode->security_strength ||
+ key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE)
+ return -EINVAL;
+ break;
+ default:
return -EINVAL;
+ }
if (dun_bytes == 0 || dun_bytes > mode->ivsize)
return -EINVAL;
@@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
blk_key->crypto_cfg.crypto_mode = crypto_mode;
blk_key->crypto_cfg.dun_bytes = dun_bytes;
blk_key->crypto_cfg.data_unit_size = data_unit_size;
+ blk_key->crypto_cfg.key_type = key_type;
blk_key->data_unit_size_bits = ilog2(data_unit_size);
- blk_key->size = mode->keysize;
- memcpy(blk_key->raw, raw_key, mode->keysize);
+ blk_key->size = key_size;
+ memcpy(blk_key->bytes, key_bytes, key_size);
return 0;
}
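For illustration, the extended signature distinguishes the two key types as shown below; the mode, DUN size, and data-unit size are assumed example values, not mandated by the patch:

	/* Raw key: key_size must equal the mode's keysize (64 for AES-256-XTS). */
	err = blk_crypto_init_key(&blk_key, raw_key, 64,
				  BLK_CRYPTO_KEY_TYPE_RAW,
				  BLK_ENCRYPTION_MODE_AES_256_XTS,
				  8, 4096);

	/*
	 * Hardware-wrapped key: any size from the mode's security strength up
	 * to BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE is accepted.
	 */
	err = blk_crypto_init_key(&blk_key, eph_key, eph_key_size,
				  BLK_CRYPTO_KEY_TYPE_HW_WRAPPED,
				  BLK_ENCRYPTION_MODE_AES_256_XTS,
				  8, 4096);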
@@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev,
bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
- return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- blk_crypto_config_supported_natively(bdev, cfg);
+ if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) &&
+ cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW)
+ return true;
+ return blk_crypto_config_supported_natively(bdev, cfg);
}
/**
@@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev,
* an skcipher, and *should not* be called from the data path, since that might
* cause a deadlock
*
- * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
- * blk-crypto-fallback is either disabled or the needed algorithm
- * is disabled in the crypto API; or another -errno code.
+ * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does
+ * not support wrapped keys; -ENOPKG if the key is a raw key but the
+ * hardware does not support raw keys and blk-crypto-fallback is either
+ * disabled or the needed algorithm is disabled in the crypto API; or
+ * another -errno code if something else went wrong.
*/
int blk_crypto_start_using_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
+ if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) {
+ pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev);
+ return -EOPNOTSUPP;
+ }
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev,
pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
}
EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
+
+static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_import_key_arg arg;
+ u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key))
+ return -EINVAL;
+
+ if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr),
+ arg.raw_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(raw_key, sizeof(raw_key));
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_generate_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ ret = blk_crypto_generate_key(profile, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_prepare_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.lt_key_size > sizeof(lt_key))
+ return -EINVAL;
+
+ if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr),
+ arg.lt_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.eph_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.eph_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key,
+ arg.eph_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ memzero_explicit(eph_key, sizeof(eph_key));
+ return ret;
+}
+
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+
+ switch (cmd) {
+ case BLKCRYPTOIMPORTKEY:
+ return blk_crypto_ioctl_import_key(profile, argp);
+ case BLKCRYPTOGENERATEKEY:
+ return blk_crypto_ioctl_generate_key(profile, argp);
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl_prepare_key(profile, argp);
+ default:
+ return -ENOTTY;
+ }
+}
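From userspace, these ioctls are driven through the *_arg structures referenced above. A hedged sketch of an import call; the fd, the raw-key buffer, and the 128-byte output buffer are assumptions of the example:

	struct blk_crypto_import_key_arg arg = { };
	__u8 lt_key[128];	/* example size; must fit the returned key */

	arg.raw_key_ptr  = (__u64)(uintptr_t)raw_key;
	arg.raw_key_size = raw_key_size;
	arg.lt_key_ptr   = (__u64)(uintptr_t)lt_key;
	arg.lt_key_size  = sizeof(lt_key);

	if (ioctl(fd, BLKCRYPTOIMPORTKEY, &arg) != 0)
		return -1;
	/* On success, arg.lt_key_size holds the long-term key's actual length. */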
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a72e2a83d075..43d6152897a4 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,9 +95,9 @@ static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, blk_opf_t flags);
static inline struct blk_flush_queue *
-blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
+blk_get_flush_queue(struct blk_mq_ctx *ctx)
{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+ return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_cur_seq(struct request *rq)
@@ -205,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
struct list_head *running;
struct request *rq, *n;
unsigned long flags = 0;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -341,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(ctx);
if (q->elevator) {
WARN_ON(rq->tag < 0);
@@ -382,7 +382,7 @@ static void blk_rq_init_flush(struct request *rq)
bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx);
bool supports_fua = q->limits.features & BLK_FEAT_FUA;
unsigned int policy = 0;
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index a1678f0a9f81..e4e2567061f9 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -117,13 +117,8 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
{
int ret;
struct iov_iter iter;
- unsigned int direction;
- if (op_is_write(req_op(rq)))
- direction = ITER_DEST;
- else
- direction = ITER_SOURCE;
- iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ iov_iter_ubuf(&iter, rq_data_dir(rq), ubuf, bytes);
ret = bio_integrity_map_user(rq->bio, &iter);
if (ret)
return ret;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 65a1d4427ccf..5bfd70311359 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2718,8 +2718,7 @@ retry_lock:
* All waiters are on iocg->waitq and the wait states are
* synchronized using waitq.lock.
*/
- init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
- wait.wait.private = current;
+ init_wait_func(&wait.wait, iocg_wake_fn);
wait.bio = bio;
wait.abs_cost = abs_cost;
wait.committed = false; /* will be set true by waker */
@@ -3004,8 +3003,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
iocg->hweight_inuse = WEIGHT_ONE;
init_waitqueue_head(&iocg->waitq);
- hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- iocg->waitq_timer.function = iocg_waitq_timer_fn;
+ hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
iocg->level = blkg->blkcg->css.cgroup->level;
@@ -3224,14 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
- unsigned int memflags;
+ unsigned long memflags;
int ret;
blkg_conf_init(&ctx, input);
- ret = blkg_conf_open_bdev(&ctx);
- if (ret)
+ memflags = blkg_conf_open_bdev_frozen(&ctx);
+ if (IS_ERR_VALUE(memflags)) {
+ ret = memflags;
goto err;
+ }
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3248,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
- memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
spin_lock_irq(&ioc->lock);
@@ -3348,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
wbt_enable_default(disk);
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue, memflags);
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
-
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue, memflags);
-
ret = -EINVAL;
err:
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return ret;
}
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 42c1e0b9a68f..2f8fdecdd7a9 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -658,7 +658,8 @@ static const struct rq_qos_ops blkcg_iolatency_ops = {
static void blkiolatency_timer_fn(struct timer_list *t)
{
- struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
+ struct blk_iolatency *blkiolat = timer_container_of(blkiolat, t,
+ timer);
struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css;
u64 now = blk_time_get_ns();
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 8fff7ccc0ac7..13659dc15c3f 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -113,27 +113,18 @@ static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
kfree(blkcg);
}
-#define IOPRIO_ATTRS \
- { \
- .name = "prio.class", \
- .seq_show = ioprio_show_prio_policy, \
- .write = ioprio_set_prio_policy, \
- }, \
- { } /* sentinel */
-
-/* cgroup v2 attributes */
static struct cftype ioprio_files[] = {
- IOPRIO_ATTRS
-};
-
-/* cgroup v1 attributes */
-static struct cftype ioprio_legacy_files[] = {
- IOPRIO_ATTRS
+ {
+ .name = "prio.class",
+ .seq_show = ioprio_show_prio_policy,
+ .write = ioprio_set_prio_policy,
+ },
+ { } /* sentinel */
};
static struct blkcg_policy ioprio_policy = {
.dfl_cftypes = ioprio_files,
- .legacy_cftypes = ioprio_legacy_files,
+ .legacy_cftypes = ioprio_files,
.cpd_alloc_fn = ioprio_alloc_cpd,
.cpd_free_fn = ioprio_free_cpd,
diff --git a/block/blk-map.c b/block/blk-map.c
index d2f22744b3d1..23e5d5ebe59e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio)
kfree(bio);
}
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-static struct bio *bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
+static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- bool is_vmalloc = is_vmalloc_addr(data);
- struct page *page;
- int offset, i;
+ unsigned int nr_vecs = bio_add_max_vecs(data, len);
struct bio *bio;
- bio = bio_kmalloc(nr_pages, gfp_mask);
+ bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
-
- if (is_vmalloc) {
- flush_kernel_vmap_range(data, len);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
+ if (is_vmalloc_addr(data)) {
bio->bi_private = data;
- }
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (!is_vmalloc)
- page = virt_to_page(data);
- else
- page = vmalloc_to_page(data);
- if (bio_add_page(bio, page, bytes, offset) < bytes) {
- /* we don't support partial mappings */
+ if (!bio_add_vmalloc(bio, data, len)) {
bio_uninit(bio);
kfree(bio);
return ERR_PTR(-EINVAL);
}
-
- data += bytes;
- len -= bytes;
- offset = 0;
+ } else {
+ bio_add_virt_nofail(bio, data, len);
}
-
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
@@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio)
/**
* bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
* @data: pointer to buffer to copy
* @len: length in bytes
+ * @op: bio/request operation
* @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
*
* copy the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
-static struct bio *bio_copy_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask, int reading)
+static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
while (len) {
struct page *page;
@@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (!page)
goto cleanup;
- if (!reading)
+ if (op_is_write(op))
memcpy(page_address(page), p, bytes);
if (bio_add_page(bio, page, bytes, 0) < bytes)
@@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
p += bytes;
}
- if (reading) {
+ if (op_is_write(op)) {
+ bio->bi_end_io = bio_copy_kern_endio;
+ } else {
bio->bi_end_io = bio_copy_kern_endio_read;
bio->bi_private = data;
- } else {
- bio->bi_end_io = bio_copy_kern_endio;
}
return bio;
@@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
if (map_data)
copy = true;
- else if (blk_queue_may_bounce(q))
- copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
else if (iov_iter_is_bvec(iter))
@@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
- * @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
@@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
-int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
- unsigned int len, gfp_t gfp_mask)
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+ gfp_t gfp_mask)
{
- int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
struct bio *bio;
int ret;
- if (len > (queue_max_hw_sectors(q) << 9))
+ if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
- if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
- blk_queue_may_bounce(q))
- bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
+ if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf))
+ bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask);
else
- bio = bio_map_kern(q, kbuf, len, gfp_mask);
+ bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
- bio->bi_opf &= ~REQ_OP_MASK;
- bio->bi_opf |= req_op(rq);
-
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
bio_uninit(bio);
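Callers of blk_rq_map_kern() drop the now-redundant queue argument, and the operation is taken from the request itself, so the old bi_opf fixup is gone as well. A minimal before/after sketch with driver-side names assumed:

	/* before: err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL); */
	err = blk_rq_map_kern(rq, buf, len, GFP_KERNEL);
	if (err)
		goto out_put_request;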
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1d1589c35297..70d704615be5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -7,7 +7,6 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
-#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>
@@ -226,27 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio,
}
/**
- * get_max_segment_size() - maximum number of bytes to add as a single segment
- * @lim: Request queue limits.
- * @paddr: address of the range to add
- * @len: maximum length available to add at @paddr
- *
- * Returns the maximum number of bytes of the range starting at @paddr that can
- * be added to a single segment.
- */
-static inline unsigned get_max_segment_size(const struct queue_limits *lim,
- phys_addr_t paddr, unsigned int len)
-{
- /*
- * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
- * after having calculated the minimum.
- */
- return min_t(unsigned long, len,
- min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
- (unsigned long)lim->max_segment_size - 1) + 1);
-}
-
-/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
@@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return nr_phys_segs;
}
-struct phys_vec {
- phys_addr_t paddr;
- u32 len;
-};
-
-static bool blk_map_iter_next(struct request *req,
- struct req_iterator *iter, struct phys_vec *vec)
-{
- unsigned int max_size;
- struct bio_vec bv;
-
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- if (!iter->bio)
- return false;
- vec->paddr = bvec_phys(&req->special_vec);
- vec->len = req->special_vec.bv_len;
- iter->bio = NULL;
- return true;
- }
-
- if (!iter->iter.bi_size)
- return false;
-
- bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
- vec->paddr = bvec_phys(&bv);
- max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
- bv.bv_len = min(bv.bv_len, max_size);
- bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
-
- /*
- * If we are entirely done with this bi_io_vec entry, check if the next
- * one could be merged into it. This typically happens when moving to
- * the next bio, but some callers also don't pack bvecs tight.
- */
- while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
- struct bio_vec next;
-
- if (!iter->iter.bi_size) {
- if (!iter->bio->bi_next)
- break;
- iter->bio = iter->bio->bi_next;
- iter->iter = iter->bio->bi_iter;
- }
-
- next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
- if (bv.bv_len + next.bv_len > max_size ||
- !biovec_phys_mergeable(req->q, &bv, &next))
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
- }
-
- vec->len = bv.bv_len;
- return true;
-}
-
-static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
- struct scatterlist *sglist)
-{
- if (!*sg)
- return sglist;
-
- /*
- * If the driver previously mapped a shorter list, we could see a
- * termination bit prematurely unless it fully inits the sg table
- * on each mapping. We KNOW that there must be more entries here
- * or the driver would be buggy, so force clear the termination bit
- * to avoid doing a full sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- return sg_next(*sg);
-}
-
-/*
- * Map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries.
- */
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist, struct scatterlist **last_sg)
-{
- struct req_iterator iter = {
- .bio = rq->bio,
- };
- struct phys_vec vec;
- int nsegs = 0;
-
- /* the internal flush request may not have bio attached */
- if (iter.bio)
- iter.iter = iter.bio->bi_iter;
-
- while (blk_map_iter_next(rq, &iter, &vec)) {
- *last_sg = blk_next_sg(last_sg, sglist);
- sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
- offset_in_page(vec.paddr));
- nsegs++;
- }
-
- if (*last_sg)
- sg_mark_end(*last_sg);
-
- /*
- * Something must have been wrong if the figured number of
- * segment is bigger than number of req's physical segments
- */
- WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
-
- return nsegs;
-}
-EXPORT_SYMBOL(__blk_rq_map_sg);
-
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
@@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
+ if (req->bio->bi_write_stream != next->bio->bi_write_stream)
+ return NULL;
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
@@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
+ if (rq->bio->bi_write_stream != bio->bi_write_stream)
+ return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
@@ -1127,20 +998,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
if (!plug || rq_list_empty(&plug->mq_list))
return false;
- rq_list_for_each(&plug->mq_list, rq) {
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
- break;
- }
+ rq = plug->mq_list.tail;
+ if (rq->q == q)
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK;
+ else if (!plug->multiple_queues)
+ return false;
- /*
- * Only keep iterating plug list for merges if we have multiple
- * queues
- */
- if (!plug->multiple_queues)
- break;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q != q)
+ continue;
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
}
return false;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index adf5f0697b6b..bdcb27ab5606 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -93,6 +93,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
+ QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
+ QUEUE_FLAG_NAME(NO_ELV_SWITCH),
+ QUEUE_FLAG_NAME(QOS_ENABLED),
};
#undef QUEUE_FLAG_NAME
@@ -347,9 +350,14 @@ static int hctx_busy_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
struct show_busy_params params = { .m = m, .hctx = hctx };
+ int res;
+ res = mutex_lock_interruptible(&hctx->queue->elevator_lock);
+ if (res)
+ return res;
blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
&params);
+ mutex_unlock(&hctx->queue->elevator_lock);
return 0;
}
@@ -400,15 +408,14 @@ static int hctx_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
blk_mq_debugfs_tags_show(m, hctx->tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
@@ -417,15 +424,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_show(void *data, struct seq_file *m)
@@ -434,15 +440,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
blk_mq_debugfs_tags_show(m, hctx->sched_tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
@@ -451,15 +456,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_active_show(void *data, struct seq_file *m)
@@ -623,20 +627,9 @@ void blk_mq_debugfs_register(struct request_queue *q)
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
- /*
- * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
- * didn't exist yet (because we don't know what to name the directory
- * until the queue is registered to a gendisk).
- */
- if (q->elevator && !q->sched_debugfs_dir)
- blk_mq_debugfs_register_sched(q);
-
- /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->debugfs_dir)
blk_mq_debugfs_register_hctx(q, hctx);
- if (q->elevator && !hctx->sched_debugfs_dir)
- blk_mq_debugfs_register_sched_hctx(q, hctx);
}
if (q->rq_qos) {
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
new file mode 100644
index 000000000000..82bae475dfa4
--- /dev/null
+++ b/block/blk-mq-dma.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Christoph Hellwig
+ */
+#include "blk.h"
+
+struct phys_vec {
+ phys_addr_t paddr;
+ u32 len;
+};
+
+static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
+ struct phys_vec *vec)
+{
+ unsigned int max_size;
+ struct bio_vec bv;
+
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ if (!iter->bio)
+ return false;
+ vec->paddr = bvec_phys(&req->special_vec);
+ vec->len = req->special_vec.bv_len;
+ iter->bio = NULL;
+ return true;
+ }
+
+ if (!iter->iter.bi_size)
+ return false;
+
+ bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ vec->paddr = bvec_phys(&bv);
+ max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+ bv.bv_len = min(bv.bv_len, max_size);
+ bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+
+ /*
+ * If we are entirely done with this bi_io_vec entry, check if the next
+ * one could be merged into it. This typically happens when moving to
+ * the next bio, but some callers also don't pack bvecs tight.
+ */
+ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
+ struct bio_vec next;
+
+ if (!iter->iter.bi_size) {
+ if (!iter->bio->bi_next)
+ break;
+ iter->bio = iter->bio->bi_next;
+ iter->iter = iter->bio->bi_iter;
+ }
+
+ next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ if (bv.bv_len + next.bv_len > max_size ||
+ !biovec_phys_mergeable(req->q, &bv, &next))
+ break;
+
+ bv.bv_len += next.bv_len;
+ bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ }
+
+ vec->len = bv.bv_len;
+ return true;
+}
+
+static inline struct scatterlist *
+blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
+{
+ if (!*sg)
+ return sglist;
+
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
+}
+
+/*
+ * Map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries.
+ */
+int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
+ struct scatterlist **last_sg)
+{
+ struct req_iterator iter = {
+ .bio = rq->bio,
+ };
+ struct phys_vec vec;
+ int nsegs = 0;
+
+ /* the internal flush request may not have bio attached */
+ if (iter.bio)
+ iter.iter = iter.bio->bi_iter;
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ *last_sg = blk_next_sg(last_sg, sglist);
+ sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ nsegs++;
+ }
+
+ if (*last_sg)
+ sg_mark_end(*last_sg);
+
+ /*
+	 * Something must have gone wrong if the computed number of segments
+	 * is bigger than the request's number of physical segments
+ */
+ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
+
+ return nsegs;
+}
+EXPORT_SYMBOL(__blk_rq_map_sg);
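The relocated helper keeps its semantics; only the request_queue parameter is gone. A driver-side sketch of the updated call, with the sglist sizing and names assumed for illustration:

	struct scatterlist *last_sg = NULL;
	int nsegs;

	sg_init_table(sglist, blk_rq_nr_phys_segments(rq));
	nsegs = __blk_rq_map_sg(rq, sglist, &last_sg);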
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 7442ca27c2bf..e2ce4a28e6c9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
struct request *rq;
LIST_HEAD(hctx_list);
- unsigned int count = 0;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
goto dispatch;
}
- count++;
}
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
- return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
} while (!list_empty(&rq_list));
} else {
- dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
}
if (busy)
@@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+ if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
return 0;
need_dispatch = true;
} else {
@@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (need_dispatch)
return blk_mq_do_dispatch_ctx(hctx);
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, true);
return 0;
}
@@ -349,7 +347,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
}
ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ hctx = blk_mq_map_queue(bio->bi_opf, ctx);
type = hctx->type;
if (list_empty_careful(&ctx->rq_lists[type]))
goto out_put;
@@ -376,68 +374,177 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+/* called in queue's release handler, tagset has gone away */
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
- if (blk_mq_is_shared_tags(q->tag_set->flags)) {
- hctx->sched_tags = q->sched_shared_tags;
- return 0;
- }
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
- q->nr_requests);
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = NULL;
- if (!hctx->sched_tags)
- return -ENOMEM;
- return 0;
+ if (blk_mq_is_shared_tags(flags))
+ q->sched_shared_tags = NULL;
}
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
+void blk_mq_sched_reg_debugfs(struct request_queue *q)
{
- blk_mq_free_rq_map(queue->sched_shared_tags);
- queue->sched_shared_tags = NULL;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched(q);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
}
-/* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
+void blk_mq_sched_unreg_debugfs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags) {
- if (!blk_mq_is_shared_tags(flags))
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
+ mutex_lock(&q->debugfs_mutex);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_unregister_sched_hctx(hctx);
+ blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+}
+
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
+{
+ unsigned long i;
+
+ /* Shared tags are stored at index 0 in @tags. */
+ if (blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+ else {
+ for (i = 0; i < et->nr_hw_queues; i++)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
}
- if (blk_mq_is_shared_tags(flags))
- blk_mq_exit_sched_shared_tags(q);
+ kfree(et);
}
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set)
{
- struct blk_mq_tag_set *set = queue->tag_set;
+ struct request_queue *q;
+ struct elevator_tags *et;
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we hold set->update_nr_hwq_lock here in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ et = xa_load(et_table, q->id);
+ if (unlikely(!et))
+ WARN_ON_ONCE(1);
+ else
+ blk_mq_free_sched_tags(et, set);
+ }
+ }
+}
+
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues)
+{
+ unsigned int nr_tags;
+ int i;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+ if (blk_mq_is_shared_tags(set->flags))
+ nr_tags = 1;
+ else
+ nr_tags = nr_hw_queues;
+
+ et = kmalloc(sizeof(struct elevator_tags) +
+ nr_tags * sizeof(struct blk_mq_tags *), gfp);
+ if (!et)
+ return NULL;
/*
- * Set initial depth at max so that we don't need to reallocate for
- * updating nr_requests.
+	 * Default to twice the smaller of the hardware queue_depth and
+ * 128, since we don't split into sync/async like the old code
+ * did. Additionally, this is a per-hw queue depth.
*/
- queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
- BLK_MQ_NO_HCTX_IDX,
- MAX_SCHED_RQ);
- if (!queue->sched_shared_tags)
- return -ENOMEM;
+ et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+ BLKDEV_DEFAULT_RQ);
+ et->nr_hw_queues = nr_hw_queues;
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ /* Shared tags are stored at index 0 in @tags. */
+ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!et->tags[0])
+ goto out;
+ } else {
+ for (i = 0; i < et->nr_hw_queues; i++) {
+ et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ et->nr_requests);
+ if (!et->tags[i])
+ goto out_unwind;
+ }
+ }
- blk_mq_tag_update_sched_shared_tags(queue);
+ return et;
+out_unwind:
+ while (--i >= 0)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+ kfree(et);
+ return NULL;
+}
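For a concrete feel of that default: with a hardware queue depth of 1023 and the usual BLKDEV_DEFAULT_RQ of 128, et->nr_requests works out to 2 * min(1023, 128) = 256 scheduler requests per hardware queue.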
+
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+ struct request_queue *q;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+		 * safe because we hold set->update_nr_hwq_lock here in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+ if (!et)
+ goto out_unwind;
+ if (xa_insert(et_table, q->id, et, gfp))
+ goto out_free_tags;
+ }
+ }
return 0;
+out_free_tags:
+ blk_mq_free_sched_tags(et, set);
+out_unwind:
+ list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+ if (q->elevator) {
+ et = xa_load(et_table, q->id);
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+ }
+ }
+ return -ENOMEM;
}
/* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *et)
{
unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
@@ -445,56 +552,44 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
- /*
- * Default to double of smaller one between hw queue_depth and 128,
- * since we don't split into sync/async like the old code did.
- * Additionally, this is a per-hw queue depth.
- */
- q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_DEFAULT_RQ);
+ eq = elevator_alloc(q, e, et);
+ if (!eq)
+ return -ENOMEM;
+
+ q->nr_requests = et->nr_requests;
if (blk_mq_is_shared_tags(flags)) {
- ret = blk_mq_init_sched_shared_tags(q);
- if (ret)
- return ret;
+ /* Shared tags are stored at index 0 in @et->tags. */
+ q->sched_shared_tags = et->tags[0];
+ blk_mq_tag_update_sched_shared_tags(q);
}
queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
- if (ret)
- goto err_free_map_and_rqs;
+ if (blk_mq_is_shared_tags(flags))
+ hctx->sched_tags = q->sched_shared_tags;
+ else
+ hctx->sched_tags = et->tags[i];
}
- ret = e->ops.init_sched(q, e);
+ ret = e->ops.init_sched(q, eq);
if (ret)
- goto err_free_map_and_rqs;
-
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched(q);
- mutex_unlock(&q->debugfs_mutex);
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
if (ret) {
- eq = q->elevator;
- blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
}
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched_hctx(q, hctx);
- mutex_unlock(&q->debugfs_mutex);
}
-
return 0;
-err_free_map_and_rqs:
- blk_mq_sched_free_rqs(q);
+out:
blk_mq_sched_tags_teardown(q, flags);
-
+ kobject_put(&eq->kobj);
q->elevator = NULL;
return ret;
}
@@ -527,10 +622,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched_hctx(hctx);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
@@ -538,12 +629,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
flags = hctx->flags;
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched(q);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
+ set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
q->elevator = NULL;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1326526bb733..b554e1d55950 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set);
+
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 3feeeccf8a99..24656980f443 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
if (!entry->show)
return -EIO;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
res = entry->show(hctx, page);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return res;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b9f417d980b4..d880c50629d6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -190,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
sbitmap_finish_wait(bt, ws, &wait);
data->ctx = blk_mq_get_ctx(data->q);
- data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
- data->ctx);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 005c520d3498..355db0abe44b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv)
+static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
@@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part)
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-
- return mi.inflight[0] + mi.inflight[1];
-}
-
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2])
-{
- struct mq_inflight mi = { .part = part };
-
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
- inflight[0] = mi.inflight[0];
- inflight[1] = mi.inflight[1];
+ blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
+ &mi);
+ inflight[READ] = mi.inflight[READ];
+ inflight[WRITE] = mi.inflight[WRITE];
}
#ifdef CONFIG_LOCKDEP
@@ -508,7 +498,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
- data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
if (q->elevator) {
/*
@@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = plug->nr_ios,
.cached_rqs = &plug->cached_rqs,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
int ret;
@@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
u64 alloc_time_ns = 0;
struct request *rq;
@@ -2080,7 +2084,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
- unsigned int nr_budgets)
+ bool get_budget)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
@@ -2102,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
- prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ prep = blk_mq_prep_dispatch_rq(rq, get_budget);
if (prep != PREP_DISPATCH_OK)
break;
@@ -2111,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bd.rq = rq;
bd.last = list_empty(list);
- /*
- * once the request is queued to lld, no need to cover the
- * budget any more
- */
- if (nr_budgets)
- nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
@@ -2150,7 +2148,11 @@ out:
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
- if (nr_budgets)
+ /*
+ * If the caller allocated budgets, free the budgets of the
+ * requests that have not yet been passed to the block driver.
+ */
+ if (!get_budget)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
@@ -2778,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
return __blk_mq_issue_directly(hctx, rq, last);
}
-static void blk_mq_plug_issue_direct(struct blk_plug *plug)
+static void blk_mq_issue_direct(struct rq_list *rqs)
{
struct blk_mq_hw_ctx *hctx = NULL;
struct request *rq;
int queued = 0;
blk_status_t ret = BLK_STS_OK;
- while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(&plug->mq_list);
+ while ((rq = rq_list_pop(rqs))) {
+ bool last = rq_list_empty(rqs);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2817,15 +2819,64 @@ out:
blk_mq_commit_rqs(hctx, queued, false);
}
-static void __blk_mq_flush_plug_list(struct request_queue *q,
- struct blk_plug *plug)
+static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
{
if (blk_queue_quiesced(q))
return;
- q->mq_ops->queue_rqs(&plug->mq_list);
+ q->mq_ops->queue_rqs(rqs);
+}
+
+static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
+ struct rq_list *queue_rqs)
+{
+ struct request *rq = rq_list_pop(rqs);
+ struct request_queue *this_q = rq->q;
+ struct request **prev = &rqs->head;
+ struct rq_list matched_rqs = {};
+ struct request *last = NULL;
+ unsigned depth = 1;
+
+ rq_list_add_tail(&matched_rqs, rq);
+ while ((rq = *prev)) {
+ if (rq->q == this_q) {
+ /* move rq from rqs to matched_rqs */
+ *prev = rq->rq_next;
+ rq_list_add_tail(&matched_rqs, rq);
+ depth++;
+ } else {
+ /* leave rq in rqs */
+ prev = &rq->rq_next;
+ last = rq;
+ }
+ }
+
+ rqs->tail = last;
+ *queue_rqs = matched_rqs;
+ return depth;
}
-static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
+{
+ struct request_queue *q = rq_list_peek(rqs)->q;
+
+ trace_block_unplug(q, depth, true);
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole list in one go.
+ * We already know at this point that all requests belong to the
+	 * same queue; the caller must ensure that's the case.
+ */
+ if (q->mq_ops->queue_rqs) {
+ blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
+ if (rq_list_empty(rqs))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
+}
+
+static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
@@ -2835,7 +2886,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
LIST_HEAD(list);
do {
- struct request *rq = rq_list_pop(&plug->mq_list);
+ struct request *rq = rq_list_pop(rqs);
if (!this_hctx) {
this_hctx = rq->mq_hctx;
@@ -2848,9 +2899,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
}
list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(&plug->mq_list));
+ } while (!rq_list_empty(rqs));
- plug->mq_list = requeue_list;
+ *rqs = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
@@ -2870,9 +2921,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
+static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
+{
+ do {
+ struct rq_list queue_rqs;
+ unsigned depth;
+
+ depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
+ blk_mq_dispatch_queue_requests(&queue_rqs, depth);
+ while (!rq_list_empty(&queue_rqs))
+ blk_mq_dispatch_list(&queue_rqs, false);
+ } while (!rq_list_empty(rqs));
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct request *rq;
unsigned int depth;
/*
@@ -2887,34 +2950,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
depth = plug->rq_count;
plug->rq_count = 0;
- if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q;
-
- rq = rq_list_peek(&plug->mq_list);
- q = rq->q;
- trace_block_unplug(q, depth, true);
-
- /*
- * Peek first request and see if we have a ->queue_rqs() hook.
- * If we do, we can dispatch the whole plug list in one go. We
- * already know at this point that all requests belong to the
- * same queue, caller must ensure that's the case.
- */
- if (q->mq_ops->queue_rqs) {
- blk_mq_run_dispatch_ops(q,
- __blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(&plug->mq_list))
- return;
+ if (!plug->has_elevator && !from_schedule) {
+ if (plug->multiple_queues) {
+ blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
+ return;
}
- blk_mq_run_dispatch_ops(q,
- blk_mq_plug_issue_direct(plug));
+ blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
if (rq_list_empty(&plug->mq_list))
return;
}
do {
- blk_mq_dispatch_plug_list(plug, from_schedule);
+ blk_mq_dispatch_list(&plug->mq_list, from_schedule);
} while (!rq_list_empty(&plug->mq_list));
}
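
A minimal userspace sketch of the same-queue extraction performed by blk_mq_extract_queue_requests() above: walk the singly linked list with a pointer-to-pointer, splice out every entry whose queue matches the first one, and leave the remainder (including the tail pointer) intact. struct req, qid and the rql_* helpers are invented stand-ins for illustration, not kernel types.

#include <stdio.h>
#include <stddef.h>

struct req {
	struct req *next;
	int qid;			/* stand-in for rq->q */
};

struct rq_list {
	struct req *head;
	struct req *tail;
};

static void rql_add_tail(struct rq_list *l, struct req *r)
{
	r->next = NULL;
	if (l->tail)
		l->tail->next = r;
	else
		l->head = r;
	l->tail = r;
}

/*
 * Move every request whose qid matches the first entry's qid from @rqs
 * into @matched, preserving order, and return how many were moved.
 * Mirrors the prev-pointer walk in blk_mq_extract_queue_requests().
 */
static unsigned extract_queue_requests(struct rq_list *rqs,
				       struct rq_list *matched)
{
	struct req *rq = rqs->head;
	int this_q = rq->qid;
	struct req **prev = &rqs->head;
	struct req *last = NULL;
	unsigned depth = 0;

	matched->head = matched->tail = NULL;
	while ((rq = *prev)) {
		if (rq->qid == this_q) {
			*prev = rq->next;	/* unlink from rqs */
			rql_add_tail(matched, rq);
			depth++;
		} else {
			prev = &rq->next;	/* keep in rqs */
			last = rq;
		}
	}
	rqs->tail = last;
	return depth;
}

int main(void)
{
	struct req r[5] = { {0, 1}, {0, 2}, {0, 1}, {0, 3}, {0, 1} };
	struct rq_list rqs = {0}, matched;

	for (int i = 0; i < 5; i++)
		rql_add_tail(&rqs, &r[i]);

	printf("moved %u requests for queue %d\n",
	       extract_queue_requests(&rqs, &matched), r[0].qid);
	for (struct req *p = rqs.head; p; p = p->next)
		printf("left behind: qid=%d\n", p->qid);
	return 0;
}
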
@@ -2965,13 +3013,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
static struct request *blk_mq_get_new_requests(struct request_queue *q,
struct blk_plug *plug,
- struct bio *bio,
- unsigned int nsegs)
+ struct bio *bio)
{
struct blk_mq_alloc_data data = {
.q = q,
- .nr_tags = 1,
+ .flags = 0,
+ .shallow_depth = 0,
.cmd_flags = bio->bi_opf,
+ .rq_flags = 0,
+ .nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -3081,8 +3134,6 @@ void blk_mq_submit_bio(struct bio *bio)
goto new_request;
}
- bio = blk_queue_bounce(bio, q);
-
/*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
@@ -3118,14 +3169,16 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
if (rq) {
blk_mq_use_cached_rq(rq, plug, bio);
} else {
- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+ rq = blk_mq_get_new_requests(q, plug, bio);
if (unlikely(!rq)) {
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
@@ -4461,14 +4514,12 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
return NULL;
}
-static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
- struct request_queue *q)
+static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i, j;
- /* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4501,7 +4552,12 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
+}
+
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
+{
+ __blk_mq_realloc_hw_ctxs(set, q);
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
@@ -4550,8 +4606,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
+ blk_mq_add_queue_tag_set(set, q);
return 0;
err_hctxs:
@@ -4771,6 +4827,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
goto out_free_srcu;
}
+ init_rwsem(&set->update_nr_hwq_lock);
+
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
@@ -4911,91 +4969,69 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
}
/*
- * request_queue and elevator_type pair.
- * It is just used by __blk_mq_update_nr_hw_queues to cache
- * the elevator_type associated with a request_queue.
+ * Switch back to the elevator type stored in the xarray.
*/
-struct blk_mq_qe_pair {
- struct list_head node;
- struct request_queue *q;
- struct elevator_type *type;
-};
-
-/*
- * Cache the elevator_type in qe pair list and switch the
- * io scheduler to 'none'
- */
-static bool blk_mq_elv_switch_none(struct list_head *head,
- struct request_queue *q)
+static void blk_mq_elv_switch_back(struct request_queue *q,
+ struct xarray *elv_tbl, struct xarray *et_tbl)
{
- struct blk_mq_qe_pair *qe;
+ struct elevator_type *e = xa_load(elv_tbl, q->id);
+ struct elevator_tags *t = xa_load(et_tbl, q->id);
- qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
- if (!qe)
- return false;
+ /* The elv_update_nr_hw_queues unfreezes the queue. */
+ elv_update_nr_hw_queues(q, e, t);
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
-
- /* the check has to be done with holding sysfs_lock */
- if (!q->elevator) {
- kfree(qe);
- goto unlock;
- }
-
- INIT_LIST_HEAD(&qe->node);
- qe->q = q;
- qe->type = q->elevator->type;
- /* keep a reference to the elevator module as we'll switch back */
- __elevator_get(qe->type);
- list_add(&qe->node, head);
- elevator_disable(q);
-unlock:
- mutex_unlock(&q->sysfs_lock);
-
- return true;
+ /* Drop the reference acquired in blk_mq_elv_switch_none. */
+ if (e)
+ elevator_put(e);
}
-static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
- struct request_queue *q)
+/*
+ * Store the elevator type in the xarray and set the current elevator to
+ * 'none'. q->id is used as the index into the xarray.
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+ struct xarray *elv_tbl)
{
- struct blk_mq_qe_pair *qe;
+ int ret = 0;
- list_for_each_entry(qe, head, node)
- if (qe->q == q)
- return qe;
+ lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
- return NULL;
-}
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe here
+ * because we're called from nr_hw_queue update which is protected by
+ * set->update_nr_hwq_lock in the writer context. So, scheduler update/
+ * switch code (which acquires the same lock in the reader context)
+ * can't run concurrently.
+ */
+ if (q->elevator) {
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
- struct elevator_type *t;
+ ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(ret))
+ return ret;
- qe = blk_lookup_qe_pair(head, q);
- if (!qe)
- return;
- t = qe->type;
- list_del(&qe->node);
- kfree(qe);
+ /*
+		 * Before we switch the elevator to 'none', take a reference to
+		 * the elevator module so that it cannot be removed while the
+		 * nr_hw_queues update is running. The reference is put later,
+		 * when we switch back to the original elevator.
+ */
+ __elevator_get(q->elevator->type);
- mutex_lock(&q->sysfs_lock);
- elevator_switch(q, t);
- /* drop the reference acquired in blk_mq_elv_switch_none */
- elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
+ elevator_set_none(q);
+ }
+ return ret;
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
- LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
+ struct xarray elv_tbl, et_tbl;
+ bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -5007,8 +5043,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue_nomemsave(q);
+
+ xa_init(&et_tbl);
+ if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
+ goto out_memalloc_restore;
+
+ xa_init(&elv_tbl);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_sysfs_unregister_hctxs(q);
+ }
/*
* Switch IO scheduler to 'none', cleaning up the data associated
@@ -5016,21 +5061,19 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
- if (!blk_mq_elv_switch_none(&head, q))
+ if (blk_mq_elv_switch_none(q, &elv_tbl))
goto switch_back;
- list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_debugfs_unregister_hctxs(q);
- blk_mq_sysfs_unregister_hctxs(q);
- }
-
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+ queues_frozen = true;
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
- goto reregister;
+ goto switch_back;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_realloc_hw_ctxs(set, q);
+ __blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -5045,19 +5088,26 @@ fallback:
}
blk_mq_map_swqueue(q);
}
+switch_back:
+	/* blk_mq_elv_switch_back() unfreezes the queue for us. */
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /* switch_back expects queue to be frozen */
+ if (!queues_frozen)
+ blk_mq_freeze_queue_nomemsave(q);
+ blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
+ }
-reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
- }
-switch_back:
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_elv_switch_back(&head, q);
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+ }
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue_nomemrestore(q);
+ xa_destroy(&elv_tbl);
+ xa_destroy(&et_tbl);
+out_memalloc_restore:
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
@@ -5067,9 +5117,11 @@ switch_back:
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
+ down_write(&set->update_nr_hwq_lock);
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
+ up_write(&set->update_nr_hwq_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
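
The nr_hw_queues update above now stashes each queue's elevator type in an xarray keyed by q->id before switching the queue to 'none', and restores it on the switch-back path (the elevator_tags table in et_tbl is handled the same way but is omitted here). A rough userspace model of that save/restore, using a plain array in place of the xarray and invented types, might look like:

#include <stdio.h>
#include <assert.h>

#define MAX_QUEUES 8

struct elevator_type { const char *name; };

struct queue {
	int id;
	struct elevator_type *elevator;	/* NULL means 'none' */
};

/* stand-in for the elv_tbl xarray, indexed by q->id */
static struct elevator_type *saved[MAX_QUEUES];

static int switch_none(struct queue *q)
{
	if (!q->elevator)
		return 0;		/* nothing to save */
	assert(!saved[q->id]);		/* xa_insert() would refuse a reused index */
	saved[q->id] = q->elevator;	/* remember the current type */
	q->elevator = NULL;		/* models elevator_set_none() */
	return 0;
}

static void switch_back(struct queue *q)
{
	struct elevator_type *e = saved[q->id];	/* models xa_load() */

	q->elevator = e;		/* stays NULL if nothing was saved */
	saved[q->id] = NULL;
}

int main(void)
{
	struct elevator_type mq_deadline = { "mq-deadline" };
	struct queue q0 = { 0, &mq_deadline }, q1 = { 1, NULL };

	switch_none(&q0);
	switch_none(&q1);
	/* ... remap hardware queues while both run with no scheduler ... */
	switch_back(&q0);
	switch_back(&q1);

	printf("q0: %s, q1: %s\n",
	       q0.elevator ? q0.elevator->name : "none",
	       q1.elevator ? q1.elevator->name : "none");
	return 0;
}
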
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 44979e92b79f..affb2e14b56e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
- unsigned int);
+ bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -100,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
- * @q: request queue
* @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- blk_opf_t opf,
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
struct blk_mq_ctx *ctx)
{
return ctx->hctxs[blk_mq_get_hctx_type(opf)];
@@ -248,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2]);
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d4d4f4dc0e23..654478dfbc20 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
struct rq_qos_wait_data {
struct wait_queue_entry wq;
- struct task_struct *task;
struct rq_wait *rqw;
acquire_inflight_cb_t *cb;
void *private_data;
@@ -218,7 +217,20 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
- wake_up_process(data->task);
+ /*
+ * autoremove_wake_function() removes the wait entry only when it
+	 * actually changed the task state. We want the wait entry always
+	 * removed, so remove it explicitly and use default_wake_function().
+ */
+ default_wake_function(curr, mode, wake_flags, key);
+ /*
+ * Note that the order of operations is important as finish_wait()
+ * tests whether @curr is removed without grabbing the lock. This
+ * should be the last thing to do to make sure we will not have a
+	 * UAF access to @data. The memory barrier it implies also makes sure
+	 * the waiter will see the latest @data->got_token once
+	 * list_empty_careful() in finish_wait() returns true.
+ */
list_del_init_careful(&curr->entry);
return 1;
}
@@ -244,41 +256,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
cleanup_cb_t *cleanup_cb)
{
struct rq_qos_wait_data data = {
- .wq = {
- .func = rq_qos_wake_function,
- .entry = LIST_HEAD_INIT(data.wq.entry),
- },
- .task = current,
- .rqw = rqw,
- .cb = acquire_inflight_cb,
- .private_data = private_data,
+ .rqw = rqw,
+ .cb = acquire_inflight_cb,
+ .private_data = private_data,
+ .got_token = false,
};
- bool has_sleeper;
+ bool first_waiter;
- has_sleeper = wq_has_sleeper(&rqw->wait);
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
+ /*
+ * If there are no waiters in the waiting queue, try to increase the
+ * inflight counter if we can. Otherwise, prepare for adding ourselves
+ * to the waiting queue.
+ */
+ if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
return;
- has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+ init_wait_func(&data.wq, rq_qos_wake_function);
+ first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
+ /*
+ * Make sure there is at least one inflight process; otherwise, waiters
+ * will never be woken up. Since there may be no inflight process before
+ * adding ourselves to the waiting queue above, we need to try to
+	 * increase the inflight counter for ourselves. It is sufficient that
+	 * at least the first waiter to enter the waiting queue re-checks the
+	 * waiting condition before going to sleep, as that guarantees forward
+	 * progress.
+ */
+ if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
+ finish_wait(&rqw->wait, &data.wq);
+ /*
+ * We raced with rq_qos_wake_function() getting a token,
+ * which means we now have two. Put our local token
+ * and wake anyone else potentially waiting for one.
+ *
+		 * The memory barrier in list_empty_careful() in
+		 * finish_wait() is paired with list_del_init_careful()
+		 * in rq_qos_wake_function() to make sure we see
+		 * the latest @data->got_token.
+ */
+ if (data.got_token)
+ cleanup_cb(rqw, private_data);
+ return;
+ }
+
+ /* we are now relying on the waker to increase our inflight counter. */
do {
- /* The memory barrier in set_current_state saves us here. */
if (data.got_token)
break;
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
- finish_wait(&rqw->wait, &data.wq);
-
- /*
- * We raced with rq_qos_wake_function() getting a token,
- * which means we now have two. Put our local token
- * and wake anyone else potentially waiting for one.
- */
- if (data.got_token)
- cleanup_cb(rqw, private_data);
- return;
- }
io_schedule();
- has_sleeper = true;
set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
@@ -292,6 +318,7 @@ void rq_qos_exit(struct request_queue *q)
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
}
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
mutex_unlock(&q->rq_qos_mutex);
}
@@ -317,6 +344,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
+ blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
@@ -347,6 +375,8 @@ void rq_qos_del(struct rq_qos *rqos)
break;
}
}
+ if (!q->rq_qos)
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
mutex_lock(&q->debugfs_mutex);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -112,41 +112,58 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos && !blk_rq_is_passthrough(rq))
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
- if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
- bio_flagged(bio, BIO_QOS_MERGED))) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
- }
+ struct request_queue *q;
+
+ if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) &&
+ !bio_flagged(bio, BIO_QOS_MERGED)))
+ return;
+
+ q = bdev_get_queue(bio->bi_bdev);
+
+ /*
+ * A BIO may carry BIO_QOS_* flags even if the associated request_queue
+ * does not have rq_qos enabled. This can happen with stacked block
+ * devices — for example, NVMe multipath, where it's possible that the
+ * bottom device has QoS enabled but the top device does not. Therefore,
+ * always verify that q->rq_qos is present and QoS is enabled before
+ * calling __rq_qos_done_bio().
+ */
+ if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@@ -155,14 +172,16 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@@ -170,7 +189,8 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
- if (q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
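
The rq_qos helpers above now test QUEUE_FLAG_QOS_ENABLED before dereferencing q->rq_qos, so a queue that never attaches a policy pays only a single bit test on the hot path. A small userspace model of that gate, with invented stand-in types rather than the kernel's, is:

#include <stdio.h>
#include <stdbool.h>

struct rq_qos {
	struct rq_qos *next;
	void (*issue)(struct rq_qos *rqos);
};

struct queue {
	bool qos_enabled;	/* models QUEUE_FLAG_QOS_ENABLED */
	struct rq_qos *rq_qos;
};

/* hot path: one predictable test before walking the policy chain */
static inline void qos_issue(struct queue *q)
{
	if (!q->qos_enabled || !q->rq_qos)
		return;
	for (struct rq_qos *r = q->rq_qos; r; r = r->next)
		r->issue(r);
}

/* slow path: keep the flag in sync as policies come and go */
static void qos_add(struct queue *q, struct rq_qos *rqos)
{
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;
	q->qos_enabled = true;
}

static void qos_exit(struct queue *q)
{
	q->rq_qos = NULL;
	q->qos_enabled = false;
}

static void say_hi(struct rq_qos *r) { (void)r; puts("qos hook ran"); }

int main(void)
{
	struct rq_qos wbt = { .issue = say_hi };
	struct queue q = { 0 };

	qos_issue(&q);		/* flag clear: nothing happens */
	qos_add(&q, &wbt);
	qos_issue(&q);		/* flag set: hook runs */
	qos_exit(&q);
	qos_issue(&q);		/* gated off again */
	return 0;
}
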
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66721afeea54..44dabc636a59 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -21,7 +21,7 @@
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
- q->rq_timeout = timeout;
+ WRITE_ONCE(q->rq_timeout, timeout);
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
@@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
+ *
+ * There is no hardware limitation for the read-ahead size and the user
+ * might have increased the read-ahead size through sysfs, so don't ever
+ * decrease it.
*/
- bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ bdi->ra_pages = max3(bdi->ra_pages,
+ lim->io_opt * 2 / PAGE_SIZE,
+ VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
@@ -175,6 +181,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
+ unsigned int atomic_write_hw_max_sectors =
+ lim->atomic_write_hw_max >> SECTOR_SHIFT;
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
@@ -196,6 +204,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
lim->atomic_write_hw_max))
goto unsupported;
+ if (WARN_ON_ONCE(lim->chunk_sectors &&
+ atomic_write_hw_max_sectors > lim->chunk_sectors))
+ goto unsupported;
+
boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
if (boundary_sectors) {
@@ -330,12 +342,19 @@ int blk_validate_limits(struct queue_limits *lim)
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
+ /*
+ * When discard is not supported, discard_granularity should be reported
+ * as 0 to userspace.
+ */
+ if (lim->max_discard_sectors)
+ lim->discard_granularity =
+ max(lim->discard_granularity, lim->physical_block_size);
+ else
+ lim->discard_granularity = 0;
+
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
- if (lim->discard_granularity < lim->physical_block_size)
- lim->discard_granularity = lim->physical_block_size;
-
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
@@ -773,7 +792,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
}
/* chunk_sectors a multiple of the physical block size? */
- if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
+ if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) {
t->chunk_sectors = 0;
t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
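
The chunk_sectors check above switches from a power-of-two bitmask to a plain modulo against the physical block size expressed in sectors. A standalone arithmetic check (not kernel code, sample values made up) showing the two forms agreeing for power-of-two block sizes:

#include <stdio.h>

#define SECTOR_SHIFT 9

/* old test: byte count of the chunk masked against (pbs - 1) */
static unsigned old_misaligned(unsigned chunk_sectors, unsigned pbs)
{
	return (chunk_sectors << SECTOR_SHIFT) & (pbs - 1);
}

/* new test: chunk in sectors modulo pbs in sectors */
static unsigned new_misaligned(unsigned chunk_sectors, unsigned pbs)
{
	return chunk_sectors % (pbs >> SECTOR_SHIFT);
}

int main(void)
{
	/* pairs of (chunk_sectors, physical_block_size in bytes) */
	unsigned cases[][2] = { {65528, 4096}, {7, 4096}, {256, 512} };

	for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		unsigned c = cases[i][0], pbs = cases[i][1];

		printf("chunk=%u pbs=%u  old=%u new=%u -> %s\n",
		       c, pbs, old_misaligned(c, pbs), new_misaligned(c, pbs),
		       new_misaligned(c, pbs) ? "misaligned" : "ok");
	}
	return 0;
}
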
diff --git a/block/blk-stat.c b/block/blk-stat.c
index eaf60097bbe1..682a8ddb1173 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -76,7 +76,7 @@ void blk_stat_add(struct request *rq, u64 now)
static void blk_stat_timer_fn(struct timer_list *t)
{
- struct blk_stat_callback *cb = from_timer(cb, t, timer);
+ struct blk_stat_callback *cb = timer_container_of(cb, t, timer);
unsigned int bucket;
int cpu;
@@ -162,7 +162,7 @@ void blk_stat_remove_callback(struct request_queue *q,
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
- del_timer_sync(&cb->timer);
+ timer_delete_sync(&cb->timer);
}
static void blk_stat_free_callback_rcu(struct rcu_head *head)
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 5d7f18ba436d..9e05bf18d1be 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -148,7 +148,7 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
{
- del_timer_sync(&cb->timer);
+ timer_delete_sync(&cb->timer);
}
/**
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 780218684907..7d1a03c36502 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -23,10 +23,11 @@
struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct gendisk *disk, char *page);
+ ssize_t (*show_limit)(struct gendisk *disk, char *page);
+
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
int (*store_limit)(struct gendisk *disk, const char *page,
size_t count, struct queue_limits *lim);
- void (*load_module)(struct gendisk *disk, const char *page, size_t count);
};
static ssize_t
@@ -52,7 +53,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
static ssize_t queue_requests_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->queue->nr_requests, page);
+ ssize_t ret;
+
+ mutex_lock(&disk->queue->elevator_lock);
+ ret = queue_var_show(disk->queue->nr_requests, page);
+ mutex_unlock(&disk->queue->elevator_lock);
+ return ret;
}
static ssize_t
@@ -60,27 +66,38 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
- if (!queue_is_mq(disk->queue))
+ if (!queue_is_mq(q))
return -EINVAL;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
err = blk_mq_update_nr_requests(disk->queue, nr);
if (err)
- return err;
-
+ ret = err;
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
static ssize_t queue_ra_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ ssize_t ret;
+
+ mutex_lock(&disk->queue->limits_lock);
+ ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ mutex_unlock(&disk->queue->limits_lock);
+
+ return ret;
}
static ssize_t
@@ -88,11 +105,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
+ /*
+ * ->ra_pages is protected by ->limits_lock because it is usually
+ * calculated from the queue limits by queue_limits_commit_update.
+ */
+ mutex_lock(&q->limits_lock);
+ memflags = blk_mq_freeze_queue(q);
disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+ mutex_unlock(&q->limits_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+
return ret;
}
@@ -106,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
+QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
+QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -238,8 +268,9 @@ static ssize_t queue_poll_show(struct gendisk *disk, char *page)
{
if (queue_is_mq(disk->queue))
return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
+
return sysfs_emit(page, "%u\n",
- !!(disk->queue->limits.features & BLK_FEAT_POLL));
+ !!(disk->queue->limits.features & BLK_FEAT_POLL));
}
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
@@ -286,17 +317,21 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
size_t count)
{
unsigned long nm;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
- blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue);
+ memflags = blk_mq_freeze_queue(q);
+ blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+ blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -316,11 +351,19 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
#ifdef CONFIG_SMP
struct request_queue *q = disk->queue;
unsigned long val;
+ unsigned int memflags;
ret = queue_var_store(&val, page, count);
if (ret < 0)
return ret;
+ /*
+	 * Here we update two queue flags, each using atomic bitops. Although
+	 * updating the two flags isn't atomic as a whole, it should be harmless
+	 * because the flags are accessed individually using atomic test_bit
+	 * operations, so we don't grab any lock while updating them.
+ */
+ memflags = blk_mq_freeze_queue(q);
if (val == 2) {
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
@@ -331,6 +374,7 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
+ blk_mq_unfreeze_queue(q, memflags);
#endif
return ret;
}
@@ -344,29 +388,43 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
size_t count)
{
- if (!(disk->queue->limits.features & BLK_FEAT_POLL))
- return -EINVAL;
+ unsigned int memflags;
+ ssize_t ret = count;
+ struct request_queue *q = disk->queue;
+
+ memflags = blk_mq_freeze_queue(q);
+ if (!(q->limits.features & BLK_FEAT_POLL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
- return count;
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ return ret;
}
static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
{
- return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
+ return sysfs_emit(page, "%u\n",
+ jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout)));
}
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
size_t count)
{
- unsigned int val;
+ unsigned int val, memflags;
int err;
+ struct request_queue *q = disk->queue;
err = kstrtou32(page, 10, &val);
if (err || val == 0)
return -EINVAL;
- blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val));
+ memflags = blk_mq_freeze_queue(q);
+ blk_queue_rq_timeout(q, msecs_to_jiffies(val));
+ blk_mq_unfreeze_queue(q, memflags);
return count;
}
@@ -412,57 +470,57 @@ static struct queue_sysfs_entry _prefix##_entry = { \
.store = _prefix##_store, \
};
-#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
- .attr = { .name = _name, .mode = 0644 }, \
- .show = _prefix##_show, \
- .store_limit = _prefix##_store, \
+ .attr = { .name = _name, .mode = 0444 }, \
+ .show_limit = _prefix##_show, \
}
-#define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \
-static struct queue_sysfs_entry _prefix##_entry = { \
+#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
- .show = _prefix##_show, \
- .load_module = _prefix##_load_module, \
- .store = _prefix##_store, \
+ .show_limit = _prefix##_show, \
+ .store_limit = _prefix##_store, \
}
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
-QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
-QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
-QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
-
-QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
-QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
-QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
-QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
-QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
-
-QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
-QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
-QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
+QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
+QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
+QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+
+QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
+QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
+QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");
+
+QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
-QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors,
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
"atomic_write_boundary_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
-QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
-QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
-QUEUE_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
-QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
-QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
@@ -470,16 +528,16 @@ QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
-QUEUE_RO_ENTRY(queue_fua, "fua");
-QUEUE_RO_ENTRY(queue_dax, "dax");
+QUEUE_LIM_RO_ENTRY(queue_fua, "fua");
+QUEUE_LIM_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
-QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
-QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
+QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
- .attr = {.name = "hw_sector_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
+ .attr = {.name = "hw_sector_size", .mode = 0444 },
+ .show_limit = queue_logical_block_size_show,
};
QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
@@ -503,14 +561,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page)
static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
{
- if (!wbt_rq_qos(disk->queue))
- return -EINVAL;
+ ssize_t ret;
+ struct request_queue *q = disk->queue;
- if (wbt_disabled(disk->queue))
- return sysfs_emit(page, "0\n");
+ mutex_lock(&disk->rqos_state_mutex);
+ if (!wbt_rq_qos(q)) {
+ ret = -EINVAL;
+ goto out;
+ }
- return sysfs_emit(page, "%llu\n",
- div_u64(wbt_get_min_lat(disk->queue), 1000));
+ if (wbt_disabled(q)) {
+ ret = sysfs_emit(page, "0\n");
+ goto out;
+ }
+
+ ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
+out:
+ mutex_unlock(&disk->rqos_state_mutex);
+ return ret;
}
static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
@@ -520,6 +588,7 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
struct rq_qos *rqos;
ssize_t ret;
s64 val;
+ unsigned int memflags;
ret = queue_var_store64(&val, page);
if (ret < 0)
@@ -527,20 +596,23 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
if (val < -1)
return -EINVAL;
+ memflags = blk_mq_freeze_queue(q);
+
rqos = wbt_rq_qos(q);
if (!rqos) {
ret = wbt_init(disk);
if (ret)
- return ret;
+ goto out;
}
+ ret = count;
if (val == -1)
val = wbt_default_latency_nsec(q);
else if (val >= 0)
val *= 1000ULL;
if (wbt_get_min_lat(q) == val)
- return count;
+ goto out;
/*
* Ensure that the queue is idled, in case the latency update
@@ -549,11 +621,15 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
*/
blk_mq_quiesce_queue(q);
+ mutex_lock(&disk->rqos_state_mutex);
wbt_set_min_lat(q, val);
+ mutex_unlock(&disk->rqos_state_mutex);
blk_mq_unquiesce_queue(q);
+out:
+ blk_mq_unfreeze_queue(q, memflags);
- return count;
+ return ret;
}
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
@@ -561,13 +637,17 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
- &queue_ra_entry.attr,
+ /*
+	 * Attributes which are protected by q->limits_lock.
+ */
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
+ &queue_max_write_streams_entry.attr,
+ &queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@@ -577,44 +657,58 @@ static struct attribute *queue_attrs[] = {
&queue_discard_granularity_entry.attr,
&queue_max_discard_sectors_entry.attr,
&queue_max_hw_discard_sectors_entry.attr,
- &queue_discard_zeroes_data_entry.attr,
&queue_atomic_write_max_sectors_entry.attr,
&queue_atomic_write_boundary_sectors_entry.attr,
&queue_atomic_write_unit_min_entry.attr,
&queue_atomic_write_unit_max_entry.attr,
- &queue_write_same_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
&queue_zoned_entry.attr,
- &queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
- &queue_nomerges_entry.attr,
&queue_iostats_passthrough_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
&queue_add_random_entry.attr,
- &queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
&queue_dax_entry.attr,
- &queue_poll_delay_entry.attr,
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
+ &queue_ra_entry.attr,
+
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_discard_zeroes_data_entry.attr,
+ &queue_write_same_max_entry.attr,
+ &queue_nr_zones_entry.attr,
+ &queue_nomerges_entry.attr,
+ &queue_poll_entry.attr,
+ &queue_poll_delay_entry.attr,
+
NULL,
};
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
- &queue_requests_entry.attr,
+ /*
+ * Attributes which require some form of locking other than
+ * q->sysfs_lock.
+ */
&elv_iosched_entry.attr,
- &queue_rq_affinity_entry.attr,
- &queue_io_timeout_entry.attr,
+ &queue_requests_entry.attr,
#ifdef CONFIG_BLK_WBT
&queue_wb_lat_entry.attr,
#endif
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_rq_affinity_entry.attr,
+ &queue_io_timeout_entry.attr,
+
NULL,
};
@@ -664,14 +758,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
- ssize_t res;
- if (!entry->show)
+ if (!entry->show && !entry->show_limit)
return -EIO;
- mutex_lock(&disk->queue->sysfs_lock);
- res = entry->show(disk, page);
- mutex_unlock(&disk->queue->sysfs_lock);
- return res;
+
+ if (entry->show_limit) {
+ ssize_t res;
+
+ mutex_lock(&disk->queue->limits_lock);
+ res = entry->show_limit(disk, page);
+ mutex_unlock(&disk->queue->limits_lock);
+ return res;
+ }
+
+ return entry->show(disk, page);
}
static ssize_t
@@ -681,21 +781,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
- unsigned int memflags;
- ssize_t res;
if (!entry->store_limit && !entry->store)
return -EIO;
- /*
- * If the attribute needs to load a module, do it before freezing the
- * queue to ensure that the module file can be read when the request
- * queue is the one for the device storing the module file.
- */
- if (entry->load_module)
- entry->load_module(disk, page, length);
-
if (entry->store_limit) {
+ ssize_t res;
+
struct queue_limits lim = queue_limits_start_update(q);
res = entry->store_limit(disk, page, length, &lim);
@@ -710,12 +802,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
return length;
}
- mutex_lock(&q->sysfs_lock);
- memflags = blk_mq_freeze_queue(q);
- res = entry->store(disk, page, length);
- blk_mq_unfreeze_queue(q, memflags);
- mutex_unlock(&q->sysfs_lock);
- return res;
+ return entry->store(disk, page, length);
}
static const struct sysfs_ops queue_sysfs_ops = {
@@ -734,7 +821,7 @@ static void blk_queue_release(struct kobject *kobj)
/* nothing to do here, all data is associated with the parent gendisk */
}
-static const struct kobj_type blk_queue_ktype = {
+const struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_queue_release,
@@ -762,15 +849,14 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
- kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
- goto out_put_queue_kobj;
+ return ret;
if (queue_is_mq(q)) {
ret = blk_mq_sysfs_register(disk);
if (ret)
- goto out_put_queue_kobj;
+ goto out_del_queue_kobj;
}
mutex_lock(&q->sysfs_lock);
@@ -784,15 +870,12 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_debugfs_remove;
- if (q->elevator) {
- ret = elv_register_queue(q, false);
- if (ret)
- goto out_unregister_ia_ranges;
- }
-
ret = blk_crypto_sysfs_register(disk);
if (ret)
- goto out_elv_unregister;
+ goto out_unregister_ia_ranges;
+
+ if (queue_is_mq(q))
+ elevator_set_default(q);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
@@ -817,8 +900,6 @@ int blk_register_queue(struct gendisk *disk)
return ret;
-out_elv_unregister:
- elv_unregister_queue(q);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -826,8 +907,8 @@ out_debugfs_remove:
mutex_unlock(&q->sysfs_lock);
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
-out_put_queue_kobj:
- kobject_put(&disk->queue_kobj);
+out_del_queue_kobj:
+ kobject_del(&disk->queue_kobj);
return ret;
}
@@ -867,7 +948,6 @@ void blk_unregister_queue(struct gendisk *disk)
blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
- elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
@@ -875,5 +955,8 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
+ if (queue_is_mq(q))
+ elevator_set_none(q);
+
blk_debugfs_remove(disk);
}
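
queue_attr_show() now distinguishes plain ->show attributes, which run without a queue-wide lock, from ->show_limit attributes that read the limits under q->limits_lock. A compact userspace model of that dispatch follows; the struct disk fields, the attribute table and the pthread mutex are illustrative stand-ins only.

#include <stdio.h>
#include <pthread.h>

struct disk {
	pthread_mutex_t limits_lock;
	unsigned logical_block_size;	/* a queue_limits field */
	unsigned io_timeout_ms;		/* not a limit, read locklessly */
};

struct attr {
	const char *name;
	int (*show)(struct disk *d, char *buf);		/* no locking */
	int (*show_limit)(struct disk *d, char *buf);	/* limits_lock held */
};

static int show_lbs(struct disk *d, char *buf)
{ return sprintf(buf, "%u\n", d->logical_block_size); }

static int show_timeout(struct disk *d, char *buf)
{ return sprintf(buf, "%u\n", d->io_timeout_ms); }

/* mirrors queue_attr_show(): pick the lock based on which hook is set */
static int attr_show(struct disk *d, const struct attr *a, char *buf)
{
	int res;

	if (!a->show && !a->show_limit)
		return -1;
	if (a->show_limit) {
		pthread_mutex_lock(&d->limits_lock);
		res = a->show_limit(d, buf);
		pthread_mutex_unlock(&d->limits_lock);
		return res;
	}
	return a->show(d, buf);
}

int main(void)
{
	struct disk d = { PTHREAD_MUTEX_INITIALIZER, 512, 30000 };
	struct attr lbs = { "logical_block_size", NULL, show_lbs };
	struct attr tmo = { "io_timeout", show_timeout, NULL };
	char buf[32];

	attr_show(&d, &lbs, buf); printf("%s: %s", lbs.name, buf);
	attr_show(&d, &tmo, buf); printf("%s: %s", tmo.name, buf);
	return 0;
}
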
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a52f0d6b40ad..397b6a410f9e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios_bps);
+ bio_list_init(&qn->bios_iops);
qn->tg = tg;
}
@@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
* throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
* @bio: bio being added
* @qn: qnode to add bio to
- * @queued: the service_queue->queued[] list @qn belongs to
+ * @sq: the service_queue @qn belongs to
*
- * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
* @qn->tg's reference count is bumped when @qn is activated. See the
* comment on top of throtl_qnode definition for details.
*/
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
- struct list_head *queued)
+ struct throtl_service_queue *sq)
{
- bio_list_add(&qn->bios, bio);
+ bool rw = bio_data_dir(bio);
+
+ /*
+ * Split bios have already been throttled by bps, so they are
+ * directly queued into the iops path.
+ */
+ if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ bio_list_add(&qn->bios_iops, bio);
+ sq->nr_queued_iops[rw]++;
+ } else {
+ bio_list_add(&qn->bios_bps, bio);
+ sq->nr_queued_bps[rw]++;
+ }
+
if (list_empty(&qn->node)) {
- list_add_tail(&qn->node, queued);
+ list_add_tail(&qn->node, &sq->queued[rw]);
blkg_get(tg_to_blkg(qn->tg));
}
}
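
Each qnode now carries two bio lists: one for bios that still owe a bps budget and one for bios that have already been bps-throttled and only wait on iops, and peek/pop always prefer the iops list. A small userspace model of that routing, with invented bio and list types, is:

#include <stdio.h>
#include <stdbool.h>

struct bio {
	struct bio *next;
	int id;
	bool bps_throttled;	/* models BIO_[TG_]BPS_THROTTLED */
};

struct bio_list { struct bio *head, *tail; };

static void bl_add(struct bio_list *l, struct bio *b)
{
	b->next = NULL;
	if (l->tail) l->tail->next = b; else l->head = b;
	l->tail = b;
}

static struct bio *bl_pop(struct bio_list *l)
{
	struct bio *b = l->head;
	if (b && !(l->head = b->next))
		l->tail = NULL;
	return b;
}

struct qnode { struct bio_list bios_bps, bios_iops; };

/* bios that already passed the bps stage go straight to the iops list */
static void qnode_add_bio(struct qnode *qn, struct bio *b)
{
	bl_add(b->bps_throttled ? &qn->bios_iops : &qn->bios_bps, b);
}

/* dispatch prefers the iops list; the bps list is only a staging area */
static struct bio *qnode_pop_bio(struct qnode *qn)
{
	struct bio *b = bl_pop(&qn->bios_iops);
	return b ? b : bl_pop(&qn->bios_bps);
}

int main(void)
{
	struct bio a = { .id = 1, .bps_throttled = false };
	struct bio b = { .id = 2, .bps_throttled = true };
	struct qnode qn = { 0 };
	struct bio *p;

	qnode_add_bio(&qn, &a);
	qnode_add_bio(&qn, &b);
	while ((p = qnode_pop_bio(&qn)))
		printf("popped bio %d\n", p->id);	/* 2 then 1 */
	return 0;
}
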
@@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
+ *
+ * Always take a bio from the head of the iops queue first. If that queue is
+ * empty, take one from the bps queue instead, so that bios are still fetched
+ * from the head overall.
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = bio_list_peek(&qn->bios_iops);
+ if (!bio)
+ bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio);
return bio;
}
/**
* throtl_pop_queued - pop the first bio form a qnode list
- * @queued: the qnode list to pop a bio from
+ * @sq: the service_queue to pop a bio from
* @tg_to_put: optional out argument for throtl_grp to put
+ * @rw: read/write
*
- * Pop the first bio from the qnode list @queued. After popping, the first
- * qnode is removed from @queued if empty or moved to the end of @queued so
- * that the popping order is round-robin.
+ * Pop the first bio from the qnode list @sq->queued. Note that the iops list
+ * is checked first, because bios are ultimately dispatched from it.
+ * After popping, the first qnode is removed from @sq->queued if empty or moved
+ * to the end of @sq->queued so that the popping order is round-robin.
*
* When the first qnode is removed, its associated throtl_grp should be put
* too. If @tg_to_put is NULL, this function automatically puts it;
* otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
* responsible for putting it.
*/
-static struct bio *throtl_pop_queued(struct list_head *queued,
- struct throtl_grp **tg_to_put)
+static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
+ struct throtl_grp **tg_to_put, bool rw)
{
+ struct list_head *queued = &sq->queued[rw];
struct throtl_qnode *qn;
struct bio *bio;
@@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = bio_list_pop(&qn->bios_iops);
+ if (bio) {
+ sq->nr_queued_iops[rw]--;
+ } else {
+ bio = bio_list_pop(&qn->bios_bps);
+ if (bio)
+ sq->nr_queued_bps[rw]--;
+ }
WARN_ON_ONCE(!bio);
- if (bio_list_empty(&qn->bios)) {
+ if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -333,7 +364,7 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
- del_timer_sync(&tg->service_queue.pending_timer);
+ timer_delete_sync(&tg->service_queue.pending_timer);
blkg_rwstat_exit(&tg->stat_bytes);
blkg_rwstat_exit(&tg->stat_ios);
kfree(tg);
@@ -478,8 +509,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
@@ -498,16 +527,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
}
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
- bool clear_carryover)
+ bool clear)
{
- tg->bytes_disp[rw] = 0;
- tg->io_disp[rw] = 0;
+ if (clear) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ }
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- if (clear_carryover) {
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
- }
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -524,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
+ if (!time_before(tg->slice_end[rw], jiffy_end))
+ return;
+
throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
@@ -540,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
+static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
+{
+ return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
+}
+
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
@@ -575,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
+static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ long long bytes_trim;
+
+ if (bps_limit == U64_MAX)
+ return 0;
+
+ /* Need to consider the case of bytes_allowed overflow. */
+ bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
+ if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
+ bytes_trim = tg->bytes_disp[rw];
+ tg->bytes_disp[rw] = 0;
+ } else {
+ tg->bytes_disp[rw] -= bytes_trim;
+ }
+
+ return bytes_trim;
+}
+
+static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u32 iops_limit = tg_iops_limit(tg, rw);
+ int io_trim;
+
+ if (iops_limit == UINT_MAX)
+ return 0;
+
+ /* Need to consider the case of io_allowed overflow. */
+ io_trim = calculate_io_allowed(iops_limit, time_elapsed);
+ if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
+ io_trim = tg->io_disp[rw];
+ tg->io_disp[rw] = 0;
+ } else {
+ tg->io_disp[rw] -= io_trim;
+ }
+
+ return io_trim;
+}
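+
+/*
+ * throtl_trim_bps() and throtl_trim_iops() above clamp the trimmed amount to
+ * what was actually dispatched and treat a non-positive (possibly overflowed)
+ * allowance as "trim everything". A tiny standalone calculation of that
+ * clamping, with made-up numbers, is shown below (editorial sketch, not part
+ * of the patch):
+ *
+ *	#include <stdio.h>
+ *
+ *	// trim at most what was dispatched; non-positive allowance trims it all
+ *	static long long trim(long long *disp, long long allowed)
+ *	{
+ *		long long t = allowed;
+ *
+ *		if (t <= 0 || *disp < t) {
+ *			t = *disp;
+ *			*disp = 0;
+ *		} else {
+ *			*disp -= t;
+ *		}
+ *		return t;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *		long long disp;
+ *
+ *		disp = 4096;	// dispatched 4 KiB, only 1 KiB allowed: trim 1 KiB
+ *		printf("trimmed %lld, left %lld\n", trim(&disp, 1024), disp);
+ *
+ *		disp = 512;	// allowance exceeds what was dispatched: trim all
+ *		printf("trimmed %lld, left %lld\n", trim(&disp, 4096), disp);
+ *
+ *		disp = 512;	// overflowed (negative) allowance: also trim all
+ *		printf("trimmed %lld, left %lld\n", trim(&disp, -1), disp);
+ *		return 0;
+ *	}
+ */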
+
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
@@ -616,26 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* one extra slice is preserved for deviation.
*/
time_elapsed -= tg->td->throtl_slice;
- bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
- time_elapsed) +
- tg->carryover_bytes[rw];
- io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
- tg->carryover_ios[rw];
- if (bytes_trim <= 0 && io_trim <= 0)
+ bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
+ io_trim = throtl_trim_iops(tg, rw, time_elapsed);
+ if (!bytes_trim && !io_trim)
return;
- tg->carryover_bytes[rw] = 0;
- if ((long long)tg->bytes_disp[rw] >= bytes_trim)
- tg->bytes_disp[rw] -= bytes_trim;
- else
- tg->bytes_disp[rw] = 0;
-
- tg->carryover_ios[rw] = 0;
- if ((int)tg->io_disp[rw] >= io_trim)
- tg->io_disp[rw] -= io_trim;
- else
- tg->io_disp[rw] = 0;
-
tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
@@ -645,39 +707,60 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
jiffies);
}
-static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
+static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
+ long long *bytes, int *ios)
{
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
+ long long bytes_allowed;
+ int io_allowed;
+
+ /*
+ * If the queue is empty, carryover handling is not needed. In such cases,
+ * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
+ * of subsequent bios. The same handling applies when the previous BPS/IOPS
+ * limit was set to max.
+ */
+ if (sq_queued(&tg->service_queue, rw) == 0) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ return;
+ }
/*
* If config is updated while bios are still throttled, calculate and
- * accumulate how many bytes/ios are waited across changes. And
- * carryover_bytes/ios will be used to calculate new wait time under new
- * configuration.
+	 * accumulate how many bytes/ios have been waited for across the changes,
+	 * and use the calculated carryover (@bytes/@ios) to update [bytes/io]_disp,
+	 * which will be used to calculate the new wait time under the new
+	 * configuration. The bytes/io_allowed overflow case also needs to be
+	 * considered.
*/
- if (bps_limit != U64_MAX)
- tg->carryover_bytes[rw] +=
- calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
- tg->bytes_disp[rw];
- if (iops_limit != UINT_MAX)
- tg->carryover_ios[rw] +=
- calculate_io_allowed(iops_limit, jiffy_elapsed) -
- tg->io_disp[rw];
+ if (bps_limit != U64_MAX) {
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
+ if (bytes_allowed > 0)
+ *bytes = bytes_allowed - tg->bytes_disp[rw];
+ }
+ if (iops_limit != UINT_MAX) {
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
+ if (io_allowed > 0)
+ *ios = io_allowed - tg->io_disp[rw];
+ }
+
+ tg->bytes_disp[rw] = -*bytes;
+ tg->io_disp[rw] = -*ios;
}
static void tg_update_carryover(struct throtl_grp *tg)
{
- if (tg->service_queue.nr_queued[READ])
- __tg_update_carryover(tg, READ);
- if (tg->service_queue.nr_queued[WRITE])
- __tg_update_carryover(tg, WRITE);
+ long long bytes[2] = {0};
+ int ios[2] = {0};
+
+ __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
+ __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
- /* see comments in struct throtl_grp for meaning of these fields. */
+ /* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
- tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
- tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
+ bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
@@ -687,16 +770,11 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
- if (iops_limit == UINT_MAX) {
- return 0;
- }
-
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
- io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
- tg->carryover_ios[rw];
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
return 0;
@@ -717,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- /* no need to throttle if this bio's bytes have been accounted */
- if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- return 0;
- }
-
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -729,9 +802,10 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
- tg->carryover_bytes[rw];
- if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
+	/* bytes_allowed may overflow to a negative value; allow dispatch then. */
+ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ || bytes_allowed < 0)
return 0;
/* Calc approx time to dispatch */
@@ -749,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
return jiffy_wait;
}
+static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ unsigned int bio_size = throtl_bio_data_size(bio);
+
+ /* Charge the bio to the group */
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
+ bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->bytes_disp[bio_data_dir(bio)] += bio_size;
+ }
+}
+
+static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->io_disp[bio_data_dir(bio)]++;
+}
+
/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ * If the previous slice expired, start a new one, otherwise renew/extend the
+ * existing slice to make sure it is at least throtl_slice interval long since
+ * now. A new slice is started only for an empty throttle group. If there is a
+ * queued bio, that means there should be an active slice and it should be
+ * extended instead.
*/
-static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static void tg_update_slice(struct throtl_grp *tg, bool rw)
+{
+ if (throtl_slice_used(tg, rw) &&
+ sq_queued(&tg->service_queue, rw) == 0)
+ throtl_start_new_slice(tg, rw, true);
+ else
+ throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
+}
+
+static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
- unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
+ unsigned long bps_wait;
+
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
+ bio_flagged(bio, BIO_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_TG_BPS_THROTTLED))
+ return 0;
+
+ tg_update_slice(tg, rw);
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ throtl_extend_slice(tg, rw, jiffies + bps_wait);
+
+ return bps_wait;
+}
+
+static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
u32 iops_limit = tg_iops_limit(tg, rw);
+ unsigned long iops_wait;
+
+ if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
+ return 0;
+
+ tg_update_slice(tg, rw);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ throtl_extend_slice(tg, rw, jiffies + iops_wait);
+
+ return iops_wait;
+}
+
+/*
+ * Returns the approximate number of jiffies to wait before this bio is within
+ * the IO rate and can be moved to the other queue or dispatched.
+ */
+static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+ unsigned long wait;
/*
* Currently whole state machine of group depends on first bio
@@ -767,65 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->service_queue.nr_queued[rw] &&
+ BUG_ON(sq_queued(&tg->service_queue, rw) &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
- /* If tg->bps = -1, then BW is unlimited */
- if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
- tg->flags & THROTL_TG_CANCELING) {
- if (wait)
- *wait = 0;
- return true;
- }
+ wait = tg_dispatch_bps_time(tg, bio);
+ if (wait != 0)
+ return wait;
/*
- * If previous slice expired, start a new one otherwise renew/extend
- * existing slice to make sure it is at least throtl_slice interval
- * long since now. New slice is started only for empty throttle group.
- * If there is queued bio, that means there should be an active
- * slice and it should be extended instead.
+ * Charge bps here because @bio will be directly placed into the
+ * iops queue afterward.
*/
- if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
- throtl_start_new_slice(tg, rw, true);
- else {
- if (time_before(tg->slice_end[rw],
- jiffies + tg->td->throtl_slice))
- throtl_extend_slice(tg, rw,
- jiffies + tg->td->throtl_slice);
- }
-
- bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
- iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
- if (bps_wait + iops_wait == 0) {
- if (wait)
- *wait = 0;
- return true;
- }
-
- max_wait = max(bps_wait, iops_wait);
-
- if (wait)
- *wait = max_wait;
-
- if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(tg, rw, jiffies + max_wait);
+ throtl_charge_bps_bio(tg, bio);
- return false;
-}
-
-static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
-{
- bool rw = bio_data_dir(bio);
- unsigned int bio_size = throtl_bio_data_size(bio);
-
- /* Charge the bio to the group */
- if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
- tg->bytes_disp[rw] += bio_size;
- tg->last_bytes_disp[rw] += bio_size;
- }
-
- tg->io_disp[rw]++;
- tg->last_io_disp[rw]++;
+ return tg_dispatch_iops_time(tg, bio);
}
/**
@@ -852,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
- if (!sq->nr_queued[rw])
+ if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
- throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+ throtl_qnode_add_bio(bio, qn, sq);
+
+ /*
+	 * Since we have split the queues, when the iops queue was
+	 * previously empty and a new @bio is added into the first @qn,
+	 * we also need to update @tg->disptime.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ bio == throtl_peek_queued(&sq->queued[rw]))
+ tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
- sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
}
static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+ unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
struct bio *bio;
bio = throtl_peek_queued(&sq->queued[READ]);
if (bio)
- tg_may_dispatch(tg, bio, &read_wait);
+ read_wait = tg_dispatch_time(tg, bio);
bio = throtl_peek_queued(&sq->queued[WRITE]);
if (bio)
- tg_may_dispatch(tg, bio, &write_wait);
+ write_wait = tg_dispatch_time(tg, bio);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@@ -885,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
+ tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
@@ -911,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
- bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
- sq->nr_queued[rw]--;
+ bio = throtl_pop_queued(sq, &tg_to_put, rw);
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -929,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
- &parent_sq->queued[rw]);
+ parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
@@ -951,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
/* Try to dispatch 75% READS and 25% WRITES */
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, READ);
nr_reads++;
@@ -961,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
}
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
@@ -994,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
- if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
+ if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
@@ -1023,7 +1125,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
*/
static void throtl_pending_timer_fn(struct timer_list *t)
{
- struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
+ struct throtl_service_queue *sq = timer_container_of(sq, t,
+ pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
struct throtl_service_queue *parent_sq;
@@ -1047,9 +1150,11 @@ again:
dispatched = false;
while (true) {
+ unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
+ unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
+
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
- sq->nr_queued[READ] + sq->nr_queued[WRITE],
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
ret = throtl_select_dispatch(sq);
if (ret) {
@@ -1071,7 +1176,8 @@ again:
if (parent_sq) {
/* @parent_sq is another throl_grp, propagate dispatch */
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
@@ -1111,7 +1217,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(&q->queue_lock);
@@ -1616,18 +1722,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
- /* throtl is FIFO - if bios are already queued, should queue */
- if (tg->service_queue.nr_queued[rw])
- return false;
+ struct throtl_service_queue *sq = &tg->service_queue;
- return tg_may_dispatch(tg, bio, NULL);
-}
+ /*
+ * For a split bio, we need to specifically distinguish whether the
+ * iops queue is empty.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED))
+ return sq->nr_queued_iops[rw] == 0 &&
+ tg_dispatch_iops_time(tg, bio) == 0;
-static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
-{
- if (!bio_flagged(bio, BIO_BPS_THROTTLED))
- tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
- tg->carryover_ios[rw]--;
+ /*
+	 * Throtl is FIFO - if bios are already queued, this one must be queued too.
+ * If the bps queue is empty and @bio is within the bps limit, charge
+ * bps here for direct placement into the iops queue.
+ */
+ if (sq_queued(&tg->service_queue, rw)) {
+ if (sq->nr_queued_bps[rw] == 0 &&
+ tg_dispatch_bps_time(tg, bio) == 0)
+ throtl_charge_bps_bio(tg, bio);
+
+ return false;
+ }
+
+ return tg_dispatch_time(tg, bio) == 0;
}
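A hedged illustration of the split-bio case above: a large write that has already waited out its bps budget is dispatched with BIO_BPS_THROTTLED set; when the block layer later splits it, the resubmitted remainder still carries that flag, so it skips the bps accounting entirely and only has to pass the iops check, which is why only the state of the iops queue matters for it.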
bool __blk_throtl_bio(struct bio *bio)
@@ -1648,7 +1766,7 @@ bool __blk_throtl_bio(struct bio *bio)
while (true) {
if (tg_within_limit(tg, bio, rw)) {
/* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* We need to trim slice even when bios are not being
@@ -1666,10 +1784,13 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* IOs which may cause priority inversions are
* dispatched directly, even if they're over limit.
- * Debts are handled by carryover_bytes/ios while
- * calculating wait time.
+ *
+	 * Charge and dispatch directly. The throttle control
+	 * algorithm is adaptive, so the extra IO bytes will be
+	 * throttled later to pay back the debt.
*/
- tg_dispatch_in_debt(tg, bio, rw);
+ throtl_charge_bps_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
@@ -1695,7 +1816,7 @@ bool __blk_throtl_bio(struct bio *bio)
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ sq_queued(sq, READ), sq_queued(sq, WRITE));
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@@ -1703,11 +1824,13 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* Update @tg's dispatch time and force schedule dispatch if @tg
- * was empty before @bio. The forced scheduling isn't likely to
- * cause undue delay as @bio is likely to be dispatched directly if
- * its @tg's disptime is not in the future.
+	 * was empty before @bio, or the iops queue that @bio will be added
+	 * to was empty. The forced scheduling isn't likely to cause undue
+ * delay as @bio is likely to be dispatched directly if its @tg's
+ * disptime is not in the future.
*/
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
@@ -1726,7 +1849,7 @@ void blk_throtl_exit(struct gendisk *disk)
if (!blk_throtl_activated(q))
return;
- del_timer_sync(&q->td->service_queue.pending_timer);
+ timer_delete_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
kfree(q->td);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 1a36d1278eea..3b27755bfbff 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H
@@ -28,7 +29,8 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
+ struct bio_list bios_bps; /* queued bios for bps limit */
+ struct bio_list bios_iops; /* queued bios for iops limit */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
@@ -40,7 +42,8 @@ struct throtl_service_queue {
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
+ unsigned int nr_queued_bps[2]; /* number of queued bps bios */
+ unsigned int nr_queued_iops[2]; /* number of queued iops bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
@@ -53,9 +56,14 @@ struct throtl_service_queue {
};
enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
- THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ /*
+ * The sq's iops queue is empty, and a bio is about to be enqueued
+ * to the first qnode's bios_iops list.
+ */
+ THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};
struct throtl_grp {
@@ -101,22 +109,16 @@ struct throtl_grp {
/* IOPS limits */
unsigned int iops[2];
- /* Number of bytes dispatched in current slice */
- uint64_t bytes_disp[2];
- /* Number of bio's dispatched in current slice */
- unsigned int io_disp[2];
-
- uint64_t last_bytes_disp[2];
- unsigned int last_io_disp[2];
-
/*
- * The following two fields are updated when new configuration is
- * submitted while some bios are still throttled, they record how many
- * bytes/ios are waited already in previous configuration, and they will
- * be used to calculate wait time under new configuration.
+	 * Number of bytes/bios dispatched in the current slice.
+	 * When a new configuration is submitted while some bios are still
+	 * throttled, first calculate the carryover: the amount of bytes/IOs
+	 * already waited under the previous configuration. Then [bytes/io]_disp
+	 * is set to the negative of that carryover and used to calculate the
+	 * wait time under the new configuration.
*/
- long long carryover_bytes[2];
- int carryover_ios[2];
+ int64_t bytes_disp[2];
+ int io_disp[2];
unsigned long last_check_time;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6dfc659d22e2..a50d4cd55f41 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -37,7 +37,7 @@
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
WBT_READ = 2, /* read */
- WBT_SWAP = 4, /* write, from swap_writepage() */
+ WBT_SWAP = 4, /* write, from swap_writeout() */
WBT_DISCARD = 8, /* discard */
WBT_NR_BITS = 4, /* number of bits */
@@ -136,8 +136,9 @@ enum {
RWB_MIN_WRITE_SAMPLES = 3,
/*
- * If we have this number of consecutive windows with not enough
- * information to scale up or down, scale up.
+ * If we have this number of consecutive windows without enough
+ * information to scale up or down, slowly return to center state
+ * (step == 0).
*/
RWB_UNKNOWN_BUMP = 5,
};
@@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
break;
case LAT_UNKNOWN_WRITES:
/*
- * We started a the center step, but don't have a valid
- * read/write sample, but we do have writes going on.
- * Allow step to go negative, to increase write perf.
+ * We don't have a valid read/write sample, but we do have
+ * writes going on. Allow step to go negative, to increase
+ * write performance.
*/
scale_up(rwb);
break;
@@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
__wbt_done(rqos, flags);
}
-/*
- * May sleep, if we have exceeded the writeback limits. Caller can pass
- * in an irq held spinlock, if it holds one when calling this function.
- * If we do sleep, we'll release and re-grab it.
- */
+/* May sleep, if we have exceeded the writeback limits. */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -707,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk)
struct rq_qos *rqos;
bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
- if (q->elevator &&
- test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
+ mutex_lock(&disk->rqos_state_mutex);
+
+ if (blk_queue_disable_wbt(q))
enable = false;
/* Throttling already enabled? */
@@ -716,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk)
if (rqos) {
if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
+ mutex_unlock(&disk->rqos_state_mutex);
return;
}
+ mutex_unlock(&disk->rqos_state_mutex);
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
@@ -777,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk)
struct rq_wb *rwb;
if (!rqos)
return;
+ mutex_lock(&disk->rqos_state_mutex);
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
+ mutex_unlock(&disk->rqos_state_mutex);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 0c77244a35c9..9104c3a34642 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -343,6 +343,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
@@ -364,8 +365,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
- if (cmd == BLKRESETZONE)
+ if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
+ }
return ret;
}
@@ -1113,25 +1116,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
struct block_device *bdev = bio->bi_bdev;
- if (!bdev->bd_disk->zone_wplugs_hash)
- return false;
-
- /*
- * If the BIO already has the plugging flag set, then it was already
- * handled through this path and this is a submission from the zone
- * plug bio submit work.
- */
- if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
- return false;
-
- /*
- * We do not need to do anything special for empty flush BIOs, e.g
- * BIOs such as issued by blkdev_issue_flush(). The is because it is
- * the responsibility of the user to first wait for the completion of
- * write operations for flush to have any effect on the persistence of
- * the written data.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
return false;
/*
@@ -1222,6 +1207,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
}
/*
@@ -1280,14 +1266,14 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
+ bool prepared;
/*
* Submit the next plugged BIO. If we do not have any, clear
* the plugged flag.
*/
- spin_lock_irqsave(&zwplug->lock, flags);
-
again:
+ spin_lock_irqsave(&zwplug->lock, flags);
bio = bio_list_pop(&zwplug->bio_list);
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
@@ -1295,15 +1281,15 @@ again:
goto put_zwplug;
}
- if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ if (!prepared) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
}
- spin_unlock_irqrestore(&zwplug->lock, flags);
-
bdev = bio->bi_bdev;
- submit_bio_noacct_nocheck(bio);
/*
* blk-mq devices will reuse the extra reference on the request queue
@@ -1311,8 +1297,12 @@ again:
* path for BIO-based devices will not do that. So drop this extra
* reference here.
*/
- if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
+ if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
+ bdev->bd_disk->fops->submit_bio(bio);
blk_queue_exit(bdev->bd_disk->queue);
+ } else {
+ blk_mq_submit_bio(bio);
+ }
put_zwplug:
/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
diff --git a/block/blk.h b/block/blk.h
index 9cf9a0099416..5d9ca8c95193 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,7 @@
#include "blk-crypto-internal.h"
struct elevator_type;
+struct elevator_tags;
#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
#define BLK_MIN_SEGMENT_SIZE 4096
@@ -19,6 +20,7 @@ struct elevator_type;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
+extern const struct kobj_type blk_queue_ktype;
extern struct dentry *blk_debugfs_root;
struct blk_flush_queue {
@@ -94,14 +96,16 @@ static inline void blk_wait_io(struct completion *done)
wait_for_completion_io(done);
}
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
+void blkdev_put_no_open(struct block_device *bdev);
+
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page);
+ struct page *page, unsigned len, unsigned offset);
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
@@ -319,11 +323,10 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
-void elevator_disable(struct request_queue *q);
-void elevator_exit(struct request_queue *q);
-int elv_register_queue(struct request_queue *q, bool uevent);
-void elv_unregister_queue(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *t);
+void elevator_set_default(struct request_queue *q);
+void elevator_set_none(struct request_queue *q);
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
@@ -404,6 +407,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
}
}
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @paddr: address of the range to add
+ * @len: maximum length available to add at @paddr
+ *
+ * Returns the maximum number of bytes of the range starting at @paddr that can
+ * be added to a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
+ phys_addr_t paddr, unsigned int len)
+{
+ /*
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
+ */
+ return min_t(unsigned long, len,
+ min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
+ (unsigned long)lim->max_segment_size - 1) + 1);
+}
+
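A hedged worked example of the helper above (illustrative limits): with seg_boundary_mask = 0xffff (a 64 KiB boundary), max_segment_size = 0x10000 and paddr = 0x1f000, the boundary term is 0xffff - 0xf000 = 0x0fff and the size term is 0xffff, so min(...) + 1 = 0x1000: at most 4 KiB can be added before crossing the boundary at 0x20000. Taking the minimum first and only then adding 1 is what avoids the wrap that "boundary term + 1" would produce when seg_boundary_mask == ULONG_MAX and the offset within the boundary is 0.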
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -418,7 +442,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
int blk_dev_init(void);
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
-unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
@@ -440,23 +463,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
-struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
-
-static inline bool blk_queue_may_bounce(struct request_queue *q)
-{
- return IS_ENABLED(CONFIG_BOUNCE) &&
- (q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
- max_low_pfn >= max_pfn;
-}
-
-static inline struct bio *blk_queue_bounce(struct bio *bio,
- struct request_queue *q)
-{
- if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
- return __blk_queue_bounce(bio, q);
- return bio;
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
@@ -477,7 +483,8 @@ static inline void blk_zone_update_request_bio(struct request *rq,
* the original BIO sector so that blk_zone_write_plug_bio_endio() can
* lookup the zone write plug.
*/
- if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ if (req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
@@ -715,7 +722,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
void blk_integrity_generate(struct bio *bio);
-void blk_integrity_verify(struct bio *bio);
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
diff --git a/block/bounce.c b/block/bounce.c
deleted file mode 100644
index 0d898cd5ec49..000000000000
--- a/block/bounce.c
+++ /dev/null
@@ -1,269 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* bounce buffer handling for block devices
- *
- * - Split from highmem.c
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/mm.h>
-#include <linux/export.h>
-#include <linux/swap.h>
-#include <linux/gfp.h>
-#include <linux/bio-integrity.h>
-#include <linux/pagemap.h>
-#include <linux/mempool.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/init.h>
-#include <linux/hash.h>
-#include <linux/highmem.h>
-#include <linux/printk.h>
-#include <asm/tlbflush.h>
-
-#include <trace/events/block.h>
-#include "blk.h"
-#include "blk-cgroup.h"
-
-#define POOL_SIZE 64
-#define ISA_POOL_SIZE 16
-
-static struct bio_set bounce_bio_set, bounce_bio_split;
-static mempool_t page_pool;
-
-static void init_bounce_bioset(void)
-{
- static bool bounce_bs_setup;
- int ret;
-
- if (bounce_bs_setup)
- return;
-
- ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- BUG_ON(ret);
- if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
- BUG_ON(1);
-
- ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
- BUG_ON(ret);
- bounce_bs_setup = true;
-}
-
-static __init int init_emergency_pool(void)
-{
- int ret;
-
-#ifndef CONFIG_MEMORY_HOTPLUG
- if (max_pfn <= max_low_pfn)
- return 0;
-#endif
-
- ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
- BUG_ON(ret);
- pr_info("pool size: %d pages\n", POOL_SIZE);
-
- init_bounce_bioset();
- return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
- struct bio_vec tovec, fromvec;
- struct bvec_iter iter;
- /*
- * The bio of @from is created by bounce, so we can iterate
- * its bvec from start to end, but the @from->bi_iter can't be
- * trusted because it might be changed by splitting.
- */
- struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
-
- bio_for_each_segment(tovec, to, iter) {
- fromvec = bio_iter_iovec(from, from_iter);
- if (tovec.bv_page != fromvec.bv_page) {
- /*
- * fromvec->bv_offset and fromvec->bv_len might have
- * been modified by the block layer, so use the original
- * copy, bounce_copy_vec already uses tovec->bv_len
- */
- memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
- tovec.bv_offset);
- }
- bio_advance_iter(from, &from_iter, tovec.bv_len);
- }
-}
-
-static void bounce_end_io(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, orig_vec;
- struct bvec_iter orig_iter = bio_orig->bi_iter;
- struct bvec_iter_all iter_all;
-
- /*
- * free up bounce indirect pages used
- */
- bio_for_each_segment_all(bvec, bio, iter_all) {
- orig_vec = bio_iter_iovec(bio_orig, orig_iter);
- if (bvec->bv_page != orig_vec.bv_page) {
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, &page_pool);
- }
- bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
- }
-
- bio_orig->bi_status = bio->bi_status;
- bio_endio(bio_orig);
- bio_put(bio);
-}
-
-static void bounce_end_io_write(struct bio *bio)
-{
- bounce_end_io(bio);
-}
-
-static void bounce_end_io_read(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
-
- if (!bio->bi_status)
- copy_to_high_bio_irq(bio_orig, bio);
-
- bounce_end_io(bio);
-}
-
-static struct bio *bounce_clone_bio(struct bio *bio_src)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_VECS biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_VECS.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work.
- */
- bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
- bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
- if (bio_flagged(bio_src, BIO_REMAPPED))
- bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- bio_clone_blkg_association(bio, bio_src);
-
- return bio;
-
-err_put:
- bio_put(bio);
- return NULL;
-}
-
-struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
-{
- struct bio *bio;
- int rw = bio_data_dir(bio_orig);
- struct bio_vec *to, from;
- struct bvec_iter iter;
- unsigned i = 0, bytes = 0;
- bool bounce = false;
- int sectors;
-
- bio_for_each_segment(from, bio_orig, iter) {
- if (i++ < BIO_MAX_VECS)
- bytes += from.bv_len;
- if (PageHighMem(from.bv_page))
- bounce = true;
- }
- if (!bounce)
- return bio_orig;
-
- /*
- * Individual bvecs might not be logical block aligned. Round down
- * the split size so that each bio is properly block size aligned,
- * even if we do not use the full hardware limits.
- */
- sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
- SECTOR_SHIFT;
- if (sectors < bio_sectors(bio_orig)) {
- bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, bio_orig);
- submit_bio_noacct(bio_orig);
- bio_orig = bio;
- }
- bio = bounce_clone_bio(bio_orig);
-
- /*
- * Bvec table can't be updated by bio_for_each_segment_all(),
- * so retrieve bvec from the table directly. This way is safe
- * because the 'bio' is single-page bvec.
- */
- for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
- struct page *bounce_page;
-
- if (!PageHighMem(to->bv_page))
- continue;
-
- bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
- inc_zone_page_state(bounce_page, NR_BOUNCE);
-
- if (rw == WRITE) {
- flush_dcache_page(to->bv_page);
- memcpy_from_bvec(page_address(bounce_page), to);
- }
- to->bv_page = bounce_page;
- }
-
- trace_block_bio_bounce(bio_orig);
-
- bio->bi_flags |= (1 << BIO_BOUNCED);
-
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- else
- bio->bi_end_io = bounce_end_io_write;
-
- bio->bi_private = bio_orig;
- return bio;
-}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 93523d8f8195..9ceb5d0832f5 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
- buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+ buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list);
buf->payload_len = blk_rq_bytes(req);
return 0;
}
diff --git a/block/elevator.c b/block/elevator.c
index cd2ce4921601..fe96c6f4753c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -45,6 +45,19 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
+/* Context data held while changing the elevator */
+struct elv_change_ctx {
+ const char *name;
+ bool no_uevent;
+
+ /* for unregistering old elevator */
+ struct elevator_queue *old;
+ /* for registering new elevator */
+ struct elevator_queue *new;
+ /* holds sched tags data */
+ struct elevator_tags *et;
+};
+
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -121,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
static const struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
- struct elevator_type *e)
+ struct elevator_type *e, struct elevator_tags *et)
{
struct elevator_queue *eq;
@@ -134,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
hash_init(eq->hash);
+ eq->et = et;
return eq;
}
-EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj)
{
@@ -148,18 +161,17 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void elevator_exit(struct request_queue *q)
+static void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ lockdep_assert_held(&q->elevator_lock);
+
ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
-
- kobject_put(&e->kobj);
}
static inline void __elv_rqhash_del(struct request *rq)
@@ -412,14 +424,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->show)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->show(e, page) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->show(e, page);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -430,14 +443,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->store)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->store(e, page, length) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->store(e, page, length);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -452,13 +466,12 @@ static const struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q, bool uevent)
+static int elv_register_queue(struct request_queue *q,
+ struct elevator_queue *e,
+ bool uevent)
{
- struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
-
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
const struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -472,20 +485,25 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
+ /*
+		 * The scheduler is initialized and ready to be exported
+		 * via debugfs
+ */
+ blk_mq_sched_reg_debugfs(q);
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
-void elv_unregister_queue(struct request_queue *q)
+static void elv_unregister_queue(struct request_queue *q,
+ struct elevator_queue *e)
{
- struct elevator_queue *e = q->elevator;
-
- lockdep_assert_held(&q->sysfs_lock);
-
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+
+ /* unexport via debugfs before exiting sched */
+ blk_mq_sched_unreg_debugfs(q);
}
}
@@ -548,201 +566,282 @@ void elv_unregister(struct elevator_type *e)
EXPORT_SYMBOL_GPL(elv_unregister);
/*
- * For single queue devices, default to using mq-deadline. If we have multiple
- * queues or mq-deadline is not available, default to "none".
- */
-static struct elevator_type *elevator_get_default(struct request_queue *q)
-{
- if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
- return NULL;
-
- if (q->nr_hw_queues != 1 &&
- !blk_mq_is_shared_tags(q->tag_set->flags))
- return NULL;
-
- return elevator_find_get("mq-deadline");
-}
-
-/*
- * Use the default elevator settings. If the chosen elevator initialization
- * fails, fall back to the "none" elevator (no elevator).
- */
-void elevator_init_mq(struct request_queue *q)
-{
- struct elevator_type *e;
- unsigned int memflags;
- int err;
-
- WARN_ON_ONCE(blk_queue_registered(q));
-
- if (unlikely(q->elevator))
- return;
-
- e = elevator_get_default(q);
- if (!e)
- return;
-
- /*
- * We are called before adding disk, when there isn't any FS I/O,
- * so freezing queue plus canceling dispatch work is enough to
- * drain any dispatch activities originated from passthrough
- * requests, then no need to quiesce queue which may add long boot
- * latency, especially when lots of disks are involved.
- *
- * Disk isn't added yet, so verifying queue lock only manually.
- */
- memflags = blk_mq_freeze_queue(q);
-
- blk_mq_cancel_work_sync(q);
-
- err = blk_mq_init_sched(q, e);
-
- blk_mq_unfreeze_queue(q, memflags);
-
- if (err) {
- pr_warn("\"%s\" elevator initialization failed, "
- "falling back to \"none\"\n", e->elevator_name);
- }
-
- elevator_put(e);
-}
-
-/*
* Switch to new_e io scheduler.
*
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
+static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
{
- unsigned int memflags;
- int ret;
+ struct elevator_type *new_e = NULL;
+ int ret = 0;
- lockdep_assert_held(&q->sysfs_lock);
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
+ lockdep_assert_held(&q->elevator_lock);
+
+ if (strncmp(ctx->name, "none", 4)) {
+ new_e = elevator_find_get(ctx->name);
+ if (!new_e)
+ return -EINVAL;
+ }
- memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
- elv_unregister_queue(q);
+ ctx->old = q->elevator;
elevator_exit(q);
}
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out_unfreeze;
-
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q);
- goto out_unfreeze;
+ if (new_e) {
+ ret = blk_mq_init_sched(q, new_e, ctx->et);
+ if (ret)
+ goto out_unfreeze;
+ ctx->new = q->elevator;
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
}
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ blk_add_trace_msg(q, "elv switch: %s", ctx->name);
out_unfreeze:
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q, memflags);
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
new_e->elevator_name);
}
+ if (new_e)
+ elevator_put(new_e);
return ret;
}
-void elevator_disable(struct request_queue *q)
+static void elv_exit_and_release(struct request_queue *q)
{
- unsigned int memflags;
-
- lockdep_assert_held(&q->sysfs_lock);
+ struct elevator_queue *e;
+ unsigned memflags;
memflags = blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
-
- elv_unregister_queue(q);
+ mutex_lock(&q->elevator_lock);
+ e = q->elevator;
elevator_exit(q);
- blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- blk_add_trace_msg(q, "elv switch: none");
-
- blk_mq_unquiesce_queue(q);
+ mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+ if (e) {
+ blk_mq_free_sched_tags(e->et, q->tag_set);
+ kobject_put(&e->kobj);
+ }
+}
+
+static int elevator_change_done(struct request_queue *q,
+ struct elv_change_ctx *ctx)
+{
+ int ret = 0;
+
+ if (ctx->old) {
+ bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
+ &ctx->old->flags);
+
+ elv_unregister_queue(q, ctx->old);
+ blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
+ kobject_put(&ctx->old->kobj);
+ if (enable_wbt)
+ wbt_enable_default(q->disk);
+ }
+ if (ctx->new) {
+ ret = elv_register_queue(q, ctx->new, !ctx->no_uevent);
+ if (ret)
+ elv_exit_and_release(q);
+ }
+ return ret;
}
/*
* Switch this queue to the given IO scheduler.
*/
-static int elevator_change(struct request_queue *q, const char *elevator_name)
+static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
- struct elevator_type *e;
- int ret;
+ unsigned int memflags;
+ struct blk_mq_tag_set *set = q->tag_set;
+ int ret = 0;
- /* Make sure queue is not in the middle of being removed */
- if (!blk_queue_registered(q))
- return -ENOENT;
+ lockdep_assert_held(&set->update_nr_hwq_lock);
+
+ if (strncmp(ctx->name, "none", 4)) {
+ ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+ if (!ctx->et)
+ return -ENOMEM;
+ }
+
+ memflags = blk_mq_freeze_queue(q);
+ /*
+	 * May be called before adding the disk, when there isn't any FS I/O,
+	 * so freezing the queue plus canceling dispatch work is enough to
+	 * drain any dispatch activities originating from passthrough
+	 * requests; there is then no need to quiesce the queue, which could
+	 * add long boot latency, especially when lots of disks are involved.
+	 *
+	 * The disk isn't added yet, so the queue lock is only verified manually.
+ */
+ blk_mq_cancel_work_sync(q);
+ mutex_lock(&q->elevator_lock);
+ if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
+ ret = elevator_switch(q, ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ if (!ret)
+ ret = elevator_change_done(q, ctx);
+ /*
+	 * Free the sched tags if they were allocated but we couldn't switch the elevator.
+ */
+ if (ctx->et && !ctx->new)
+ blk_mq_free_sched_tags(ctx->et, set);
+
+ return ret;
+}
- if (!strncmp(elevator_name, "none", 4)) {
- if (q->elevator)
- elevator_disable(q);
- return 0;
+/*
+ * The I/O scheduler depends on the number of hardware queues; this forces a
+ * reattachment when nr_hw_queues changes.
+ */
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *t)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct elv_change_ctx ctx = {};
+ int ret = -ENODEV;
+
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
+
+ if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = e->elevator_name;
+ ctx.et = t;
+
+ mutex_lock(&q->elevator_lock);
+		/* force reattaching the elevator after nr_hw_queues is updated */
+ ret = elevator_switch(q, &ctx);
+ mutex_unlock(&q->elevator_lock);
}
+ blk_mq_unfreeze_queue_nomemrestore(q);
+ if (!ret)
+ WARN_ON_ONCE(elevator_change_done(q, &ctx));
+ /*
+	 * Free the sched tags if they were allocated but we couldn't switch the elevator.
+ */
+ if (t && !ctx.new)
+ blk_mq_free_sched_tags(t, set);
+}
- if (q->elevator && elevator_match(q->elevator->type, elevator_name))
- return 0;
+/*
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
+ */
+void elevator_set_default(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {
+ .name = "mq-deadline",
+ .no_uevent = true,
+ };
+ int err;
+ struct elevator_type *e;
+
+	/* now we allow switching the elevator */
+ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
+
+ if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return;
- e = elevator_find_get(elevator_name);
+ /*
+ * For single queue devices, default to using mq-deadline. If we
+ * have multiple queues or mq-deadline is not available, default
+ * to "none".
+ */
+ e = elevator_find_get(ctx.name);
if (!e)
- return -EINVAL;
- ret = elevator_switch(q, e);
+ return;
+
+ if ((q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags))) {
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+			pr_warn("\"%s\" elevator initialization failed %d, falling back to \"none\"\n",
+ ctx.name, err);
+ }
elevator_put(e);
- return ret;
}
-void elv_iosched_load_module(struct gendisk *disk, const char *buf,
- size_t count)
+void elevator_set_none(struct request_queue *q)
{
- char elevator_name[ELV_NAME_MAX];
- struct elevator_type *found;
- const char *name;
+ struct elv_change_ctx ctx = {
+ .name = "none",
+ };
+ int err;
- strscpy(elevator_name, buf, sizeof(elevator_name));
- name = strstrip(elevator_name);
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("%s: set none elevator failed %d\n", __func__, err);
+}
+
+static void elv_iosched_load_module(const char *elevator_name)
+{
+ struct elevator_type *found;
spin_lock(&elv_list_lock);
- found = __elevator_find(name);
+ found = __elevator_find(elevator_name);
spin_unlock(&elv_list_lock);
if (!found)
- request_module("%s-iosched", name);
+ request_module("%s-iosched", elevator_name);
}
ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
+ struct elv_change_ctx ctx = {};
int ret;
+ struct request_queue *q = disk->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ /* Make sure queue is not in the middle of being removed */
+ if (!blk_queue_registered(q))
+ return -ENOENT;
+ /*
+ * If the attribute needs to load a module, do it before freezing the
+ * queue to ensure that the module file can be read when the request
+ * queue is the one for the device storing the module file.
+ */
strscpy(elevator_name, buf, sizeof(elevator_name));
- ret = elevator_change(disk->queue, strstrip(elevator_name));
- if (!ret)
- return count;
+ ctx.name = strstrip(elevator_name);
+
+ elv_iosched_load_module(ctx.name);
+
+ down_read(&set->update_nr_hwq_lock);
+ if (!blk_queue_no_elv_switch(q)) {
+ ret = elevator_change(q, &ctx);
+ if (!ret)
+ ret = count;
+ } else {
+ ret = -ENOENT;
+ }
+ up_read(&set->update_nr_hwq_lock);
return ret;
}
ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
struct request_queue *q = disk->queue;
- struct elevator_queue *eq = q->elevator;
struct elevator_type *cur = NULL, *e;
int len = 0;
+ mutex_lock(&q->elevator_lock);
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
len += sprintf(name+len, "none ");
- cur = eq->type;
+ cur = q->elevator->type;
}
spin_lock(&elv_list_lock);
@@ -755,6 +854,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
spin_unlock(&elv_list_lock);
len += sprintf(name+len, "\n");
+ mutex_unlock(&q->elevator_lock);
+
return len;
}
diff --git a/block/elevator.h b/block/elevator.h
index e526662c5dbb..adc5c157e17e 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -23,8 +23,17 @@ enum elv_merge {
struct blk_mq_alloc_data;
struct blk_mq_hw_ctx;
+struct elevator_tags {
+ /* num. of hardware queues for which tags are allocated */
+ unsigned int nr_hw_queues;
+ /* depth used while allocating tags */
+ unsigned int nr_requests;
+ /* shared tag is stored at index 0 */
+ struct blk_mq_tags *tags[];
+};
+
struct elevator_mq_ops {
- int (*init_sched)(struct request_queue *, struct elevator_type *);
+ int (*init_sched)(struct request_queue *, struct elevator_queue *);
void (*exit_sched)(struct elevator_queue *);
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
@@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
struct elevator_queue
{
struct elevator_type *type;
+ struct elevator_tags *et;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
@@ -121,7 +131,8 @@ struct elevator_queue
};
#define ELEVATOR_FLAG_REGISTERED 0
-#define ELEVATOR_FLAG_DISABLE_WBT 1
+#define ELEVATOR_FLAG_DYING 1
+#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
/*
* block elevator interface
@@ -148,13 +159,11 @@ extern void elv_unregister(struct elevator_type *);
* io scheduler sysfs switching
*/
ssize_t elv_iosched_show(struct gendisk *disk, char *page);
-void elv_iosched_load_module(struct gendisk *disk, const char *page,
- size_t count);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
-extern struct elevator_queue *elevator_alloc(struct request_queue *,
- struct elevator_type *);
+struct elevator_queue *elevator_alloc(struct request_queue *,
+ struct elevator_type *, struct elevator_tags *);
/*
* Helper functions.
@@ -184,4 +193,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
+void blk_mq_sched_reg_debugfs(struct request_queue *q);
+void blk_mq_sched_unreg_debugfs(struct request_queue *q);
+
#endif /* _ELEVATOR_H */
diff --git a/block/fops.c b/block/fops.c
index be9f1dbea9ce..d62fbefb2e67 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
@@ -54,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
@@ -130,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio)
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
- if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
if (atomic_dec_and_test(&dio->ref)) {
@@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -231,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
bio->bi_opf |= REQ_NOWAIT;
}
- if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
ret = bio_integrity_map_iter(bio, iocb->private);
if (unlikely(ret))
goto fail;
@@ -299,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
- if (iocb->ki_flags & IOCB_HAS_METADATA)
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
iocb->ki_complete(iocb, ret);
@@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -398,8 +401,29 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
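A hedged userspace sketch of how the hint used above is typically set (the fd, the device's stream count and glibc exposing F_SET_RW_HINT are assumptions): on a device reporting, say, bdev_max_write_streams() == 4, a file whose write hint is WRITE_LIFE_MEDIUM (value 3) gets its block-device direct writes tagged with write stream 3, while WRITE_LIFE_EXTREME (5) exceeds the limit and leaves the stream at 0.

	/* hypothetical userspace sketch; assumes a glibc new enough to expose F_SET_RW_HINT */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>

	static int set_medium_write_hint(int fd)
	{
		uint64_t hint = RWH_WRITE_LIFE_MEDIUM;	/* numeric value 3 */

		return fcntl(fd, F_SET_RW_HINT, &hint);
	}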
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (likely(nr_pages <= BIO_MAX_VECS &&
+ !(iocb->ki_flags & IOCB_HAS_METADATA))) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
@@ -451,12 +475,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct folio *folio = NULL;
struct blk_plug plug;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, block_write_full_folio,
- blkdev_get_block);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = block_write_full_folio(folio, wbc, blkdev_get_block);
blk_finish_plug(&plug);
return err;
@@ -642,12 +667,14 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- bdev = blkdev_get_no_open(inode->i_rdev);
+ bdev = blkdev_get_no_open(inode->i_rdev, true);
if (!bdev)
return -ENXIO;
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (blk_get_integrity(bdev->bd_disk))
+ filp->f_mode |= FMODE_HAS_METADATA;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
if (ret)
@@ -746,7 +773,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -757,6 +791,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -793,7 +828,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -836,6 +877,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
/*
@@ -868,6 +910,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
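
A minimal sketch of the write-stream selection rule added to blkdev_direct_IO() above: an explicitly requested stream is validated against bdev_max_write_streams(), otherwise the inode's write hint is reused as the stream. The helper name and plain integer types below are illustrative only, not part of the patch:

#include <errno.h>

/*
 * Sketch only: mirrors the rule in the blkdev_direct_IO() hunk above.
 * Returns the stream to use (0 means "none") or -EINVAL when the caller
 * requested a stream the device does not have.
 */
static int pick_write_stream(unsigned int requested, unsigned int max_streams,
			     unsigned int write_hint)
{
	if (requested)
		return requested <= max_streams ? (int)requested : -EINVAL;
	/* Reuse the inode write hint as the stream, as the comment explains. */
	if (max_streams && write_hint <= max_streams)
		return (int)write_hint;
	return 0;
}

With max_streams == 0 every write ends up on stream 0, which matches the unchanged behaviour for devices without write streams.
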
diff --git a/block/genhd.c b/block/genhd.c
index e9375e20d866..9bbc38d12792 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -125,37 +125,50 @@ static void part_stat_read_all(struct block_device *part,
}
}
-unsigned int part_in_flight(struct block_device *part)
+static void bdev_count_inflight_rw(struct block_device *part,
+ unsigned int inflight[2], bool mq_driver)
{
- unsigned int inflight = 0;
+ int write = 0;
+ int read = 0;
int cpu;
+ if (mq_driver) {
+ blk_mq_in_driver_rw(part, inflight);
+ return;
+ }
+
for_each_possible_cpu(cpu) {
- inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
- part_stat_local_read_cpu(part, in_flight[1], cpu);
+ read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
+ write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
}
- if ((int)inflight < 0)
- inflight = 0;
- return inflight;
+ /*
+ * While iterating all CPUs, some IOs may be issued from a CPU already
+ * traversed and complete on a CPU that has not yet been traversed,
+ * causing the inflight number to be negative.
+ */
+ inflight[READ] = read > 0 ? read : 0;
+ inflight[WRITE] = write > 0 ? write : 0;
}
-static void part_in_flight_rw(struct block_device *part,
- unsigned int inflight[2])
+/**
+ * bdev_count_inflight - get the number of inflight IOs for a block device.
+ *
+ * @part: the block device.
+ *
+ * Inflight here means IO accounting has been started: from bdev_start_io_acct()
+ * for a bio-based block device, and from blk_account_io_start() for an rq-based
+ * block device.
+ */
+unsigned int bdev_count_inflight(struct block_device *part)
{
- int cpu;
+ unsigned int inflight[2] = {0};
- inflight[0] = 0;
- inflight[1] = 0;
- for_each_possible_cpu(cpu) {
- inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
- inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
- }
- if ((int)inflight[0] < 0)
- inflight[0] = 0;
- if ((int)inflight[1] < 0)
- inflight[1] = 0;
+ bdev_count_inflight_rw(part, inflight, false);
+
+ return inflight[READ] + inflight[WRITE];
}
+EXPORT_SYMBOL_GPL(bdev_count_inflight);
/*
* Can be deleted altogether. Later.
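
Because an IO can be started on a CPU that was already visited and completed on one not yet visited, the per-CPU sums above may transiently come out negative; bdev_count_inflight_rw() therefore clamps each direction at zero. A self-contained sketch of that clamp, with hypothetical names and a plain array standing in for the per-CPU part_stat counters:

#include <stdio.h>

/* Sketch only: sum signed per-CPU deltas (starts minus completions). */
static unsigned int count_inflight(const int *per_cpu, int nr_cpu)
{
	int total = 0;

	for (int cpu = 0; cpu < nr_cpu; cpu++)
		total += per_cpu[cpu];

	return total > 0 ? total : 0;	/* racy snapshot: clamp at zero */
}

int main(void)
{
	int reads[4] = { 2, -1, -2, 0 };	/* transiently sums to -1 */

	printf("inflight reads: %u\n", count_inflight(reads, 4));
	return 0;
}
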
@@ -389,19 +402,35 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
return ret;
}
-/**
- * add_disk_fwnode - add disk information to kernel list with fwnode
- * @parent: parent device for the disk
- * @disk: per-device partitioning information
- * @groups: Additional per-device sysfs groups
- * @fwnode: attached disk fwnode
- *
- * This function registers the partitioning information in @disk
- * with the kernel. Also attach a fwnode to the disk device.
- */
-int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups,
- struct fwnode_handle *fwnode)
+static void add_disk_final(struct gendisk *disk)
+{
+ struct device *ddev = disk_to_dev(disk);
+
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
+		/* Make sure the first partition scan will proceed */
+ if (get_capacity(disk) && disk_has_partscan(disk))
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ bdev_add(disk->part0, ddev->devt);
+ if (get_capacity(disk))
+ disk_scan_partitions(disk, BLK_OPEN_READ);
+
+ /*
+ * Announce the disk and partitions after all partitions are
+ * created. (for hidden disks uevents remain suppressed forever)
+ */
+ dev_set_uevent_suppress(ddev, 0);
+ disk_uevent(disk, KOBJ_ADD);
+ }
+
+ blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
+ disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
+}
+
+static int __add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
{
struct device *ddev = disk_to_dev(disk);
@@ -416,12 +445,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
if (disk->fops->submit_bio || disk->fops->poll_bio)
return -EINVAL;
-
- /*
- * Initialize the I/O scheduler code and pick a default one if
- * needed.
- */
- elevator_init_mq(disk->queue);
} else {
if (!disk->fops->submit_bio)
return -EINVAL;
@@ -438,7 +461,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
ret = -EINVAL;
if (disk->major) {
if (WARN_ON(!disk->minors))
- goto out_exit_elevator;
+ goto out;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@@ -448,14 +471,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
- goto out_exit_elevator;
+ goto out;
} else {
if (WARN_ON(disk->minors))
- goto out_exit_elevator;
+ goto out;
ret = blk_alloc_ext_minor();
if (ret < 0)
- goto out_exit_elevator;
+ goto out;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
}
@@ -516,21 +539,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
&disk->bdi->dev->kobj, "bdi");
if (ret)
goto out_unregister_bdi;
-
- /* Make sure the first partition scan will be proceed */
- if (get_capacity(disk) && disk_has_partscan(disk))
- set_bit(GD_NEED_PART_SCAN, &disk->state);
-
- bdev_add(disk->part0, ddev->devt);
- if (get_capacity(disk))
- disk_scan_partitions(disk, BLK_OPEN_READ);
-
- /*
- * Announce the disk and partitions after all partitions are
- * created. (for hidden disks uevents remain suppressed forever)
- */
- dev_set_uevent_suppress(ddev, 0);
- disk_uevent(disk, KOBJ_ADD);
} else {
/*
* Even if the block_device for a hidden gendisk is not
@@ -539,10 +547,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
}
-
- blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
- disk_add_events(disk);
- set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -564,9 +568,46 @@ out_device_del:
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
-out_exit_elevator:
- if (disk->queue->elevator)
- elevator_exit(disk->queue);
+out:
+ return ret;
+}
+
+/**
+ * add_disk_fwnode - add disk information to kernel list with fwnode
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel. Also attach a fwnode to the disk device.
+ */
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+ int ret;
+
+ if (queue_is_mq(disk->queue)) {
+ set = disk->queue->tag_set;
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ ret = __add_disk(parent, disk, groups, fwnode);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
+ } else {
+ ret = __add_disk(parent, disk, groups, fwnode);
+ }
+
+ /*
+	 * add_disk_final() doesn't need to read `nr_hw_queues`, so move it out
+	 * of the read lock `set->update_nr_hwq_lock` to avoid an unnecessary
+	 * lock dependency on `disk->open_mutex` from partition scanning.
+ */
+ if (!ret)
+ add_disk_final(disk);
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
@@ -649,26 +690,7 @@ void blk_mark_disk_dead(struct gendisk *disk)
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
-/**
- * del_gendisk - remove the gendisk
- * @disk: the struct gendisk to remove
- *
- * Removes the gendisk and all its associated resources. This deletes the
- * partitions associated with the gendisk, and unregisters the associated
- * request_queue.
- *
- * This is the counter to the respective __device_add_disk() call.
- *
- * The final removal of the struct gendisk happens when its refcount reaches 0
- * with put_disk(), which should be called after del_gendisk(), if
- * __device_add_disk() was used.
- *
- * Drivers exist which depend on the release of the gendisk to be synchronous,
- * it should not be deferred.
- *
- * Context: can sleep
- */
-void del_gendisk(struct gendisk *disk)
+static void __del_gendisk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct block_device *part;
@@ -740,14 +762,7 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
- blk_mq_quiesce_queue(q);
- if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
- elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
- }
rq_qos_exit(q);
- blk_mq_unquiesce_queue(q);
/*
* If the disk does not own the queue, allow using passthrough requests
@@ -761,6 +776,55 @@ void del_gendisk(struct gendisk *disk)
if (start_drain)
blk_unfreeze_release_lock(q);
}
+
+static void disable_elv_switch(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ WARN_ON_ONCE(!queue_is_mq(q));
+
+ down_write(&set->update_nr_hwq_lock);
+ blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
+ up_write(&set->update_nr_hwq_lock);
+}
+
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counter to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ *
+ * Drivers exist which depend on the release of the gendisk to be synchronous,
+ * it should not be deferred.
+ *
+ * Context: can sleep
+ */
+void del_gendisk(struct gendisk *disk)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+
+ if (!queue_is_mq(disk->queue)) {
+ __del_gendisk(disk);
+ } else {
+ set = disk->queue->tag_set;
+
+ disable_elv_switch(disk->queue);
+
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ __del_gendisk(disk);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
+ }
+}
EXPORT_SYMBOL(del_gendisk);
/**
@@ -1002,7 +1066,7 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- inflight = part_in_flight(bdev);
+ inflight = bdev_count_inflight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@@ -1039,19 +1103,21 @@ ssize_t part_stat_show(struct device *dev,
(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
+/*
+ * Show the number of IOs issued to the driver.
+ * For a bio-based device this starts at bdev_start_io_acct();
+ * for an rq-based device it starts at blk_mq_start_request().
+ */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
- unsigned int inflight[2];
+ unsigned int inflight[2] = {0};
- if (queue_is_mq(q))
- blk_mq_in_flight_rw(q, bdev, inflight);
- else
- part_in_flight_rw(bdev, inflight);
+ bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
- return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
+ return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
}
static ssize_t disk_capability_show(struct device *dev,
@@ -1237,6 +1303,7 @@ static void disk_release(struct device *dev)
disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
+ kobject_put(&disk->queue_kobj);
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
@@ -1304,7 +1371,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- inflight = part_in_flight(hd);
+ inflight = bdev_count_inflight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
@@ -1419,6 +1486,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
+ mutex_init(&disk->rqos_state_mutex);
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
return disk;
out_erase_part0:
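
The add/del paths above share a pattern: for an mq queue, __add_disk() and __del_gendisk() run under a read lock on the tag set's update_nr_hwq_lock (with memalloc_noio in effect), del_gendisk() first takes the same lock for write to set QUEUE_FLAG_NO_ELV_SWITCH, and the partition-scanning add_disk_final() step stays outside the lock. A reduced pthread sketch of that reader/writer shape, with illustrative names and no kernel types:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t update_nr_hwq_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool no_elv_switch;

/* del_gendisk(): write lock only for the brief flag flip. */
static void disable_elv_switch_sketch(void)
{
	pthread_rwlock_wrlock(&update_nr_hwq_lock);
	no_elv_switch = true;
	pthread_rwlock_unlock(&update_nr_hwq_lock);
}

/* add_disk_fwnode(): the core add runs as a reader, the final
 * partition-scan step runs after the lock is dropped. */
static int add_disk_sketch(int (*core_add)(void), void (*final_step)(void))
{
	int ret;

	pthread_rwlock_rdlock(&update_nr_hwq_lock);
	ret = core_add();
	pthread_rwlock_unlock(&update_nr_hwq_lock);

	if (!ret)
		final_step();	/* outside the lock, as in add_disk_final() */
	return ret;
}

Keeping the final step outside the read lock avoids creating a lock dependency between update_nr_hwq_lock and disk->open_mutex via partition scanning, as the in-code comment notes.
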
diff --git a/block/ioctl.c b/block/ioctl.c
index 6554b728bae6..e472cc1030c6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
+#include "blk-crypto-internal.h"
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
@@ -141,6 +142,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (err)
return err;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
@@ -173,6 +175,7 @@ out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -198,12 +201,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
end > bdev_nr_bytes(bdev))
return -EINVAL;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -235,6 +240,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
@@ -245,6 +251,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -620,6 +627,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKCRYPTOIMPORTKEY:
+ case BLKCRYPTOGENERATEKEY:
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, mode, argp);
case IOC_PR_RESERVE:
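
All three ioctl paths touched above (discard, secure erase, zeroout) now take the backing inode's i_rwsem before filemap_invalidate_lock() around truncate_bdev_range() and the actual operation, matching the ordering the fops.c read/write paths use against set_blocksize(). A skeletal sketch of that acquire/release ordering; ordinary mutexes stand in for i_rwsem and the invalidate_lock, and the names are illustrative:

#include <pthread.h>

static pthread_mutex_t i_rwsem = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t invalidate_lock = PTHREAD_MUTEX_INITIALIZER;

/* Sketch only: truncate the page-cache range, then issue the block op,
 * holding both locks in the order taken by the hunks above. */
static int punch_range(int (*truncate_range)(void), int (*issue_op)(void))
{
	int err;

	pthread_mutex_lock(&i_rwsem);		/* inode_lock() */
	pthread_mutex_lock(&invalidate_lock);	/* filemap_invalidate_lock() */

	err = truncate_range();
	if (!err)
		err = issue_op();		/* discard / erase / zeroout */

	pthread_mutex_unlock(&invalidate_lock);
	pthread_mutex_unlock(&i_rwsem);		/* inode_unlock() last */
	return err;
}
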
diff --git a/block/ioprio.c b/block/ioprio.c
index 73301a261429..f0ee2798539c 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio)
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
- fallthrough;
- /* rt has prio field too */
- case IOPRIO_CLASS_BE:
- if (level >= IOPRIO_NR_LEVELS)
- return -EINVAL;
break;
+ case IOPRIO_CLASS_BE:
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index dc31f2dfa414..70cbc7b2deb4 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -157,10 +157,7 @@ struct kyber_queue_data {
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
- /*
- * Async request percentage, converted to per-word depth for
- * sbitmap_get_shallow().
- */
+ /* Number of allowed async requests. */
unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency;
@@ -276,7 +273,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd,
static void kyber_timer_fn(struct timer_list *t)
{
- struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+ struct kyber_queue_data *kqd = timer_container_of(kqd, t, timer);
unsigned int sched_domain;
int cpu;
bool bad = false;
@@ -402,20 +399,13 @@ err:
return ERR_PTR(ret);
}
-static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct kyber_queue_data *kqd;
- struct elevator_queue *eq;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
kqd = kyber_queue_data_alloc(q);
- if (IS_ERR(kqd)) {
- kobject_put(&eq->kobj);
+ if (IS_ERR(kqd))
return PTR_ERR(kqd);
- }
blk_stat_enable_accounting(q);
@@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int shift = tags->bitmap_tags.sb.shift;
-
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+ kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
@@ -568,7 +556,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx);
struct kyber_hctx_data *khd = hctx->sched_data;
struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
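
kyber_depth_updated() now derives async_depth directly from the queue's nr_requests instead of from one sbitmap word, so the value handed to sbitmap_queue_min_shallow_depth() is a plain request count; presumably the conversion to per-word units now happens inside the sbitmap code, which is not visible in this diff. The arithmetic before and after, as a small sketch:

#include <stdio.h>

#define KYBER_ASYNC_PERCENT 75	/* unchanged by this diff */

/* Old: a percentage of one sbitmap word, i.e. (1 << shift) tags. */
static unsigned int async_depth_old(unsigned int word_shift)
{
	return (1U << word_shift) * KYBER_ASYNC_PERCENT / 100U;
}

/* New: a percentage of the whole queue depth, in requests. */
static unsigned int async_depth_new(unsigned int nr_requests)
{
	return nr_requests * KYBER_ASYNC_PERCENT / 100U;
}

int main(void)
{
	printf("old (shift 6):      %u\n", async_depth_old(6));	/* 48 */
	printf("new (256 requests): %u\n", async_depth_new(256));	/* 192 */
	return 0;
}
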
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 754f6b7415cd..b9b7cdf1d3c9 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -488,20 +488,6 @@ unlock:
}
/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
- struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
- const unsigned int nrr = hctx->queue->nr_requests;
-
- return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
-/*
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
@@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* Throttle asynchronous requests and writes such that these requests
* do not block the allocation of synchronous requests.
*/
- data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+ data->shallow_depth = dd->async_depth;
}
/* Called by blk_mq_update_nr_requests(). */
@@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
-static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct deadline_data *dd;
- struct elevator_queue *eq;
enum dd_prio prio;
- int ret = -ENOMEM;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return ret;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd)
- goto put_eq;
+ return -ENOMEM;
eq->elevator_data = dd;
@@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
q->elevator = eq;
return 0;
-
-put_eq:
- kobject_put(&eq->kobj);
- return ret;
}
/*
@@ -715,7 +691,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
}
/*
- * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
*/
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list,
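
mq-deadline drops the equivalent dd_to_word_depth() conversion and passes dd->async_depth, a request count in 1..nr_requests, straight to data->shallow_depth, consistent with the kyber change above and with the presumption that the per-word scaling now lives in the sbitmap layer. For reference, a worked sketch of the rounding the removed helper performed:

#include <stdio.h>

/* Sketch of the removed dd_to_word_depth(): scale a request count into
 * per-word sbitmap units, rounding up. */
static unsigned int to_word_depth(unsigned int qdepth, unsigned int word_shift,
				  unsigned int nr_requests)
{
	return ((qdepth << word_shift) + nr_requests - 1) / nr_requests;
}

int main(void)
{
	/* 64 async requests out of 256 total, 64-tag (shift 6) words */
	printf("%u\n", to_word_depth(64, 6, 256));	/* prints 16 */
	return 0;
}
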
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
index 9cc6b8c1eea4..b5ecddd5181a 100644
--- a/block/partitions/sgi.c
+++ b/block/partitions/sgi.c
@@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state)
p = &label->partitions[0];
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
- /*printk("Dev %s SGI disklabel: bad magic %08x\n",
- state->disk->disk_name, be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
index ddf9e6def4b2..2419af76120f 100644
--- a/block/partitions/sun.c
+++ b/block/partitions/sun.c
@@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state)
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
-/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- state->disk->disk_name, be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2d05421f0fa5..851db518ee5e 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -210,7 +210,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
- return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
+ return cpu_to_be64(crc64_nvme(crc, data, len));
}
static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
@@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio)
}
}
-void blk_integrity_verify(struct bio *bio)
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio)
*/
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
- iter.seed = bip->bio_iter.bi_sector;
+ iter.seed = saved_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
- __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) {
+ __bio_for_each_segment(bv, bio, bviter, *saved_iter) {
void *kaddr = bvec_kmap_local(&bv);
blk_status_t ret = BLK_STS_OK;