Diffstat (limited to 'block')
-rw-r--r--  block/badblocks.c        5
-rw-r--r--  block/bdev.c            17
-rw-r--r--  block/bfq-iosched.c     35
-rw-r--r--  block/bfq-iosched.h      3
-rw-r--r--  block/bio-integrity.c   30
-rw-r--r--  block/bio.c             15
-rw-r--r--  block/blk-cgroup.c      23
-rw-r--r--  block/blk-core.c        27
-rw-r--r--  block/blk-merge.c       54
-rw-r--r--  block/blk-mq-cpumap.c   36
-rw-r--r--  block/blk-mq.c          88
-rw-r--r--  block/blk-mq.h           8
-rw-r--r--  block/blk-settings.c    28
-rw-r--r--  block/blk-sysfs.c       12
-rw-r--r--  block/blk-throttle.c    15
-rw-r--r--  block/blk-zoned.c      109
-rw-r--r--  block/blk.h              3
-rw-r--r--  block/bounce.c           2
-rw-r--r--  block/elevator.c         3
-rw-r--r--  block/fops.c            21
-rw-r--r--  block/genhd.c           22
-rw-r--r--  block/ioctl.c            6
-rw-r--r--  block/kyber-iosched.c    9
-rw-r--r--  block/mq-deadline.c     29
-rw-r--r--  block/partitions/efi.c   2
-rw-r--r--  block/partitions/ldm.h   2
-rw-r--r--  block/partitions/mac.c  18
27 files changed, 384 insertions, 238 deletions
diff --git a/block/badblocks.c b/block/badblocks.c
index db4ec8b9b2a8..a9709771a101 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -1349,14 +1349,15 @@ re_check:
len = sectors;
update_sectors:
+ /* This situation should never happen */
+ WARN_ON(sectors < len);
+
s += len;
sectors -= len;
if (sectors > 0)
goto re_check;
- WARN_ON(sectors < 0);
-
if (unacked_badblocks > 0)
rv = -1;
else if (acked_badblocks > 0)
diff --git a/block/bdev.c b/block/bdev.c
index 738e3c8457e7..e7daca6565ea 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -168,9 +168,26 @@ int set_blocksize(struct file *file, int size)
/* Don't change the size if it is same as current */
if (inode->i_blkbits != blksize_bits(size)) {
+ /*
+ * Flush and truncate the pagecache before we reconfigure the
+ * mapping geometry because folio sizes are variable now. If a
+ * reader has already allocated a folio whose size is smaller
+ * than the new min_order but invokes readahead after the new
+ * min_order becomes visible, readahead will think there are
+ * "zero" blocks per folio and crash. Take the inode and
+ * invalidation locks to avoid racing with
+ * read/write/fallocate.
+ */
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
sync_blockdev(bdev);
+ kill_bdev(bdev);
+
inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
}
return 0;
}
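
A note on the "zero blocks per folio" hazard the new comment describes: once i_blkbits grows past the order of an already-cached folio, folio_size >> blkbits truncates to zero. A minimal standalone sketch of that arithmetic (the sizes are illustrative, not taken from the patch):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Illustrative: a 4 KiB (order-0) folio left in the pagecache
	 * while the device moves from 4 KiB to 16 KiB blocks. */
	unsigned int old_blkbits = 12;
	unsigned int new_blkbits = 14;
	size_t folio_bytes = (size_t)1 << PAGE_SHIFT;

	printf("blocks per folio before: %zu\n", folio_bytes >> old_blkbits);
	printf("blocks per folio after:  %zu\n", folio_bytes >> new_blkbits);
	/* prints 1, then 0 -- the "zero blocks per folio" readahead trap */
	return 0;
}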
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cad16c163611..68359e1b92e2 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -701,17 +701,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- int depth;
- unsigned limit = data->q->nr_requests;
- unsigned int act_idx;
+ unsigned int limit, act_idx;
/* Sync reads have full depth available */
- if (op_is_sync(opf) && !op_is_write(opf)) {
- depth = 0;
- } else {
- depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
- limit = (limit * depth) >> bfqd->full_depth_shift;
- }
+ if (op_is_sync(opf) && !op_is_write(opf))
+ limit = data->q->nr_requests;
+ else
+ limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
/* Fast path to check if bfqq is already allocated. */
@@ -725,14 +721,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* available requests and thus starve other entities.
*/
if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
- depth = 1;
+ limit = 1;
break;
}
}
+
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
- if (depth)
- data->shallow_depth = depth;
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
+
+ if (limit < data->q->nr_requests)
+ data->shallow_depth = limit;
}
static struct bfq_queue *
@@ -7128,9 +7126,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
*/
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int depth = 1U << bt->sb.shift;
+ unsigned int nr_requests = bfqd->queue->nr_requests;
- bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -7142,13 +7139,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max(depth >> 1, 1U);
+ bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
+ bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -7158,9 +7155,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
+ bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
+ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
}
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
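
For reference, the four async_depths fractions computed in bfq_update_depths() above, evaluated in a standalone sketch (nr_requests = 64 is an arbitrary example):

#include <stdio.h>

/* Local stand-in for the kernel's max() macro. */
static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned int nr_requests = 64;
	unsigned int d[2][2];

	d[0][0] = max_u(nr_requests >> 1, 1U);		/* 50%: async     */
	d[0][1] = max_u((nr_requests * 3) >> 2, 1U);	/* 75%: sync wr   */
	d[1][0] = max_u((nr_requests * 3) >> 4, 1U);	/* ~18%: async+wr */
	d[1][1] = max_u((nr_requests * 6) >> 4, 1U);	/* ~37%: sync wr  */

	printf("%u %u %u %u\n", d[0][0], d[0][1], d[1][0], d[1][1]);
	/* prints: 32 48 12 24 */
	return 0;
}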
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 687a3a7ba784..31217f196f4f 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -813,8 +813,7 @@ struct bfq_data {
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
- unsigned int word_depths[2][2];
- unsigned int full_depth_shift;
+ unsigned int async_depths[2][2];
/*
* Number of independent actuators. This is equal to 1 in
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 88e3ad73c385..456026c4a3c9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -104,31 +104,28 @@ err:
}
EXPORT_SYMBOL(bio_integrity_alloc);
-static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
- bool dirty)
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
int i;
- for (i = 0; i < nr_vecs; i++) {
- if (dirty && !PageCompound(bv[i].bv_page))
- set_page_dirty_lock(bv[i].bv_page);
+ for (i = 0; i < nr_vecs; i++)
unpin_user_page(bv[i].bv_page);
- }
}
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
- unsigned short nr_vecs = bip->bip_max_vcnt - 1;
- struct bio_vec *copy = &bip->bip_vec[1];
- size_t bytes = bip->bip_iter.bi_size;
- struct iov_iter iter;
+ unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *orig_bvecs = &bip->bip_vec[1];
+ struct bio_vec *bounce_bvec = &bip->bip_vec[0];
+ size_t bytes = bounce_bvec->bv_len;
+ struct iov_iter orig_iter;
int ret;
- iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
- ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
- bio_integrity_unpin_bvec(copy, nr_vecs, true);
+ bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs);
}
/**
@@ -148,8 +145,7 @@ void bio_integrity_unmap_user(struct bio *bio)
return;
}
- bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt,
- bio_data_dir(bio) == READ);
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt);
}
/**
@@ -235,7 +231,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
if (write)
- bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_vecs);
else
memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
@@ -361,7 +357,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
return 0;
release_pages:
- bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_bvecs);
free_bvec:
if (bvec != stack_vec)
kfree(bvec);
diff --git a/block/bio.c b/block/bio.c
index ac4d77c88932..094a5adf79d2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -77,7 +77,7 @@ struct bio_slab {
struct kmem_cache *slab;
unsigned int slab_ref;
unsigned int slab_size;
- char name[8];
+ char name[12];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);
@@ -611,7 +611,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
struct bio *bio;
- if (nr_vecs > UIO_MAXIOV)
+ if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
@@ -1156,9 +1156,10 @@ EXPORT_SYMBOL(bio_add_page);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
+ unsigned long nr = off / PAGE_SIZE;
+
WARN_ON_ONCE(len > UINT_MAX);
- WARN_ON_ONCE(off > UINT_MAX);
- __bio_add_page(bio, &folio->page, len, off);
+ __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
@@ -1179,9 +1180,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
- if (len > UINT_MAX || off > UINT_MAX)
+ unsigned long nr = off / PAGE_SIZE;
+
+ if (len > UINT_MAX)
return false;
- return bio_add_page(bio, &folio->page, len, off) > 0;
+ return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
}
EXPORT_SYMBOL(bio_add_folio);
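
bio_add_folio() and bio_add_folio_nofail() now decompose a folio byte offset into a page index (folio_page(folio, off / PAGE_SIZE)) plus an in-page remainder instead of always targeting the folio's first page. A standalone sketch of that split (offset 9000 is an arbitrary example):

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	/* Byte offset 9000 into a large (multi-page) folio. */
	unsigned long off = 9000;
	unsigned long nr = off / PAGE_SIZE;		/* page index: 2 */
	unsigned long in_page = off % PAGE_SIZE;	/* remainder: 808 */

	printf("folio_page index %lu, in-page offset %lu\n", nr, in_page);
	return 0;
}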
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 45a395862fbc..643d6bf66522 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1138,6 +1138,7 @@ static void blkcg_fill_root_iostats(void)
blkg_iostat_set(&blkg->iostat.cur, &tmp);
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+ class_dev_iter_exit(&iter);
}
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
@@ -1724,27 +1725,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
+ /*
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs, and that
+ * a policy without pd_alloc_fn/pd_free_fn can't be activated.
+ */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ return -EINVAL;
+
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
/* find an empty slot */
- ret = -ENOSPC;
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (!blkcg_policy[i])
break;
if (i >= BLKCG_MAX_POLS) {
pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
+ ret = -ENOSPC;
goto err_unlock;
}
- /*
- * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
- * without pd_alloc_fn/pd_free_fn can't be activated.
- */
- if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
- (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
- goto err_unlock;
-
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1755,8 +1756,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg_policy_data *cpd;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd)
+ if (!cpd) {
+ ret = -ENOMEM;
goto err_free_cpds;
+ }
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
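
The !x ^ !y check hoisted to the top of blkcg_policy_register() rejects a policy that provides exactly one hook of an alloc/free pair. A standalone sketch of the predicate (the mismatched hook values are illustrative):

#include <stdio.h>

int main(void)
{
	/* A policy that registers an alloc hook without the free hook. */
	void *pd_alloc_fn = (void *)1;
	void *pd_free_fn = NULL;

	/* !x normalizes each pointer to 0/1; XOR flags a mismatched pair. */
	if (!pd_alloc_fn ^ !pd_free_fn)
		printf("rejected: alloc/free hooks must come in pairs\n");
	return 0;
}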
diff --git a/block/blk-core.c b/block/blk-core.c
index 4f791a3114a1..c7b6c1f76359 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -629,8 +629,14 @@ static void __submit_bio(struct bio *bio)
blk_mq_submit_bio(bio);
} else if (likely(bio_queue_enter(bio) == 0)) {
struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- disk->fops->submit_bio(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) &&
+ !(disk->queue->limits.features & BLK_FEAT_POLL)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ } else {
+ disk->fops->submit_bio(bio);
+ }
blk_queue_exit(disk->queue);
}
@@ -805,12 +811,6 @@ void submit_bio_noacct(struct bio *bio)
}
}
- if (!(q->limits.features & BLK_FEAT_POLL) &&
- (bio->bi_opf & REQ_POLLED)) {
- bio_clear_polled(bio);
- goto not_supported;
- }
-
switch (bio_op(bio)) {
case REQ_OP_READ:
break;
@@ -935,7 +935,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
return 0;
q = bdev_get_queue(bdev);
- if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
+ if (cookie == BLK_QC_T_NONE)
return 0;
blk_flush_plug(current->plug, false);
@@ -956,7 +956,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
} else {
struct gendisk *disk = q->disk;
- if (disk && disk->fops->poll_bio)
+ if ((q->limits.features & BLK_FEAT_POLL) && disk &&
+ disk->fops->poll_bio)
ret = disk->fops->poll_bio(bio, iob, flags);
}
blk_queue_exit(q);
@@ -1120,8 +1121,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
return;
plug->cur_ktime = 0;
- plug->mq_list = NULL;
- plug->cached_rq = NULL;
+ rq_list_init(&plug->mq_list);
+ rq_list_init(&plug->cached_rqs);
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
plug->rq_count = 0;
plug->multiple_queues = false;
@@ -1217,7 +1218,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
* queue for cached requests, we don't want a blocked task holding
* up a queue freeze/quiesce event.
*/
- if (unlikely(!rq_list_empty(plug->cached_rq)))
+ if (unlikely(!rq_list_empty(&plug->cached_rqs)))
blk_mq_free_plug_rqs(plug);
plug->cur_ktime = 0;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5baa950f34fe..7ddd7dd23dda 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -864,12 +864,13 @@ static struct request *attempt_merge(struct request_queue *q,
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
- /* Don't merge requests with different write hints. */
- if (req->write_hint != next->write_hint)
- return NULL;
-
- if (req->ioprio != next->ioprio)
- return NULL;
+ if (req->bio && next->bio) {
+ /* Don't merge requests with different write hints. */
+ if (req->bio->bi_write_hint != next->bio->bi_write_hint)
+ return NULL;
+ if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+ return NULL;
+ }
if (!blk_atomic_write_mergeable_rqs(req, next))
return NULL;
@@ -998,12 +999,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
- /* Don't merge requests with different write hints. */
- if (rq->write_hint != bio->bi_write_hint)
- return false;
-
- if (rq->ioprio != bio_prio(bio))
- return false;
+ if (rq->bio) {
+ /* Don't merge requests with different write hints. */
+ if (rq->bio->bi_write_hint != bio->bi_write_hint)
+ return false;
+ if (rq->bio->bi_ioprio != bio->bi_ioprio)
+ return false;
+ }
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
return false;
@@ -1175,23 +1177,23 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct blk_plug *plug = current->plug;
struct request *rq;
- if (!plug || rq_list_empty(plug->mq_list))
+ if (!plug || rq_list_empty(&plug->mq_list))
return false;
- rq_list_for_each(&plug->mq_list, rq) {
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
- break;
- }
+ rq = plug->mq_list.tail;
+ if (rq->q == q)
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK;
+ else if (!plug->multiple_queues)
+ return false;
- /*
- * Only keep iterating plug list for merges if we have multiple
- * queues
- */
- if (!plug->multiple_queues)
- break;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q != q)
+ continue;
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
}
return false;
}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9638b25fd521..444798c5374f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/group_cpus.h>
+#include <linux/device/bus.h>
#include "blk.h"
#include "blk-mq.h"
@@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
return NUMA_NO_NODE;
}
+
+/**
+ * blk_mq_map_hw_queues - Create CPU to hardware queue mapping
+ * @qmap: CPU to hardware queue map
+ * @dev: The device to map queues
+ * @offset: Queue offset to use for the device
+ *
+ * Create a CPU to hardware queue mapping in @qmap. The struct bus_type
+ * irq_get_affinity callback will be used to retrieve the affinity.
+ */
+void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
+ struct device *dev, unsigned int offset)
+
+{
+ const struct cpumask *mask;
+ unsigned int queue, cpu;
+
+ if (!dev->bus->irq_get_affinity)
+ goto fallback;
+
+ for (queue = 0; queue < qmap->nr_queues; queue++) {
+ mask = dev->bus->irq_get_affinity(dev, queue + offset);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ qmap->mq_map[cpu] = qmap->queue_offset + queue;
+ }
+
+ return;
+
+fallback:
+ blk_mq_map_queues(qmap);
+}
+EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
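
A hedged sketch of how a driver might use the new helper from its ->map_queues() callback; the foo_* names are hypothetical, and per the code above the helper falls back to blk_mq_map_queues() by itself when the bus provides no irq_get_affinity callback:

#include <linux/blk-mq.h>
#include <linux/pci.h>

struct foo_dev {
	struct pci_dev *pdev;		/* hypothetical driver state */
};

static void foo_map_queues(struct blk_mq_tag_set *set)
{
	struct foo_dev *foo = set->driver_data;

	/* Derive the CPU-to-hctx map from the device's IRQ affinity;
	 * offset 0 assumes hw queue i is served by vector i. */
	blk_mq_map_hw_queues(&set->map[HCTX_TYPE_DEFAULT],
			     &foo->pdev->dev, 0);
}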
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e76651e786d..e1bca29dc358 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -506,7 +506,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag);
- rq_list_add(data->cached_rq, rq);
+ rq_list_add_head(data->cached_rqs, rq);
nr++;
}
if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -515,7 +515,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
- return rq_list_pop(data->cached_rq);
+ return rq_list_pop(data->cached_rqs);
}
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -612,7 +612,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
.flags = flags,
.cmd_flags = opf,
.nr_tags = plug->nr_ios,
- .cached_rq = &plug->cached_rq,
+ .cached_rqs = &plug->cached_rqs,
};
struct request *rq;
@@ -637,14 +637,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
- if (rq_list_empty(plug->cached_rq)) {
+ if (rq_list_empty(&plug->cached_rqs)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
@@ -653,7 +653,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
- plug->cached_rq = rq_list_next(rq);
+ rq_list_pop(&plug->cached_rqs);
blk_mq_rq_time_init(rq, 0);
}
@@ -830,7 +830,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
struct request *rq;
- while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+ while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
blk_mq_free_request(rq);
}
@@ -870,7 +870,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
blk_op_str(req_op(req)),
(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
}
/*
@@ -1386,8 +1386,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
- rq->rq_next = NULL;
- rq_list_add(&plug->mq_list, rq);
+ rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
@@ -2655,7 +2654,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
if (bio_integrity(bio))
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -2781,7 +2779,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
blk_status_t ret = BLK_STS_OK;
while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(plug->mq_list);
+ bool last = rq_list_empty(&plug->mq_list);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2824,8 +2822,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
- struct request *requeue_list = NULL;
- struct request **requeue_lastp = &requeue_list;
+ struct rq_list requeue_list = {};
unsigned int depth = 0;
bool is_passthrough = false;
LIST_HEAD(list);
@@ -2839,12 +2836,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
is_passthrough = blk_rq_is_passthrough(rq);
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
is_passthrough != blk_rq_is_passthrough(rq)) {
- rq_list_add_tail(&requeue_lastp, rq);
+ rq_list_add_tail(&requeue_list, rq);
continue;
}
- list_add(&rq->queuelist, &list);
+ list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(&plug->mq_list));
plug->mq_list = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
@@ -2899,19 +2896,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(plug->mq_list))
+ if (rq_list_empty(&plug->mq_list))
return;
}
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug));
- if (rq_list_empty(plug->mq_list))
+ if (rq_list_empty(&plug->mq_list))
return;
}
do {
blk_mq_dispatch_plug_list(plug, from_schedule);
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(&plug->mq_list));
}
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2976,7 +2973,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
- data.cached_rq = &plug->cached_rq;
+ data.cached_rqs = &plug->cached_rqs;
}
rq = __blk_mq_alloc_requests(&data);
@@ -2999,7 +2996,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
if (!plug)
return NULL;
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
@@ -3013,14 +3010,14 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
- WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+ if (rq_list_pop(&plug->cached_rqs) != rq)
+ WARN_ON_ONCE(1);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
* before we throttle.
*/
- plug->cached_rq = rq_list_next(rq);
rq_qos_throttle(rq->q, bio);
blk_mq_rq_time_init(rq, 0);
@@ -3092,14 +3089,21 @@ void blk_mq_submit_bio(struct bio *bio)
}
/*
- * Device reconfiguration may change logical block size, so alignment
- * check has to be done with queue usage counter held
+ * Device reconfiguration may change logical block size or reduce the
+ * number of poll queues, so the checks for alignment and poll support
+ * have to be done with queue usage counter held.
*/
if (unlikely(bio_unaligned(bio, q))) {
bio_io_error(bio);
goto queue_exit;
}
+ if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ goto queue_exit;
+ }
+
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
@@ -3110,8 +3114,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
if (!rq) {
@@ -3302,8 +3308,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
- rq->write_hint = rq_src->write_hint;
+ rq->nr_integrity_segments = rq_src->nr_integrity_segments;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
@@ -4325,12 +4330,6 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
-static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
-{
- return set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues;
-}
-
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
{
@@ -4341,7 +4340,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
if (!lim)
lim = &default_lim;
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
- if (blk_mq_can_poll(set))
+ if (set->nr_maps > HCTX_TYPE_POLL)
lim->features |= BLK_FEAT_POLL;
q = blk_alloc_queue(lim, set->numa_node);
@@ -5029,8 +5028,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- struct queue_limits lim;
-
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -5044,13 +5041,6 @@ fallback:
set->nr_hw_queues = prev_nr_hw_queues;
goto fallback;
}
- lim = queue_limits_start_update(q);
- if (blk_mq_can_poll(set))
- lim.features |= BLK_FEAT_POLL;
- else
- lim.features &= ~BLK_FEAT_POLL;
- if (queue_limits_commit_update(q, &lim) < 0)
- pr_warn("updating the poll flag failed\n");
blk_mq_map_swqueue(q);
}
@@ -5110,9 +5100,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
struct io_comp_batch *iob, unsigned int flags)
{
- struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
-
- return blk_hctx_poll(q, hctx, iob, flags);
+ if (!blk_mq_can_poll(q))
+ return 0;
+ return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
}
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4ac1af77a26..a80d3b3105f9 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -155,7 +155,7 @@ struct blk_mq_alloc_data {
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
- struct request **cached_rq;
+ struct rq_list *cached_rqs;
/* input & output parameter */
struct blk_mq_ctx *ctx;
@@ -451,4 +451,10 @@ do { \
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+static inline bool blk_mq_can_poll(struct request_queue *q)
+{
+ return (q->limits.features & BLK_FEAT_POLL) &&
+ q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
+}
+
#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7abf034089cd..9ae3eee4b5ae 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
+ *
+ * There is no hardware limitation for the read-ahead size and the user
+ * might have increased the read-ahead size through sysfs, so don't ever
+ * decrease it.
*/
- bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ bdi->ra_pages = max3(bdi->ra_pages,
+ lim->io_opt * 2 / PAGE_SIZE,
+ VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
@@ -118,6 +124,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
return 0;
}
+ if (lim->features & BLK_FEAT_BOUNCE_HIGH) {
+ pr_warn("no bounce buffer support for integrity metadata\n");
+ return -EINVAL;
+ }
+
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
pr_warn("integrity support disabled.\n");
return -EINVAL;
@@ -309,12 +320,19 @@ static int blk_validate_limits(struct queue_limits *lim)
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
+ /*
+ * When discard is not supported, discard_granularity should be reported
+ * as 0 to userspace.
+ */
+ if (lim->max_discard_sectors)
+ lim->discard_granularity =
+ max(lim->discard_granularity, lim->physical_block_size);
+ else
+ lim->discard_granularity = 0;
+
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
- if (lim->discard_granularity < lim->physical_block_size)
- lim->discard_granularity = lim->physical_block_size;
-
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
@@ -605,7 +623,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
}
/* chunk_sectors a multiple of the physical block size? */
- if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
+ if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) {
t->chunk_sectors = 0;
t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
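
One reading of the chunk_sectors change: the old mask test is only meaningful when physical_block_size is a power of two, while the modulo in sector units makes no such assumption. An illustrative standalone comparison (a 1536-byte physical block size is unrealistic, but shows where the two forms diverge):

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	/* 6-sector chunk vs. a hypothetical 1536-byte (3-sector) block. */
	unsigned int chunk_sectors = 6;
	unsigned int physical_block_size = 1536;

	/* Old form: mask trick, valid only for power-of-two sizes;
	 * here it reports a bogus nonzero remainder (1024). */
	printf("mask:   %u\n",
	       (chunk_sectors << SECTOR_SHIFT) & (physical_block_size - 1));

	/* New form: 6 sectors is exactly two 3-sector blocks -> aligned. */
	printf("modulo: %u\n",
	       chunk_sectors % (physical_block_size >> SECTOR_SHIFT));
	return 0;
}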
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 207577145c54..6a38f312e385 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -256,10 +256,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
!!(disk->queue->limits.features & _feature)); \
}
-QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL);
QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA);
QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
+static ssize_t queue_poll_show(struct gendisk *disk, char *page)
+{
+ if (queue_is_mq(disk->queue))
+ return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
+ return sysfs_emit(page, "%u\n",
+ !!(disk->queue->limits.features & BLK_FEAT_POLL));
+}
+
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
{
if (blk_queue_is_zoned(disk->queue))
@@ -806,6 +813,8 @@ out_unregister_ia_ranges:
out_debugfs_remove:
blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
+ if (queue_is_mq(q))
+ blk_mq_sysfs_unregister(disk);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
@@ -859,4 +868,5 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_dir_lock);
blk_debugfs_remove(disk);
+ kobject_put(&disk->queue_kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2c4192e12efa..6b82fcbd7e77 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1593,13 +1593,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
return tg_may_dispatch(tg, bio, NULL);
}
-static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
-{
- if (!bio_flagged(bio, BIO_BPS_THROTTLED))
- tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
- tg->carryover_ios[rw]--;
-}
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1636,10 +1629,12 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* IOs which may cause priority inversions are
* dispatched directly, even if they're over limit.
- * Debts are handled by carryover_bytes/ios while
- * calculating wait time.
+ *
+ * Charge and dispatch directly. Since our throttle
+ * control algorithm is adaptive, the extra IO bytes
+ * will be throttled later to pay back the debt.
*/
- tg_dispatch_in_debt(tg, bio, rw);
+ throtl_charge_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 767bcbce74fa..24c80078ca44 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -347,6 +347,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
@@ -368,8 +369,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
- if (cmd == BLKRESETZONE)
+ if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
+ }
return ret;
}
@@ -427,13 +430,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
}
}
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ atomic_inc(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
return true;
}
-static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
- sector_t sector)
+static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
+ sector_t sector)
{
unsigned int zno = disk_zone_no(disk, sector);
unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
@@ -454,6 +458,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
return NULL;
}
+static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ if (!atomic_read(&disk->nr_zone_wplugs))
+ return NULL;
+
+ return disk_get_hashed_zone_wplug(disk, sector);
+}
+
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
struct blk_zone_wplug *zwplug =
@@ -518,6 +531,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk,
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
hlist_del_init_rcu(&zwplug->node);
+ atomic_dec(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
disk_put_zone_wplug(zwplug);
}
@@ -607,6 +621,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
struct bio *bio;
+ if (bio_list_empty(&zwplug->bio_list))
+ return;
+
+ pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
+ zwplug->disk->disk_name, zwplug->zone_no);
while ((bio = bio_list_pop(&zwplug->bio_list)))
blk_zone_wplug_bio_io_error(zwplug, bio);
}
@@ -1055,6 +1074,47 @@ plug:
return true;
}
+static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * We have native support for zone append operations, so we are not
+ * going to handle @bio through plugging. However, we may already have a
+ * zone write plug for the target zone if that zone was previously
+ * partially written using regular writes. In such case, we risk leaving
+ * the plug in the disk hash table if the zone is fully written using
+ * zone append operations. Avoid this by removing the zone write plug.
+ */
+ zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ if (likely(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * We are about to remove the zone write plug. But if the user
+ * (mistakenly) has issued regular writes together with native zone
+ * append, we must abort the writes as otherwise the plugged BIOs would
+ * not be executed by the plug BIO work as disk_get_zone_wplug() will
+ * return NULL after the plug is removed. Aborting the plugged write
+ * BIOs is consistent with the fact that these writes will most likely
+ * fail anyway as there are no ordering guarantees between zone append
+ * operations and regular write operations.
+ */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
+ disk->disk_name, zwplug->zone_no);
+ disk_zone_wplug_abort(zwplug);
+ }
+ disk_remove_zone_wplug(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ disk_put_zone_wplug(zwplug);
+}
+
/**
* blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
* @bio: The BIO being submitted
@@ -1071,25 +1131,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
struct block_device *bdev = bio->bi_bdev;
- if (!bdev->bd_disk->zone_wplugs_hash)
- return false;
-
- /*
- * If the BIO already has the plugging flag set, then it was already
- * handled through this path and this is a submission from the zone
- * plug bio submit work.
- */
- if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
- return false;
-
- /*
- * We do not need to do anything special for empty flush BIOs, e.g
- * BIOs such as issued by blkdev_issue_flush(). The is because it is
- * the responsibility of the user to first wait for the completion of
- * write operations for flush to have any effect on the persistence of
- * the written data.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
return false;
/*
@@ -1111,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
*/
switch (bio_op(bio)) {
case REQ_OP_ZONE_APPEND:
- if (!bdev_emulates_zone_append(bdev))
+ if (!bdev_emulates_zone_append(bdev)) {
+ blk_zone_wplug_handle_native_zone_append(bio);
return false;
+ }
fallthrough;
case REQ_OP_WRITE:
case REQ_OP_WRITE_ZEROES:
@@ -1178,6 +1222,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
}
/*
@@ -1259,7 +1304,6 @@ again:
spin_unlock_irqrestore(&zwplug->lock, flags);
bdev = bio->bi_bdev;
- submit_bio_noacct_nocheck(bio);
/*
* blk-mq devices will reuse the extra reference on the request queue
@@ -1267,8 +1311,12 @@ again:
* path for BIO-based devices will not do that. So drop this extra
* reference here.
*/
- if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
+ if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
+ bdev->bd_disk->fops->submit_bio(bio);
blk_queue_exit(bdev->bd_disk->queue);
+ } else {
+ blk_mq_submit_bio(bio);
+ }
put_zwplug:
/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
@@ -1299,6 +1347,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
{
unsigned int i;
+ atomic_set(&disk->nr_zone_wplugs, 0);
disk->zone_wplugs_hash_bits =
min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
@@ -1353,6 +1402,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
}
}
+ WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
kfree(disk->zone_wplugs_hash);
disk->zone_wplugs_hash = NULL;
disk->zone_wplugs_hash_bits = 0;
@@ -1570,11 +1620,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
}
/*
- * We need to track the write pointer of all zones that are not
- * empty nor full. So make sure we have a zone write plug for
- * such zone if the device has a zone write plug hash table.
+ * If the device needs zone append emulation, we need to track the
+ * write pointer of all zones that are not empty nor full. So make sure
+ * we have a zone write plug for such zone if the device has a zone
+ * write plug hash table.
*/
- if (!disk->zone_wplugs_hash)
+ if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
return 0;
disk_zone_wplug_sync_wp_offset(disk, zone);
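
The new atomic counter gives disk_get_zone_wplug() a fast path that skips the RCU hash walk entirely while no zone is plugged. A standalone sketch of the pattern (the names and the stubbed-out lookup are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_zone_wplugs;

static void *get_hashed_zone_wplug(unsigned long sector)
{
	(void)sector;	/* stub: the real code hashes the zone number */
	return NULL;
}

static void *get_zone_wplug(unsigned long sector)
{
	/* Fast path: no plug exists anywhere, skip hashing altogether. */
	if (!atomic_load(&nr_zone_wplugs))
		return NULL;
	return get_hashed_zone_wplug(sector);
}

int main(void)
{
	printf("%p\n", get_zone_wplug(4096));
	return 0;
}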
diff --git a/block/blk.h b/block/blk.h
index 1426f9c28197..e91012247ff2 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -482,7 +482,8 @@ static inline void blk_zone_update_request_bio(struct request *rq,
* the original BIO sector so that blk_zone_write_plug_bio_endio() can
* lookup the zone write plug.
*/
- if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+ if (req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
diff --git a/block/bounce.c b/block/bounce.c
index 0d898cd5ec49..09a9616cf209 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -41,8 +41,6 @@ static void init_bounce_bioset(void)
ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
BUG_ON(ret);
- if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
- BUG_ON(1);
ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
BUG_ON(ret);
diff --git a/block/elevator.c b/block/elevator.c
index 43ba4ab1ada7..1f76e9efd771 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -752,7 +752,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
struct request_queue *q = disk->queue;
- struct elevator_queue *eq = q->elevator;
struct elevator_type *cur = NULL, *e;
int len = 0;
@@ -763,7 +762,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
len += sprintf(name+len, "[none] ");
} else {
len += sprintf(name+len, "none ");
- cur = eq->type;
+ cur = q->elevator->type;
}
spin_lock(&elv_list_lock);
diff --git a/block/fops.c b/block/fops.c
index 13a67940d040..d4b1d942f270 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -721,7 +721,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -732,6 +739,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -758,16 +766,23 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
- if (ret >= 0) {
+ if (ret > 0) {
iocb->ki_pos += ret;
count -= ret;
}
- iov_iter_revert(to, count - iov_iter_count(to));
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -810,6 +825,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
/*
@@ -842,6 +858,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
diff --git a/block/genhd.c b/block/genhd.c
index 8645cf3b0816..99344f53c789 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -778,7 +778,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
}
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
-void blk_request_module(dev_t devt)
+static bool blk_probe_dev(dev_t devt)
{
unsigned int major = MAJOR(devt);
struct blk_major_name **n;
@@ -788,14 +788,26 @@ void blk_request_module(dev_t devt)
if ((*n)->major == major && (*n)->probe) {
(*n)->probe(devt);
mutex_unlock(&major_names_lock);
- return;
+ return true;
}
}
mutex_unlock(&major_names_lock);
+ return false;
+}
+
+void blk_request_module(dev_t devt)
+{
+ int error;
+
+ if (blk_probe_dev(devt))
+ return;
- if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
- /* Make old-style 2.4 aliases work */
- request_module("block-major-%d", MAJOR(devt));
+ error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
+ /* Make old-style 2.4 aliases work */
+ if (error > 0)
+ error = request_module("block-major-%d", MAJOR(devt));
+ if (!error)
+ blk_probe_dev(devt);
}
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
diff --git a/block/ioctl.c b/block/ioctl.c
index 6554b728bae6..919066b4bb49 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -141,6 +141,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (err)
return err;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
@@ -173,6 +174,7 @@ out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -198,12 +200,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
end > bdev_nr_bytes(bdev))
return -EINVAL;
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -235,6 +239,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
+ inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
@@ -245,6 +250,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
fail:
filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4155594aefc6..ccfefa6a3669 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -157,10 +157,7 @@ struct kyber_queue_data {
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
- /*
- * Async request percentage, converted to per-word depth for
- * sbitmap_get_shallow().
- */
+ /* Number of allowed async requests. */
unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency;
@@ -454,10 +451,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int shift = tags->bitmap_tags.sb.shift;
-
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+ kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index acdc28756d9d..19473a9b5044 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -488,20 +488,6 @@ unlock:
}
/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
- struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
- const unsigned int nrr = hctx->queue->nr_requests;
-
- return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
-/*
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
@@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* Throttle asynchronous requests and writes such that these requests
* do not block the allocation of synchronous requests.
*/
- data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+ data->shallow_depth = dd->async_depth;
}
/* Called by blk_mq_update_nr_requests(). */
@@ -685,10 +671,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
- if (!rq->elv.priv[0]) {
+ if (!rq->elv.priv[0])
per_prio->stats.inserted++;
- rq->elv.priv[0] = (void *)(uintptr_t)1;
- }
+ rq->elv.priv[0] = per_prio;
if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
@@ -753,18 +738,14 @@ static void dd_prepare_request(struct request *rq)
*/
static void dd_finish_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct deadline_data *dd = q->elevator->elevator_data;
- const u8 ioprio_class = dd_rq_ioclass(rq);
- const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
- struct dd_per_prio *per_prio = &dd->per_prio[prio];
+ struct dd_per_prio *per_prio = rq->elv.priv[0];
/*
* The block layer core may call dd_finish_request() without having
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (rq->elv.priv[0])
+ if (per_prio)
atomic_inc(&per_prio->stats.completed);
}
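
The removed dd_to_word_depth() scaled a request count into sbitmap per-word units; this series (matching the bfq and kyber hunks above) passes the request count through unscaled, which I read as relying on shallow depths now being accounted against the whole bitmap. A standalone sketch of the old scaling (numbers illustrative):

#include <stdio.h>

int main(void)
{
	/* 256 requests, 64-bit sbitmap words (shift 6), async depth 48. */
	unsigned int nr_requests = 256, shift = 6, async_depth = 48;

	/* Removed helper: 48 of 256 requests ~= 12 of each 64-bit word. */
	unsigned int word_depth =
		((async_depth << shift) + nr_requests - 1) / nr_requests;

	printf("per-word %u, direct %u\n", word_depth, async_depth);
	return 0;
}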
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index 5e9be13a56a8..7acba66eed48 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out)
out[size] = 0;
while (i < size) {
- u8 c = le16_to_cpu(in[i]) & 0xff;
+ u8 c = le16_to_cpu(in[i]) & 0x7f;
if (c && !isprint(c))
c = '!';
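
utf16_le_to_7bit() promises 7-bit output, but masking with 0xff can pass through values above 0x7f; one reading of the fix is that 0x7f keeps the result 7-bit clean before the isprint() check. A standalone sketch of the difference (U+00E9 is an arbitrary non-ASCII example):

#include <ctype.h>
#include <stdio.h>

int main(void)
{
	unsigned short in = 0x00e9;	/* UTF-16LE code unit, 'e' acute */

	unsigned char old_c = in & 0xff;	/* 0xe9: not 7-bit */
	unsigned char new_c = in & 0x7f;	/* 0x69 'i': 7-bit ASCII */

	/* The caller then does: if (c && !isprint(c)) c = '!'; */
	printf("old 0x%02x, new 0x%02x ('%c'), printable: %d\n",
	       old_c, new_c, new_c, isprint(new_c) != 0);
	return 0;
}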
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index e259180c8914..aa3bd050d8cd 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Part of the Linux-NTFS project.
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c80183156d68..b02530d98629 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
+
+ /*
+ * If the "block size" is not a power of 2, things get weird - we might
+ * end up with a partition straddling a sector boundary, so we wouldn't
+ * be able to read a partition entry with read_part_sector().
+ * Real block sizes are probably (?) powers of two, so just require
+ * that.
+ */
+ if (!is_power_of_2(secsize))
+ return -1;
datasize = round_down(secsize, 512);
data = read_part_sector(state, datasize / 512, &sect);
if (!data)
return -1;
partoffset = secsize % 512;
- if (partoffset + sizeof(*part) > datasize)
+ if (partoffset + sizeof(*part) > datasize) {
+ put_dev_sector(sect);
return -1;
+ }
part = (struct mac_partition *) (data + partoffset);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
@@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state)
int i, l;
goodness++;
- l = strlen(part->name);
- if (strcmp(part->name, "/") == 0)
+ l = strnlen(part->name, sizeof(part->name));
+ if (strncmp(part->name, "/", sizeof(part->name)) == 0)
goodness++;
for (i = 0; i <= l - 4; ++i) {
if (strncasecmp(part->name + i, "root",