Diffstat (limited to 'block')
-rw-r--r--  block/bdev.c                |   2
-rw-r--r--  block/bfq-iosched.c         |  25
-rw-r--r--  block/bio-integrity.c       |  25
-rw-r--r--  block/bio.c                 |  78
-rw-r--r--  block/blk-cgroup.c          |  35
-rw-r--r--  block/blk-cgroup.h          |  12
-rw-r--r--  block/blk-core.c            |  21
-rw-r--r--  block/blk-crypto-fallback.c |  19
-rw-r--r--  block/blk-integrity.c       |  66
-rw-r--r--  block/blk-ioc.c             |   2
-rw-r--r--  block/blk-iolatency.c       |  19
-rw-r--r--  block/blk-lib.c             |  15
-rw-r--r--  block/blk-map.c             |  13
-rw-r--r--  block/blk-merge.c           |  85
-rw-r--r--  block/blk-mq-debugfs.c      |   2
-rw-r--r--  block/blk-mq-dma.c          | 282
-rw-r--r--  block/blk-mq-sched.c        |  14
-rw-r--r--  block/blk-mq-sched.h        |  13
-rw-r--r--  block/blk-mq-sysfs.c        |   7
-rw-r--r--  block/blk-mq-tag.c          | 128
-rw-r--r--  block/blk-mq.c              | 188
-rw-r--r--  block/blk-mq.h              |  22
-rw-r--r--  block/blk-rq-qos.c          |   8
-rw-r--r--  block/blk-rq-qos.h          |  51
-rw-r--r--  block/blk-settings.c        |  96
-rw-r--r--  block/blk-sysfs.c           |  84
-rw-r--r--  block/blk-throttle.c        |  15
-rw-r--r--  block/blk-throttle.h        |  18
-rw-r--r--  block/blk-wbt.c             |  15
-rw-r--r--  block/blk-zoned.c           |  11
-rw-r--r--  block/blk.h                 |  47
-rw-r--r--  block/elevator.c            |   3
-rw-r--r--  block/elevator.h            |   2
-rw-r--r--  block/fops.c                |  23
-rw-r--r--  block/genhd.c               |   2
-rw-r--r--  block/ioctl.c               |   6
-rw-r--r--  block/kyber-iosched.c       |  19
-rw-r--r--  block/mq-deadline.c         |  20
-rw-r--r--  block/partitions/ibm.c      |   2
39 files changed, 827 insertions, 668 deletions
diff --git a/block/bdev.c b/block/bdev.c
index b77ddd12dc06..810707cca970 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
.free_inode = bdev_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = bdev_evict_inode,
};
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3bf76902f07f..4a8d3d96bfe4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5847,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
goto out;
}
- bfqq = kmem_cache_alloc_node(bfq_pool,
- GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
bfqd->queue->node);
if (bfqq) {
@@ -7110,9 +7109,10 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+static void bfq_depth_updated(struct request_queue *q)
{
- unsigned int nr_requests = bfqd->queue->nr_requests;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ unsigned int nr_requests = q->nr_requests;
/*
* In-word depths if no bfq_queue is being weight-raised:
@@ -7144,21 +7144,8 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
-}
-
-static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
-{
- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
- bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
-}
-
-static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
-{
- bfq_depth_updated(hctx);
- return 0;
+ blk_mq_set_min_shallow_depth(q, 1);
}
static void bfq_exit_queue(struct elevator_queue *e)
@@ -7370,6 +7357,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
goto out_free;
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+ bfq_depth_updated(q);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -7629,7 +7617,6 @@ static struct elevator_type iosched_bfq_mq = {
.request_merged = bfq_request_merged,
.has_work = bfq_has_work,
.depth_updated = bfq_depth_updated,
- .init_hctx = bfq_init_hctx,
.init_sched = bfq_init_queue,
.exit_sched = bfq_exit_queue,
},
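The replacement bfq_depth_updated() derives the async tag depths directly from q->nr_requests and then caps the minimum shallow depth through blk_mq_set_min_shallow_depth(). A minimal standalone sketch of the >>4 arithmetic visible in the hunk, in plain userspace C rather than kernel code (256 is an arbitrary example depth):

#include <stdio.h>

/* models max((nr_requests * mult) >> 4, 1U) from bfq_depth_updated() */
static unsigned int frac16(unsigned int nr_requests, unsigned int mult)
{
	unsigned int v = (nr_requests * mult) >> 4;

	return v ? v : 1;
}

int main(void)
{
	unsigned int nr_requests = 256;	/* arbitrary example queue depth */

	printf("async_depths[1][0] = %u\n", frac16(nr_requests, 3));	/* ~18.75% -> 48 */
	printf("async_depths[1][1] = %u\n", frac16(nr_requests, 6));	/* ~37.5%  -> 96 */
	return 0;
}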
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 6b077ca937f6..bed26f1ec869 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -230,7 +230,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
}
static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
- int nr_vecs, ssize_t bytes, ssize_t offset)
+ int nr_vecs, ssize_t bytes, ssize_t offset,
+ bool *is_p2p)
{
unsigned int nr_bvecs = 0;
int i, j;
@@ -251,6 +252,9 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
bytes -= next;
}
+ if (is_pci_p2pdma_page(pages[i]))
+ *is_p2p = true;
+
bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
offset = 0;
nr_bvecs++;
@@ -262,13 +266,13 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ iov_iter_extraction_t extraction_flags = 0;
size_t offset, bytes = iter->count;
+ bool copy, is_p2p = false;
unsigned int nr_bvecs;
int ret, nr_vecs;
- bool copy;
if (bio_integrity(bio))
return -EINVAL;
@@ -285,16 +289,25 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
pages = NULL;
}
- copy = !iov_iter_is_aligned(iter, align, align);
- ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
+ copy = iov_iter_alignment(iter) &
+ blk_lim_dma_alignment_and_pad(&q->limits);
+
+ if (blk_queue_pci_p2pdma(q))
+ extraction_flags |= ITER_ALLOW_P2PDMA;
+
+ ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs,
+ extraction_flags, &offset);
if (unlikely(ret < 0))
goto free_bvec;
- nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+ nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset,
+ &is_p2p);
if (pages != stack_pages)
kvfree(pages);
if (nr_bvecs > queue_max_integrity_segments(q))
copy = true;
+ if (is_p2p)
+ bio->bi_opf |= REQ_NOMERGE;
if (copy)
ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes);
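The copy decision now ANDs iov_iter_alignment() (which reports the OR of all segment offsets and lengths) with the queue's DMA alignment/pad mask; any overlapping bit forces a bounce copy. A standalone userspace sketch of that test, with a 512-byte mask as an assumed example limit (not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* models: copy = iov_iter_alignment(iter) & blk_lim_dma_alignment_and_pad(&q->limits) */
static bool needs_copy(unsigned long iter_alignment, unsigned long dma_align_mask)
{
	return (iter_alignment & dma_align_mask) != 0;
}

int main(void)
{
	unsigned long mask = 511;	/* assumed 512-byte alignment/pad mask */

	printf("%d\n", needs_copy(0x1000, mask));	/* 0: segments 4 KiB aligned */
	printf("%d\n", needs_copy(0x10, mask));		/* 1: 16-byte aligned, bounce copy */
	return 0;
}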
diff --git a/block/bio.c b/block/bio.c
index 3b371a5da159..3a1a848940dd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
- bio->bi_issue.value = 0;
+ bio->issue_time_ns = 0;
if (bdev)
bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
@@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache->nr--;
put_cpu();
- bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf);
+ if (nr_vecs)
+ bio_init_inline(bio, bdev, nr_vecs, opf);
+ else
+ bio_init(bio, bdev, NULL, nr_vecs, opf);
bio->bi_pool = bs;
return bio;
}
@@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
bio_init(bio, bdev, bvl, nr_vecs, opf);
} else if (nr_vecs) {
- bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+ bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf);
} else {
bio_init(bio, bdev, NULL, 0, opf);
}
@@ -614,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
- return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
+ return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec),
+ gfp_mask);
}
EXPORT_SYMBOL(bio_kmalloc);
@@ -981,7 +985,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_full(bio, len));
if (is_pci_p2pdma_page(page))
- bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+ bio->bi_opf |= REQ_NOMERGE;
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
@@ -1227,13 +1231,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
extraction_flags |= ITER_ALLOW_P2PDMA;
- /*
- * Each segment in the iov is required to be a block size multiple.
- * However, we may not be able to get the entire segment if it spans
- * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
- * result to ensure the bio's total size is correct. The remainder of
- * the iov data will be picked up in the next bio iteration.
- */
size = iov_iter_extract_pages(iter, &pages,
UINT_MAX - bio->bi_iter.bi_size,
nr_pages, extraction_flags, &offset);
@@ -1241,18 +1238,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return size ? size : -EFAULT;
nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-
- if (bio->bi_bdev) {
- size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
- iov_iter_revert(iter, trim);
- size -= trim;
- }
-
- if (unlikely(!size)) {
- ret = -EFAULT;
- goto out;
- }
-
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
@@ -1297,10 +1282,44 @@ out:
return ret;
}
+/*
+ * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
+ * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
+ * for the next iteration.
+ */
+static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
+ unsigned len_align_mask)
+{
+ size_t nbytes = bio->bi_iter.bi_size & len_align_mask;
+
+ if (!nbytes)
+ return 0;
+
+ iov_iter_revert(iter, nbytes);
+ bio->bi_iter.bi_size -= nbytes;
+ do {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (nbytes < bv->bv_len) {
+ bv->bv_len -= nbytes;
+ break;
+ }
+
+ bio_release_page(bio, bv->bv_page);
+ bio->bi_vcnt--;
+ nbytes -= bv->bv_len;
+ } while (nbytes);
+
+ if (!bio->bi_vcnt)
+ return -EFAULT;
+ return 0;
+}
+
/**
- * bio_iov_iter_get_pages - add user or kernel pages to a bio
+ * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be added
+ * @len_align_mask: the mask to align the total size to, 0 for any length
*
* This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and
@@ -1317,7 +1336,8 @@ out:
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
*/
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter,
+ unsigned len_align_mask)
{
int ret = 0;
@@ -1336,9 +1356,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- return bio->bi_vcnt ? 0 : ret;
+ if (bio->bi_vcnt)
+ return bio_iov_iter_align_down(bio, iter, len_align_mask);
+ return ret;
}
-EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned);
static void submit_bio_wait_endio(struct bio *bio)
{
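bio_iov_iter_get_pages_aligned() now trims any tail bytes that break the requested alignment via bio_iov_iter_align_down(), shrinking or releasing trailing bio_vecs. A standalone userspace model of that trimming loop (not kernel code; 511 stands in for a 512-byte logical-block length mask):

#include <stdio.h>

struct vec { unsigned int len; };

static int trim_tail(struct vec *v, unsigned int *vcnt, unsigned int size,
		     unsigned int len_align_mask)
{
	unsigned int nbytes = size & len_align_mask;

	if (!nbytes)
		return 0;

	do {
		struct vec *last = &v[*vcnt - 1];

		if (nbytes < last->len) {
			last->len -= nbytes;
			break;
		}
		nbytes -= last->len;
		(*vcnt)--;		/* models bio_release_page() + bi_vcnt-- */
	} while (nbytes);

	return *vcnt ? 0 : -1;		/* -EFAULT when nothing is left */
}

int main(void)
{
	struct vec v[3] = { { 4096 }, { 4096 }, { 100 } };
	unsigned int vcnt = 3, size = 4096 + 4096 + 100;

	if (!trim_tail(v, &vcnt, size, 511))
		printf("kept %u vecs, last len %u\n", vcnt, v[vcnt - 1].len);
	return 0;
}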
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5936db7f8475..f93de34fe87d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -110,12 +110,6 @@ static struct cgroup_subsys_state *blkcg_css(void)
return task_css(current, io_cgrp_id);
}
-static bool blkcg_policy_enabled(struct request_queue *q,
- const struct blkcg_policy *pol)
-{
- return pol && test_bit(pol->plid, q->blkcg_pols);
-}
-
static void blkg_free_workfn(struct work_struct *work)
{
struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
@@ -394,7 +388,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
+ new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_css;
@@ -883,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
disk = ctx->bdev->bd_disk;
q = disk->queue;
- /*
- * blkcg_deactivate_policy() requires queue to be frozen, we can grab
- * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
- */
- ret = blk_queue_enter(q, 0);
- if (ret)
- goto fail;
-
+ /* Prevent racing with blkcg_deactivate_policy() */
+ mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
if (!blkcg_policy_enabled(q, pol)) {
@@ -920,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
/* Drop locks to do new blkg allocation with GFP_KERNEL. */
spin_unlock_irq(&q->queue_lock);
- new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
+ new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto fail_exit_queue;
+ goto fail_exit;
}
if (radix_tree_preload(GFP_KERNEL)) {
blkg_free(new_blkg);
ret = -ENOMEM;
- goto fail_exit_queue;
+ goto fail_exit;
}
spin_lock_irq(&q->queue_lock);
@@ -957,7 +945,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
goto success;
}
success:
- blk_queue_exit(q);
+ mutex_unlock(&q->blkcg_mutex);
ctx->blkg = blkg;
return 0;
@@ -965,9 +953,8 @@ fail_preloaded:
radix_tree_preload_end();
fail_unlock:
spin_unlock_irq(&q->queue_lock);
-fail_exit_queue:
- blk_queue_exit(q);
-fail:
+fail_exit:
+ mutex_unlock(&q->blkcg_mutex);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
@@ -1467,7 +1454,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock);
refcount_set(&blkcg->online_pin, 1);
- INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
+ INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -1630,7 +1617,7 @@ retry:
pd_prealloc = NULL;
} else {
pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
}
if (!pd) {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 81868ad86330..1cce3294634d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -370,11 +370,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q)))
-static inline void blkcg_bio_issue_init(struct bio *bio)
-{
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-}
-
static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
@@ -459,6 +454,12 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}
+static inline bool blkcg_policy_enabled(struct request_queue *q,
+ const struct blkcg_policy *pol)
+{
+ return pol && test_bit(pol->plid, q->blkcg_pols);
+}
+
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else /* CONFIG_BLK_CGROUP */
@@ -491,7 +492,6 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
-static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
diff --git a/block/blk-core.c b/block/blk-core.c
index fdac48aec5ef..14ae73eebe0d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -539,7 +539,7 @@ static inline void bio_check_ro(struct bio *bio)
}
}
-static noinline int should_fail_bio(struct bio *bio)
+int should_fail_bio(struct bio *bio)
{
if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
return -EIO;
@@ -560,6 +560,8 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
+ if (!maxsector)
+ return -EIO;
pr_info_ratelimited("%s: attempt to access beyond end of device\n"
"%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
current->comm, bio->bi_bdev, bio->bi_opf,
@@ -725,10 +727,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
current->bio_list = NULL;
}
-void submit_bio_noacct_nocheck(struct bio *bio)
+void submit_bio_noacct_nocheck(struct bio *bio, bool split)
{
blk_cgroup_bio_start(bio);
- blkcg_bio_issue_init(bio);
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_queue(bio);
@@ -745,12 +746,16 @@ void submit_bio_noacct_nocheck(struct bio *bio)
* to collect a list of requests submitted by a ->submit_bio method while
* it is active, and then process them after it returned.
*/
- if (current->bio_list)
- bio_list_add(&current->bio_list[0], bio);
- else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
+ if (current->bio_list) {
+ if (split)
+ bio_list_add_head(&current->bio_list[0], bio);
+ else
+ bio_list_add(&current->bio_list[0], bio);
+ } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
__submit_bio_noacct_mq(bio);
- else
+ } else {
__submit_bio_noacct(bio);
+ }
}
static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
@@ -871,7 +876,7 @@ void submit_bio_noacct(struct bio *bio)
if (blk_throtl_bio(bio))
return;
- submit_bio_noacct_nocheck(bio);
+ submit_bio_noacct_nocheck(bio, false);
return;
not_supported:
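With the new split argument, the remainder of a split bio is queued at the head of current->bio_list[0], so it is serviced before bios that were queued earlier, keeping the pieces of one split adjacent. A standalone list model of the head-versus-tail ordering, not kernel code:

#include <stdio.h>

struct node { int id; struct node *next; };
struct list { struct node *head, *tail; };

static void add_tail(struct list *l, struct node *n)	/* bio_list_add() */
{
	n->next = NULL;
	if (l->tail)
		l->tail->next = n;
	else
		l->head = n;
	l->tail = n;
}

static void add_head(struct list *l, struct node *n)	/* bio_list_add_head() */
{
	n->next = l->head;
	l->head = n;
	if (!l->tail)
		l->tail = n;
}

int main(void)
{
	struct list l = { NULL, NULL };
	struct node a = { 1 }, b = { 2 }, split_remainder = { 3 };

	add_tail(&l, &a);			/* ordinary submission */
	add_tail(&l, &b);			/* ordinary submission */
	add_head(&l, &split_remainder);		/* the split == true path */

	for (struct node *n = l.head; n; n = n->next)
		printf("%d ", n->id);		/* prints: 3 1 2 */
	printf("\n");
	return 0;
}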
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 005c9157ffb3..86b27f96051a 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio = bio_kmalloc(nr_segs, GFP_NOIO);
if (!bio)
return NULL;
- bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs,
- bio_src->bi_opf);
+ bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
@@ -222,18 +221,14 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
if (++i == BIO_MAX_VECS)
break;
}
- if (num_sectors < bio_sectors(bio)) {
- struct bio *split_bio;
- split_bio = bio_split(bio, num_sectors, GFP_NOIO,
- &crypto_bio_split);
- if (IS_ERR(split_bio)) {
- bio->bi_status = BLK_STS_RESOURCE;
+ if (num_sectors < bio_sectors(bio)) {
+ bio = bio_submit_split_bioset(bio, num_sectors,
+ &crypto_bio_split);
+ if (!bio)
return false;
- }
- bio_chain(split_bio, bio);
- submit_bio_noacct(bio);
- *bio_ptr = split_bio;
+
+ *bio_ptr = bio;
}
return true;
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 056b8948369d..9b27963680dc 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -58,16 +58,14 @@ new_segment:
int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
struct logical_block_metadata_cap __user *argp)
{
- struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+ struct blk_integrity *bi;
struct logical_block_metadata_cap meta_cap = {};
size_t usize = _IOC_SIZE(cmd);
- if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) ||
- _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) ||
- _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) ||
- _IOC_SIZE(cmd) < LBMD_SIZE_VER0)
+ if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0))
return -ENOIOCTLCMD;
+ bi = blk_get_integrity(bdev->bd_disk);
if (!bi)
goto out;
@@ -122,64 +120,6 @@ out:
NULL);
}
-/**
- * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
- * @rq: request to map
- * @sglist: target scatterlist
- *
- * Description: Map the integrity vectors in request into a
- * scatterlist. The scatterlist must be big enough to hold all
- * elements. I.e. sized using blk_rq_count_integrity_sg() or
- * rq->nr_integrity_segments.
- */
-int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
-{
- struct bio_vec iv, ivprv = { NULL };
- struct request_queue *q = rq->q;
- struct scatterlist *sg = NULL;
- struct bio *bio = rq->bio;
- unsigned int segments = 0;
- struct bvec_iter iter;
- int prev = 0;
-
- bio_for_each_integrity_vec(iv, bio, iter) {
- if (prev) {
- if (!biovec_phys_mergeable(q, &ivprv, &iv))
- goto new_segment;
- if (sg->length + iv.bv_len > queue_max_segment_size(q))
- goto new_segment;
-
- sg->length += iv.bv_len;
- } else {
-new_segment:
- if (!sg)
- sg = sglist;
- else {
- sg_unmark_end(sg);
- sg = sg_next(sg);
- }
-
- sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);
- segments++;
- }
-
- prev = 1;
- ivprv = iv;
- }
-
- if (sg)
- sg_mark_end(sg);
-
- /*
- * Something must have been wrong if the figured number of segment
- * is bigger than number of req's physical integrity segments
- */
- BUG_ON(segments > rq->nr_integrity_segments);
- BUG_ON(segments > queue_max_integrity_segments(q));
- return segments;
-}
-EXPORT_SYMBOL(blk_rq_map_integrity_sg);
-
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes)
{
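extensible_ioctl_valid() replaces the open-coded check that the command's direction, type and number match FS_IOC_GETLBMD_CAP and that the caller's structure is at least the v0 size. A standalone model of that predicate on already-decoded fields (the real helper works on the raw command word via the _IOC_* macros; the numbers below are made up):

#include <stdbool.h>
#include <stdio.h>

struct ioc { unsigned int dir, type, nr, size; };

static bool extensible_ioctl_valid(struct ioc cmd, struct ioc ref, unsigned int min_size)
{
	return cmd.dir == ref.dir && cmd.type == ref.type &&
	       cmd.nr == ref.nr && cmd.size >= min_size;
}

int main(void)
{
	struct ioc ref   = { 2, 'f', 8, 64 };	/* hypothetical reference command */
	struct ioc newer = { 2, 'f', 8, 96 };	/* larger struct from newer userspace */
	struct ioc other = { 2, 'f', 9, 96 };	/* different ioctl number */

	printf("%d %d\n", extensible_ioctl_valid(newer, ref, 64),
			  extensible_ioctl_valid(other, ref, 64));	/* 1 0 */
	return 0;
}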
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9fda3906e5f5..d15918d7fabb 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -286,7 +286,7 @@ out:
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
-int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
+int __copy_io(u64 clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 2f8fdecdd7a9..45bd18f68541 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
mod_timer(&blkiolat->timer, jiffies + HZ);
}
-static void iolatency_record_time(struct iolatency_grp *iolat,
- struct bio_issue *issue, u64 now,
- bool issue_as_root)
+static void iolatency_record_time(struct iolatency_grp *iolat, u64 start,
+ u64 now, bool issue_as_root)
{
- u64 start = bio_issue_time(issue);
u64 req_time;
- /*
- * Have to do this so we are truncated to the correct time that our
- * issue is truncated to.
- */
- now = __bio_issue_time(now);
-
if (now <= start)
return;
@@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
* submitted, so do not account for it.
*/
if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
- iolatency_record_time(iolat, &bio->bi_issue, now,
+ iolatency_record_time(iolat, bio->issue_time_ns, now,
issue_as_root);
window_start = atomic64_read(&iolat->window_start);
if (now > window_start &&
@@ -750,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
*/
enabled = atomic_read(&blkiolat->enable_cnt);
if (enabled != blkiolat->enabled) {
+ struct request_queue *q = blkiolat->rqos.disk->queue;
unsigned int memflags;
memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
blkiolat->enabled = enabled;
+ if (enabled)
+ blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q);
blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags);
}
}
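iolatency_record_time() now receives the raw issue timestamp (bio->issue_time_ns, captured only while QUEUE_FLAG_BIO_ISSUE_TIME is set) and no longer truncates it. A trivial standalone model of the remaining computation:

#include <stdio.h>

static long long record_time(unsigned long long start_ns, unsigned long long now_ns)
{
	if (now_ns <= start_ns)
		return -1;			/* sample discarded */
	return (long long)(now_ns - start_ns);	/* req_time in nanoseconds */
}

int main(void)
{
	printf("%lld\n", record_time(1000000, 1250000));	/* 250000 ns */
	printf("%lld\n", record_time(1250000, 1000000));	/* -1, discarded */
	return 0;
}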
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..3030a772d3aa 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned int flags)
{
+ struct folio *zero_folio = largest_zero_folio();
+
while (nr_sects) {
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
struct bio *bio;
@@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
break;
do {
- unsigned int len, added;
+ unsigned int len;
- len = min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
- if (added < len)
+ len = min_t(sector_t, folio_size(zero_folio),
+ nr_sects << SECTOR_SHIFT);
+ if (!bio_add_folio(bio, zero_folio, len, 0))
break;
- nr_sects -= added >> SECTOR_SHIFT;
- sector += added >> SECTOR_SHIFT;
+ nr_sects -= len >> SECTOR_SHIFT;
+ sector += len >> SECTOR_SHIFT;
} while (nr_sects);
*biop = bio_chain_and_submit(*biop, bio);
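__blkdev_issue_zero_pages() now adds the largest available zero folio per iteration instead of single zero pages. A standalone model of how the remaining sector count is consumed, assuming largest_zero_folio() hands back a 2 MiB folio (the real size depends on architecture and configuration):

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned long long sector = 0, nr_sects = 5000;		/* arbitrary example range */
	unsigned long long zero_folio_size = 2ULL << 20;	/* assumed 2 MiB zero folio */

	while (nr_sects) {
		unsigned long long len = nr_sects << SECTOR_SHIFT;

		if (len > zero_folio_size)			/* min_t(sector_t, ...) */
			len = zero_folio_size;
		nr_sects -= len >> SECTOR_SHIFT;
		sector += len >> SECTOR_SHIFT;
		printf("added %llu bytes, next sector %llu\n", len, sector);
	}
	return 0;
}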
diff --git a/block/blk-map.c b/block/blk-map.c
index 23e5d5ebe59e..165f2234f00f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
goto out_bmd;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq));
+ bio_init_inline(bio, NULL, nr_pages, req_op(rq));
if (map_data) {
nr_pages = 1U << map_data->page_order;
@@ -253,10 +253,11 @@ static void blk_mq_map_bio_put(struct bio *bio)
static struct bio *blk_rq_map_bio_alloc(struct request *rq,
unsigned int nr_vecs, gfp_t gfp_mask)
{
+ struct block_device *bdev = rq->q->disk ? rq->q->disk->part0 : NULL;
struct bio *bio;
if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) {
- bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask,
+ bio = bio_alloc_bioset(bdev, nr_vecs, rq->cmd_flags, gfp_mask,
&fs_bio_set);
if (!bio)
return NULL;
@@ -264,7 +265,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq,
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return NULL;
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
+ bio_init_inline(bio, bdev, nr_vecs, req_op(rq));
}
return bio;
}
@@ -326,7 +327,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
+ bio_init_inline(bio, NULL, nr_vecs, op);
if (is_vmalloc_addr(data)) {
bio->bi_private = data;
if (!bio_add_vmalloc(bio, data, len)) {
@@ -392,7 +393,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
+ bio_init_inline(bio, NULL, nr_pages, op);
while (len) {
struct page *page;
@@ -443,7 +444,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
int ret;
/* check that the data layout matches the hardware restrictions */
- ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes);
+ ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0);
if (ret) {
/* if we would have to split the bio, copy instead */
if (ret > 0)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 70d704615be5..37864c5d287e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -104,34 +104,58 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
+/*
+ * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector
+ * @bio: the original bio to be submitted and split
+ * @split_sectors: the sector count at which to split
+ * @bs: the bio set used for allocating the new split bio
+ *
+ * The original bio is modified to contain the remaining sectors and submitted.
+ * The caller is responsible for submitting the returned bio.
+ *
+ * If succeed, the newly allocated bio representing the initial part will be
+ * returned, on failure NULL will be returned and original bio will fail.
+ */
+struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
+ struct bio_set *bs)
+{
+ struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return NULL;
+ }
+
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
+
+ if (should_fail_bio(bio))
+ bio_io_error(bio);
+ else if (!blk_throtl_bio(bio))
+ submit_bio_noacct_nocheck(bio, true);
+
+ return split;
+}
+EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
+
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
- if (unlikely(split_sectors < 0))
- goto error;
+ if (unlikely(split_sectors < 0)) {
+ bio->bi_status = errno_to_blk_status(split_sectors);
+ bio_endio(bio);
+ return NULL;
+ }
if (split_sectors) {
- struct bio *split;
-
- split = bio_split(bio, split_sectors, GFP_NOIO,
+ bio = bio_submit_split_bioset(bio, split_sectors,
&bio->bi_bdev->bd_disk->bio_split);
- if (IS_ERR(split)) {
- split_sectors = PTR_ERR(split);
- goto error;
- }
- split->bi_opf |= REQ_NOMERGE;
- blkcg_bio_issue_init(split);
- bio_chain(split, bio);
- trace_block_split(split, bio->bi_iter.bi_sector);
- WARN_ON_ONCE(bio_zone_write_plugging(bio));
- submit_bio_noacct(bio);
- return split;
+ if (bio)
+ bio->bi_opf |= REQ_NOMERGE;
}
return bio;
-error:
- bio->bi_status = errno_to_blk_status(split_sectors);
- bio_endio(bio);
- return NULL;
}
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
@@ -279,25 +303,30 @@ static unsigned int bio_split_alignment(struct bio *bio,
}
/**
- * bio_split_rw_at - check if and where to split a read/write bio
+ * bio_split_io_at - check if and where to split a bio
* @bio: [in] bio to be split
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
* @max_bytes: [in] maximum number of bytes per bio
+ * @len_align_mask: [in] length alignment mask for each vector
*
* Find out if @bio needs to be split to fit the queue limits in @lim and a
* maximum size of @max_bytes. Returns a negative error number if @bio can't be
* split, 0 if the bio doesn't have to be split, or a positive sector offset if
* @bio needs to be split.
*/
-int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, unsigned max_bytes)
+int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, bytes = 0;
bio_for_each_bvec(bv, bio, iter) {
+ if (bv.bv_offset & lim->dma_alignment ||
+ bv.bv_len & len_align_mask)
+ return -EINVAL;
+
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -339,8 +368,16 @@ split:
* Individual bvecs might not be logical block aligned. Round down the
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
+ *
+ * It is possible to submit a bio that can't be split into a valid io:
+ * it may have too many discontiguous vectors for the max segments
+ * limit, or contain virtual boundary gaps without a valid block sized
+ * split. A zero byte result means one of those conditions occurred.
*/
bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
+ if (!bytes)
+ return -EINVAL;
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@@ -350,7 +387,7 @@ split:
bio_clear_polled(bio);
return bytes >> SECTOR_SHIFT;
}
-EXPORT_SYMBOL_GPL(bio_split_rw_at);
+EXPORT_SYMBOL_GPL(bio_split_io_at);
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs)
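After accumulating bytes, bio_split_io_at() rounds the split point down to the required alignment and now rejects a zero-byte result instead of producing an empty split. A standalone model of that final step (ALIGN_DOWN assumes a power-of-two alignment, as in the kernel):

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))	/* a must be a power of two */

static int split_sectors(unsigned int bytes, unsigned int alignment)
{
	bytes = ALIGN_DOWN(bytes, alignment);
	if (!bytes)
		return -22;		/* -EINVAL: no valid block sized split */
	return bytes >> 9;		/* split point in sectors */
}

int main(void)
{
	printf("%d\n", split_sectors(4096 + 100, 4096));	/* 8 sectors */
	printf("%d\n", split_sectors(100, 4096));		/* -22 */
	return 0;
}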
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7ed3e71f2fc0..4896525b1c05 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -95,6 +95,8 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SQ_SCHED),
QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
+ QUEUE_FLAG_NAME(QOS_ENABLED),
+ QUEUE_FLAG_NAME(BIO_ISSUE_TIME),
};
#undef QUEUE_FLAG_NAME
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ad283017caef..449950029872 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2025 Christoph Hellwig
*/
+#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"
@@ -10,29 +11,38 @@ struct phys_vec {
u32 len;
};
-static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
+static bool __blk_map_iter_next(struct blk_map_iter *iter)
+{
+ if (iter->iter.bi_size)
+ return true;
+ if (!iter->bio || !iter->bio->bi_next)
+ return false;
+
+ iter->bio = iter->bio->bi_next;
+ if (iter->is_integrity) {
+ iter->iter = bio_integrity(iter->bio)->bip_iter;
+ iter->bvecs = bio_integrity(iter->bio)->bip_vec;
+ } else {
+ iter->iter = iter->bio->bi_iter;
+ iter->bvecs = iter->bio->bi_io_vec;
+ }
+ return true;
+}
+
+static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
struct phys_vec *vec)
{
unsigned int max_size;
struct bio_vec bv;
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- if (!iter->bio)
- return false;
- vec->paddr = bvec_phys(&req->special_vec);
- vec->len = req->special_vec.bv_len;
- iter->bio = NULL;
- return true;
- }
-
if (!iter->iter.bi_size)
return false;
- bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
vec->paddr = bvec_phys(&bv);
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
bv.bv_len = min(bv.bv_len, max_size);
- bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+ bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);
/*
* If we are entirely done with this bi_io_vec entry, check if the next
@@ -42,20 +52,16 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
struct bio_vec next;
- if (!iter->iter.bi_size) {
- if (!iter->bio->bi_next)
- break;
- iter->bio = iter->bio->bi_next;
- iter->iter = iter->bio->bi_iter;
- }
+ if (!__blk_map_iter_next(iter))
+ break;
- next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
if (bv.bv_len + next.bv_len > max_size ||
!biovec_phys_mergeable(req->q, &bv, &next))
break;
bv.bv_len += next.bv_len;
- bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
}
vec->len = bv.bv_len;
@@ -125,6 +131,72 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
return true;
}
+static inline void blk_rq_map_iter_init(struct request *rq,
+ struct blk_map_iter *iter)
+{
+ struct bio *bio = rq->bio;
+
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ *iter = (struct blk_map_iter) {
+ .bvecs = &rq->special_vec,
+ .iter = {
+ .bi_size = rq->special_vec.bv_len,
+ }
+ };
+ } else if (bio) {
+ *iter = (struct blk_map_iter) {
+ .bio = bio,
+ .bvecs = bio->bi_io_vec,
+ .iter = bio->bi_iter,
+ };
+ } else {
+ /* the internal flush request may not have bio attached */
+ *iter = (struct blk_map_iter) {};
+ }
+}
+
+static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter,
+ unsigned int total_len)
+{
+ struct phys_vec vec;
+
+ memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
+ iter->status = BLK_STS_OK;
+
+ /*
+ * Grab the first segment ASAP because we'll need it to check for P2P
+ * transfers.
+ */
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+ phys_to_page(vec.paddr))) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ if (iter->iter.is_integrity)
+ bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA;
+ else
+ req->cmd_flags |= REQ_P2PDMA;
+ return blk_dma_map_bus(iter, &vec);
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * P2P transfers through the host bridge are treated the
+ * same as non-P2P transfers below and during unmap.
+ */
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
+ iter->status = BLK_STS_INVAL;
+ return false;
+ }
+
+ if (blk_can_dma_map_iova(req, dma_dev) &&
+ dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+ return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+
/**
* blk_rq_dma_map_iter_start - map the first DMA segment for a request
* @req: request to map
@@ -150,43 +222,9 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
struct dma_iova_state *state, struct blk_dma_iter *iter)
{
- unsigned int total_len = blk_rq_payload_bytes(req);
- struct phys_vec vec;
-
- iter->iter.bio = req->bio;
- iter->iter.iter = req->bio->bi_iter;
- memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
- iter->status = BLK_STS_OK;
-
- /*
- * Grab the first segment ASAP because we'll need it to check for P2P
- * transfers.
- */
- if (!blk_map_iter_next(req, &iter->iter, &vec))
- return false;
-
- if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
- switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
- phys_to_page(vec.paddr))) {
- case PCI_P2PDMA_MAP_BUS_ADDR:
- return blk_dma_map_bus(iter, &vec);
- case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
- /*
- * P2P transfers through the host bridge are treated the
- * same as non-P2P transfers below and during unmap.
- */
- req->cmd_flags &= ~REQ_P2PDMA;
- break;
- default:
- iter->status = BLK_STS_INVAL;
- return false;
- }
- }
-
- if (blk_can_dma_map_iova(req, dma_dev) &&
- dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
- return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
- return blk_dma_map_direct(req, dma_dev, iter, &vec);
+ blk_rq_map_iter_init(req, &iter->iter);
+ return blk_dma_map_iter_start(req, dma_dev, state, iter,
+ blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
@@ -246,16 +284,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
struct scatterlist **last_sg)
{
- struct req_iterator iter = {
- .bio = rq->bio,
- };
+ struct blk_map_iter iter;
struct phys_vec vec;
int nsegs = 0;
- /* the internal flush request may not have bio attached */
- if (iter.bio)
- iter.iter = iter.bio->bi_iter;
-
+ blk_rq_map_iter_init(rq, &iter);
while (blk_map_iter_next(rq, &iter, &vec)) {
*last_sg = blk_next_sg(last_sg, sglist);
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
@@ -275,3 +308,124 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+/**
+ * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
+ * for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are
+ * provided by the caller and don't need to be initialized. @state needs to be
+ * stored for use at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr
+ * and the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ struct blk_dma_iter *iter)
+{
+ unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
+ blk_rq_sectors(req));
+ struct bio *bio = req->bio;
+
+ iter->iter = (struct blk_map_iter) {
+ .bio = bio,
+ .iter = bio_integrity(bio)->bip_iter,
+ .bvecs = bio_integrity(bio)->bip_vec,
+ .is_integrity = true,
+ };
+ return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);
+
+/**
+ * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for
+ * a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Iterate to the next integrity mapping after a previous call to
+ * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description
+ * of the arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+ struct device *dma_dev, struct blk_dma_iter *iter)
+{
+ struct phys_vec vec;
+
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ return blk_dma_map_bus(iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
+
+/**
+ * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
+ * @rq: request to map
+ * @sglist: target scatterlist
+ *
+ * Description: Map the integrity vectors in request into a
+ * scatterlist. The scatterlist must be big enough to hold all
+ * elements. I.e. sized using blk_rq_count_integrity_sg() or
+ * rq->nr_integrity_segments.
+ */
+int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+{
+ struct request_queue *q = rq->q;
+ struct scatterlist *sg = NULL;
+ struct bio *bio = rq->bio;
+ unsigned int segments = 0;
+ struct phys_vec vec;
+
+ struct blk_map_iter iter = {
+ .bio = bio,
+ .iter = bio_integrity(bio)->bip_iter,
+ .bvecs = bio_integrity(bio)->bip_vec,
+ .is_integrity = true,
+ };
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ sg = blk_next_sg(&sg, sglist);
+ sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ segments++;
+ }
+
+ if (sg)
+ sg_mark_end(sg);
+
+ /*
+ * Something must have been wrong if the figured number of segment
+ * is bigger than number of req's physical integrity segments
+ */
+ BUG_ON(segments > rq->nr_integrity_segments);
+ BUG_ON(segments > queue_max_integrity_segments(q));
+ return segments;
+}
+EXPORT_SYMBOL(blk_rq_map_integrity_sg);
+#endif
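blk_map_iter_next() keeps folding the following bio_vec into the current physical segment while the combined length stays under the segment size limit and biovec_phys_mergeable() allows it. A standalone model that reduces the mergeability test to plain address contiguity (the kernel additionally honours virt-boundary and segment-boundary masks), not kernel code:

#include <stdio.h>

struct pvec { unsigned long long paddr; unsigned int len; };

static unsigned int coalesce(const struct pvec *v, unsigned int n,
			     unsigned int i, unsigned int max_size,
			     struct pvec *out)
{
	*out = v[i++];
	while (i < n) {
		if (out->len + v[i].len > max_size)
			break;
		if (out->paddr + out->len != v[i].paddr)	/* not contiguous */
			break;
		out->len += v[i++].len;
	}
	return i;	/* index of the first vector not consumed */
}

int main(void)
{
	const struct pvec v[] = {
		{ 0x1000, 4096 }, { 0x2000, 4096 },	/* contiguous pair */
		{ 0x9000, 4096 },			/* gap: starts a new segment */
	};
	struct pvec seg;
	unsigned int i = 0, n = 3;

	while (i < n) {
		i = coalesce(v, n, i, 65536, &seg);
		printf("segment: paddr=%#llx len=%u\n", seg.paddr, seg.len);
	}
	return 0;
}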
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e2ce4a28e6c9..d06bb137a743 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -454,7 +454,7 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table,
}
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
- unsigned int nr_hw_queues)
+ unsigned int nr_hw_queues, unsigned int nr_requests)
{
unsigned int nr_tags;
int i;
@@ -470,13 +470,8 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
nr_tags * sizeof(struct blk_mq_tags *), gfp);
if (!et)
return NULL;
- /*
- * Default to double of smaller one between hw queue_depth and
- * 128, since we don't split into sync/async like the old code
- * did. Additionally, this is a per-hw queue depth.
- */
- et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
- BLKDEV_DEFAULT_RQ);
+
+ et->nr_requests = nr_requests;
et->nr_hw_queues = nr_hw_queues;
if (blk_mq_is_shared_tags(set->flags)) {
@@ -521,7 +516,8 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
* concurrently.
*/
if (q->elevator) {
- et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+ et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
+ blk_mq_default_nr_requests(set));
if (!et)
goto out_unwind;
if (xa_insert(et_table, q->id, et, gfp))
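The per-hardware-queue scheduler tag depth is now passed in by the caller (blk_mq_default_nr_requests(set) on the elevator-switch path). The removed comment documents the previous default of twice the smaller of the hardware queue depth and BLKDEV_DEFAULT_RQ (128); a standalone model of that formula, without asserting that the new helper computes exactly the same value:

#include <stdio.h>

#define BLKDEV_DEFAULT_RQ	128

/* models the removed "2 * min(queue_depth, BLKDEV_DEFAULT_RQ)" default */
static unsigned int default_nr_requests(unsigned int queue_depth)
{
	unsigned int d = queue_depth < BLKDEV_DEFAULT_RQ ?
			 queue_depth : BLKDEV_DEFAULT_RQ;

	return 2 * d;
}

int main(void)
{
	printf("%u\n", default_nr_requests(64));	/* 128 */
	printf("%u\n", default_nr_requests(1024));	/* 256 */
	return 0;
}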
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index b554e1d55950..8e21a6b1415d 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -24,7 +24,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
- unsigned int nr_hw_queues);
+ unsigned int nr_hw_queues, unsigned int nr_requests);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
@@ -92,4 +92,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
+static inline void blk_mq_set_min_shallow_depth(struct request_queue *q,
+ unsigned int depth)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+ depth);
+}
+
#endif
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 24656980f443..58ec293373c6 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -34,7 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
kfree(hctx->ctxs);
@@ -150,9 +149,11 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
return;
hctx_for_each_ctx(hctx, ctx, i)
- kobject_del(&ctx->kobj);
+ if (ctx->kobj.state_in_sysfs)
+ kobject_del(&ctx->kobj);
- kobject_del(&hctx->kobj);
+ if (hctx->kobj.state_in_sysfs)
+ kobject_del(&hctx->kobj);
}
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d880c50629d6..c7a4d4b9cc87 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -8,6 +8,9 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/kmemleak.h>
#include <linux/delay.h>
#include "blk.h"
@@ -253,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
unsigned int bitnr)
{
struct request *rq;
- unsigned long flags;
- spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
- spin_unlock_irqrestore(&tags->lock, flags);
return rq;
}
@@ -297,15 +297,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
- * @q: Request queue to examine.
+ * @q: Request queue @hctx is associated with (@hctx->queue).
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
* associated with @hctx that has been assigned a driver tag.
- * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
- * where rq is a pointer to a request. Return true to continue
- * iterating tags, false to stop.
- * @data: Will be passed as third argument to @fn.
+ * @fn will be called as follows: @fn(rq, @data) where rq is a
+ * pointer to a request. Return %true to continue iterating tags;
+ * %false to stop.
+ * @data: Will be passed as second argument to @fn.
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
@@ -371,9 +371,9 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each started
- * request. @fn will be called as follows: @fn(rq, @data,
- * @reserved) where rq is a pointer to a request. Return true
- * to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @data) where rq
+ * is a pointer to a request. Return %true to continue iterating
+ * tags; %false to stop.
* @data: Will be passed as second argument to @fn.
* @flags: BT_TAG_ITER_*
*/
@@ -406,10 +406,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
* blk_mq_all_tag_iter - iterate over all requests in a tag map
* @tags: Tag map to iterate over.
* @fn: Pointer to the function that will be called for each
- * request. @fn will be called as follows: @fn(rq, @priv,
- * reserved) where rq is a pointer to a request. 'reserved'
- * indicates whether or not @rq is a reserved request. Return
- * true to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @priv) where rq
+ * is a pointer to a request. Return %true to continue iterating
+ * tags; %false to stop.
* @priv: Will be passed as second argument to @fn.
*
* Caller has to pass the tag map from which requests are allocated.
@@ -424,10 +423,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
* blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
* @tagset: Tag set to iterate over.
* @fn: Pointer to the function that will be called for each started
- * request. @fn will be called as follows: @fn(rq, @priv,
- * reserved) where rq is a pointer to a request. 'reserved'
- * indicates whether or not @rq is a reserved request. Return
- * true to continue iterating tags, false to stop.
+ * request. @fn will be called as follows: @fn(rq, @priv) where
+ * rq is a pointer to a request. Return true to continue iterating
+ * tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*
* We grab one request reference before calling @fn and release it after
@@ -437,7 +435,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
unsigned int flags = tagset->flags;
- int i, nr_tags;
+ int i, nr_tags, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&tagset->tags_srcu);
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
@@ -446,6 +446,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
}
+ srcu_read_unlock(&tagset->tags_srcu, srcu_idx);
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
@@ -483,11 +484,10 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.
* @fn: Pointer to the function that will be called for each request
- * on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
- * reserved) where rq is a pointer to a request and hctx points
- * to the hardware queue associated with the request. 'reserved'
- * indicates whether or not @rq is a reserved request.
- * @priv: Will be passed as third argument to @fn.
+ * on @q. @fn will be called as follows: @fn(rq, @priv) where rq
+ * is a pointer to a request.
+ * @priv: Will be passed as second argument to @fn.
*
* Note: if @q->tag_set is shared with other request queues then @fn will be
* called for all requests on all queues that share that tag set and not only
@@ -496,6 +496,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
+ int srcu_idx;
+
/*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
@@ -504,6 +506,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
+ srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu);
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
struct blk_mq_tags *tags = q->tag_set->shared_tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
@@ -533,6 +536,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
bt_for_each(hctx, q, btags, fn, priv, false);
}
}
+ srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx);
blk_queue_exit(q);
}
@@ -562,6 +566,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
+ INIT_LIST_HEAD(&tags->page_list);
+
if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
goto out_free_tags;
if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node))
@@ -576,63 +582,37 @@ out_free_tags:
return NULL;
}
-void blk_mq_free_tags(struct blk_mq_tags *tags)
+static void blk_mq_free_tags_callback(struct rcu_head *head)
{
- sbitmap_queue_free(&tags->bitmap_tags);
- sbitmap_queue_free(&tags->breserved_tags);
- kfree(tags);
-}
-
-int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_tags **tagsptr, unsigned int tdepth,
- bool can_grow)
-{
- struct blk_mq_tags *tags = *tagsptr;
-
- if (tdepth <= tags->nr_reserved_tags)
- return -EINVAL;
-
- /*
- * If we are allowed to grow beyond the original size, allocate
- * a new set of tags before freeing the old one.
- */
- if (tdepth > tags->nr_tags) {
- struct blk_mq_tag_set *set = hctx->queue->tag_set;
- struct blk_mq_tags *new;
-
- if (!can_grow)
- return -EINVAL;
-
- /*
- * We need some sort of upper limit, set it high enough that
- * no valid use cases should require more.
- */
- if (tdepth > MAX_SCHED_RQ)
- return -EINVAL;
+ struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags,
+ rcu_head);
+ struct page *page;
+ while (!list_empty(&tags->page_list)) {
+ page = list_first_entry(&tags->page_list, struct page, lru);
+ list_del_init(&page->lru);
/*
- * Only the sbitmap needs resizing since we allocated the max
- * initially.
+ * Remove kmemleak object previously allocated in
+ * blk_mq_alloc_rqs().
*/
- if (blk_mq_is_shared_tags(set->flags))
- return 0;
+ kmemleak_free(page_address(page));
+ __free_pages(page, page->private);
+ }
+ kfree(tags);
+}
- new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
- if (!new)
- return -ENOMEM;
+void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
+{
+ sbitmap_queue_free(&tags->bitmap_tags);
+ sbitmap_queue_free(&tags->breserved_tags);
- blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
- *tagsptr = new;
- } else {
- /*
- * Don't need (or can't) update reserved tags here, they
- * remain static and should never need resizing.
- */
- sbitmap_queue_resize(&tags->bitmap_tags,
- tdepth - tags->nr_reserved_tags);
+ /* if the tag pages were never allocated, free tags directly */
+ if (list_empty(&tags->page_list)) {
+ kfree(tags);
+ return;
}
- return 0;
+ call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback);
}
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b67d6c02eceb..09f579414161 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -396,6 +396,15 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
#endif
}
+static inline void blk_mq_bio_issue_init(struct request_queue *q,
+ struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags))
+ bio->issue_time_ns = blk_time_get_ns();
+#endif
+}
+
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag)
{
@@ -3168,6 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
goto queue_exit;
+ blk_mq_bio_issue_init(q, bio);
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
@@ -3415,7 +3425,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
struct blk_mq_tags *tags)
{
struct page *page;
- unsigned long flags;
/*
* There is no need to clear mapping if driver tags is not initialized
@@ -3439,22 +3448,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
}
}
}
-
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&drv_tags->lock, flags);
- spin_unlock_irqrestore(&drv_tags->lock, flags);
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags;
- struct page *page;
if (list_empty(&tags->page_list))
return;
@@ -3478,27 +3477,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
blk_mq_clear_rq_mapping(drv_tags, tags);
-
- while (!list_empty(&tags->page_list)) {
- page = list_first_entry(&tags->page_list, struct page, lru);
- list_del_init(&page->lru);
- /*
- * Remove kmemleak object previously allocated in
- * blk_mq_alloc_rqs().
- */
- kmemleak_free(page_address(page));
- __free_pages(page, page->private);
- }
+ /*
+ * Free request pages in SRCU callback, which is called from
+ * blk_mq_free_tags().
+ */
}
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
}
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
@@ -3560,7 +3552,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
err_free_rqs:
kfree(tags->rqs);
err_free_tags:
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(set, tags);
return NULL;
}
@@ -3590,8 +3582,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- INIT_LIST_HEAD(&tags->page_list);
-
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
@@ -3678,8 +3668,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
struct rq_iter_data data = {
.hctx = hctx,
};
+ int srcu_idx;
+ srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu);
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx);
+
return data.has_rq;
}
@@ -3899,7 +3893,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
unsigned int queue_depth, struct request *flush_rq)
{
int i;
- unsigned long flags;
/* The hw queue may not be mapped yet */
if (!tags)
@@ -3909,15 +3902,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
+}
- /*
- * Wait until all pending iteration is done.
- *
- * Request reference is cleared and it is guaranteed to be observed
- * after the ->lock is released.
- */
- spin_lock_irqsave(&tags->lock, flags);
- spin_unlock_irqrestore(&tags->lock, flags);
+static void blk_free_flush_queue_callback(struct rcu_head *head)
+{
+ struct blk_flush_queue *fq =
+ container_of(head, struct blk_flush_queue, rcu_head);
+
+ blk_free_flush_queue(fq);
}
/* hctx->ctxs will be freed in queue's release handler */
@@ -3939,6 +3931,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
+ call_srcu(&set->tags_srcu, &hctx->fq->rcu_head,
+ blk_free_flush_queue_callback);
+ hctx->fq = NULL;
+
xa_erase(&q->hctx_table, hctx_idx);
spin_lock(&q->unused_hctx_lock);
@@ -3964,13 +3960,19 @@ static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
+ if (!hctx->fq)
+ goto fail;
+
hctx->queue_num = hctx_idx;
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
- goto fail;
+ goto fail_free_fq;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
@@ -3987,6 +3989,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
+ fail_free_fq:
+ blk_free_flush_queue(hctx->fq);
+ hctx->fq = NULL;
fail:
return -1;
}
@@ -4038,16 +4043,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
- hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
- if (!hctx->fq)
- goto free_bitmap;
-
blk_mq_hctx_kobj_init(hctx);
return hctx;
- free_bitmap:
- sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
free_cpumask:
@@ -4101,7 +4100,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
if (ret) {
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
return NULL;
}
@@ -4129,7 +4128,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
{
if (tags) {
blk_mq_free_rqs(set, tags, hctx_idx);
- blk_mq_free_rq_map(tags);
+ blk_mq_free_rq_map(set, tags);
}
}
@@ -4828,6 +4827,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_srcu;
}
+ ret = init_srcu_struct(&set->tags_srcu);
+ if (ret)
+ goto out_cleanup_srcu;
init_rwsem(&set->update_nr_hwq_lock);
@@ -4836,7 +4838,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node);
if (!set->tags)
- goto out_cleanup_srcu;
+ goto out_cleanup_tags_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
@@ -4865,6 +4867,8 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_tags_srcu:
+ cleanup_srcu_struct(&set->tags_srcu);
out_cleanup_srcu:
if (set->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(set->srcu);
@@ -4910,6 +4914,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+
+ srcu_barrier(&set->tags_srcu);
+ cleanup_srcu_struct(&set->tags_srcu);
if (set->flags & BLK_MQ_F_BLOCKING) {
cleanup_srcu_struct(set->srcu);
kfree(set->srcu);
@@ -4917,57 +4924,59 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *et,
+ unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *old_et = NULL;
struct blk_mq_hw_ctx *hctx;
- int ret;
unsigned long i;
- if (WARN_ON_ONCE(!q->mq_freeze_depth))
- return -EINVAL;
-
- if (!set)
- return -EINVAL;
-
- if (q->nr_requests == nr)
- return 0;
-
blk_mq_quiesce_queue(q);
- ret = 0;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->tags)
- continue;
+ if (blk_mq_is_shared_tags(set->flags)) {
/*
- * If we're using an MQ scheduler, just update the scheduler
- * queue depth. This is similar to what the old code would do.
+ * For shared sched tags, the max depth is allocated initially,
+ * hence tags can't grow; see blk_mq_alloc_sched_tags().
*/
- if (hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
- nr, true);
- } else {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
- false);
+ if (q->elevator)
+ blk_mq_tag_update_sched_shared_tags(q);
+ else
+ blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (!q->elevator) {
+ /*
+ * Non-shared hardware tags, nr is already validated in
+ * queue_requests_store() and tags can't grow.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->tags)
+ continue;
+ sbitmap_queue_resize(&hctx->tags->bitmap_tags,
+ nr - hctx->tags->nr_reserved_tags);
}
- if (ret)
- break;
- if (q->elevator && q->elevator->type->ops.depth_updated)
- q->elevator->type->ops.depth_updated(hctx);
- }
- if (!ret) {
- q->nr_requests = nr;
- if (blk_mq_is_shared_tags(set->flags)) {
- if (q->elevator)
- blk_mq_tag_update_sched_shared_tags(q);
- else
- blk_mq_tag_resize_shared_tags(set, nr);
+ } else if (nr <= q->elevator->et->nr_requests) {
+ /* Non-shared sched tags, and tags don't grow. */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->sched_tags)
+ continue;
+ sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags,
+ nr - hctx->sched_tags->nr_reserved_tags);
}
+ } else {
+ /* Non-shared sched tags, and tags grow */
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = et->tags[i];
+ old_et = q->elevator->et;
+ q->elevator->et = et;
}
- blk_mq_unquiesce_queue(q);
+ q->nr_requests = nr;
+ if (q->elevator && q->elevator->type->ops.depth_updated)
+ q->elevator->type->ops.depth_updated(q);
- return ret;
+ blk_mq_unquiesce_queue(q);
+ return old_et;
}
/*
@@ -5033,6 +5042,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
unsigned int memflags;
int i;
struct xarray elv_tbl, et_tbl;
+ bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -5056,9 +5066,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_sysfs_unregister_hctxs(q);
}
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue_nomemsave(q);
-
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
@@ -5068,6 +5075,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (blk_mq_elv_switch_none(q, &elv_tbl))
goto switch_back;
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+ queues_frozen = true;
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto switch_back;
@@ -5091,8 +5101,12 @@ fallback:
}
switch_back:
/* The blk_mq_elv_switch_back unfreezes queue for us. */
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /* switch_back expects queue to be frozen */
+ if (!queues_frozen)
+ blk_mq_freeze_queue_nomemsave(q);
blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
+ }
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index affb2e14b56e..af42dc018808 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -6,6 +6,7 @@
#include "blk-stat.h"
struct blk_mq_tag_set;
+struct elevator_tags;
struct blk_mq_ctxs {
struct kobject kobj;
@@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
+struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
+ struct elevator_tags *tags,
+ unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
bool);
@@ -59,7 +62,7 @@ void blk_mq_put_rq_ref(struct request *rq);
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
@@ -110,6 +113,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
}
/*
+ * Default to twice the smaller of the hw queue_depth and 128, since we
+ * don't split into sync/async like the old code did. Note that this is
+ * a per-hw-queue depth.
+ */
+static inline unsigned int blk_mq_default_nr_requests(
+ struct blk_mq_tag_set *set)
+{
+ return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ);
+}
+
+/*
* sysfs helpers
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
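As a quick worked example of the default above (assuming BLKDEV_DEFAULT_RQ is 128, per the comment):

	/* set->queue_depth = 31   -> 2 * min(31, 128)   = 62  requests per hw queue */
	/* set->queue_depth = 1024 -> 2 * min(1024, 128) = 256 requests per hw queue */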
@@ -162,7 +176,7 @@ struct blk_mq_alloc_data {
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags, unsigned int flags, int node);
-void blk_mq_free_tags(struct blk_mq_tags *tags);
+void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
@@ -170,8 +184,6 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
-int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_tags **tags, unsigned int depth, bool can_grow);
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size);
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 848591fb3c57..654478dfbc20 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -2,8 +2,6 @@
#include "blk-rq-qos.h"
-__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);
-
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@@ -319,8 +317,8 @@ void rq_qos_exit(struct request_queue *q)
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
- static_branch_dec(&block_rq_qos);
}
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
mutex_unlock(&q->rq_qos_mutex);
}
@@ -346,7 +344,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
- static_branch_inc(&block_rq_qos);
+ blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
@@ -377,6 +375,8 @@ void rq_qos_del(struct rq_qos *rqos)
break;
}
}
+ if (!q->rq_qos)
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
mutex_lock(&q->debugfs_mutex);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 39749f4066fb..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -12,7 +12,6 @@
#include "blk-mq-debugfs.h"
struct blk_mq_debugfs_attr;
-extern struct static_key_false block_rq_qos;
enum rq_qos_id {
RQ_QOS_WBT,
@@ -113,43 +112,58 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
- !blk_rq_is_passthrough(rq))
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) &&
- bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
- bio_flagged(bio, BIO_QOS_MERGED))) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
- }
+ struct request_queue *q;
+
+ if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) &&
+ !bio_flagged(bio, BIO_QOS_MERGED)))
+ return;
+
+ q = bdev_get_queue(bio->bi_bdev);
+
+ /*
+ * A BIO may carry BIO_QOS_* flags even if the associated request_queue
+ * does not have rq_qos enabled. This can happen with stacked block
+ * devices — for example, NVMe multipath, where it's possible that the
+ * bottom device has QoS enabled but the top device does not. Therefore,
+ * always verify that q->rq_qos is present and QoS is enabled before
+ * calling __rq_qos_done_bio().
+ */
+ if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@@ -158,14 +172,16 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@@ -173,7 +189,8 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 07874e9b609f..54cffaae4df4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,6 +56,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
+ lim->atomic_write_hw_max = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -157,16 +158,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_NONE:
if (bi->pi_tuple_size) {
- pr_warn("pi_tuple_size must be 0 when checksum type \
- is none\n");
+ pr_warn("pi_tuple_size must be 0 when checksum type is none\n");
return -EINVAL;
}
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) {
- pr_warn("pi_tuple_size mismatch for T10 PI: expected \
- %zu, got %u\n",
+ pr_warn("pi_tuple_size mismatch for T10 PI: expected %zu, got %u\n",
sizeof(struct t10_pi_tuple),
bi->pi_tuple_size);
return -EINVAL;
@@ -174,8 +173,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
break;
case BLK_INTEGRITY_CSUM_CRC64:
if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) {
- pr_warn("pi_tuple_size mismatch for CRC64 PI: \
- expected %zu, got %u\n",
+ pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n",
sizeof(struct crc64_pi_tuple),
bi->pi_tuple_size);
return -EINVAL;
@@ -226,6 +224,27 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
}
+/*
+ * Test whether the boundary, if any, is aligned with the chunk size, if
+ * any. Stacked devices store the stripe size in t->chunk_sectors.
+ */
+static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors,
+ unsigned int boundary_sectors)
+{
+ if (!chunk_sectors || !boundary_sectors)
+ return true;
+
+ if (boundary_sectors > chunk_sectors &&
+ boundary_sectors % chunk_sectors)
+ return false;
+
+ if (chunk_sectors > boundary_sectors &&
+ chunk_sectors % boundary_sectors)
+ return false;
+
+ return true;
+}
+
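A few worked cases for the helper above (sector counts are illustrative):

	/* chunk_sectors = 0 or boundary_sectors = 0   -> true  (nothing to align) */
	/* chunk_sectors = 256, boundary_sectors = 64  -> true  (256 % 64 == 0)    */
	/* chunk_sectors = 64,  boundary_sectors = 256 -> true  (256 % 64 == 0)    */
	/* chunk_sectors = 256, boundary_sectors = 96  -> false (256 % 96 != 0)    */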
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
@@ -235,6 +254,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
+ /* UINT_MAX indicates stacked limits in initial state */
+ if (lim->atomic_write_hw_max == UINT_MAX)
+ goto unsupported;
+
if (!lim->atomic_write_hw_max)
goto unsupported;
@@ -262,20 +285,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
if (WARN_ON_ONCE(lim->atomic_write_hw_max >
lim->atomic_write_hw_boundary))
goto unsupported;
- /*
- * A feature of boundary support is that it disallows bios to
- * be merged which would result in a merged request which
- * crosses either a chunk sector or atomic write HW boundary,
- * even though chunk sectors may be just set for performance.
- * For simplicity, disallow atomic writes for a chunk sector
- * which is non-zero and smaller than atomic write HW boundary.
- * Furthermore, chunk sectors must be a multiple of atomic
- * write HW boundary. Otherwise boundary support becomes
- * complicated.
- * Devices which do not conform to these rules can be dealt
- * with if and when they show up.
- */
- if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors))
+
+ if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary(
+ lim->chunk_sectors, boundary_sectors)))
goto unsupported;
/*
@@ -642,25 +654,6 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t,
return true;
}
-/* Check for valid boundary of first bottom device */
-static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
- struct queue_limits *b)
-{
- /*
- * Ensure atomic write boundary is aligned with chunk sectors. Stacked
- * devices store chunk sectors in t->io_min.
- */
- if (b->atomic_write_hw_boundary > t->io_min &&
- b->atomic_write_hw_boundary % t->io_min)
- return false;
- if (t->io_min > b->atomic_write_hw_boundary &&
- t->io_min % b->atomic_write_hw_boundary)
- return false;
-
- t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
- return true;
-}
-
static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
{
unsigned int chunk_bytes;
@@ -698,13 +691,14 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
static bool blk_stack_atomic_writes_head(struct queue_limits *t,
struct queue_limits *b)
{
- if (b->atomic_write_hw_boundary &&
- !blk_stack_atomic_writes_boundary_head(t, b))
+ if (!blk_valid_atomic_writes_boundary(t->chunk_sectors,
+ b->atomic_write_hw_boundary >> SECTOR_SHIFT))
return false;
t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
t->atomic_write_hw_max = b->atomic_write_hw_max;
+ t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
return true;
}
@@ -720,18 +714,14 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t,
if (!blk_atomic_write_start_sect_aligned(start, b))
goto unsupported;
- /*
- * If atomic_write_hw_max is set, we have already stacked 1x bottom
- * device, so check for compliance.
- */
- if (t->atomic_write_hw_max) {
+ /* UINT_MAX indicates no stacking of bottom devices yet */
+ if (t->atomic_write_hw_max == UINT_MAX) {
+ if (!blk_stack_atomic_writes_head(t, b))
+ goto unsupported;
+ } else {
if (!blk_stack_atomic_writes_tail(t, b))
goto unsupported;
- return;
}
-
- if (!blk_stack_atomic_writes_head(t, b))
- goto unsupported;
blk_stack_atomic_writes_chunk_sectors(t);
return;
@@ -766,7 +756,8 @@ unsupported:
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t start)
{
- unsigned int top, bottom, alignment, ret = 0;
+ unsigned int top, bottom, alignment;
+ int ret = 0;
t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
@@ -972,6 +963,8 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
goto incompatible;
if (ti->csum_type != bi->csum_type)
goto incompatible;
+ if (ti->pi_tuple_size != bi->pi_tuple_size)
+ goto incompatible;
if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
(bi->flags & BLK_INTEGRITY_REF_TAG))
goto incompatible;
@@ -980,6 +973,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
ti->csum_type = bi->csum_type;
+ ti->pi_tuple_size = bi->pi_tuple_size;
ti->metadata_size = bi->metadata_size;
ti->pi_offset = bi->pi_offset;
ti->interval_exp = bi->interval_exp;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 396cded255ea..76c47fe9b8d6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -64,28 +64,66 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page)
static ssize_t
queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
- unsigned long nr;
- int ret, err;
- unsigned int memflags;
struct request_queue *q = disk->queue;
-
- if (!queue_is_mq(q))
- return -EINVAL;
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct elevator_tags *et = NULL;
+ unsigned int memflags;
+ unsigned long nr;
+ int ret;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
- memflags = blk_mq_freeze_queue(q);
- mutex_lock(&q->elevator_lock);
+ /*
+ * Serialize nr_requests updates with concurrent queue_requests_store()
+ * calls and with elevator switching.
+ */
+ down_write(&set->update_nr_hwq_lock);
+
+ if (nr == q->nr_requests)
+ goto unlock;
+
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
- err = blk_mq_update_nr_requests(disk->queue, nr);
- if (err)
- ret = err;
+ /*
+ * Switching elevator is protected by update_nr_hwq_lock:
+ * - read lock is held from elevator sysfs attribute;
+ * - write lock is held from updating nr_hw_queues;
+ * Hence it's safe to access q->elevator here with write lock held.
+ */
+ if (nr <= set->reserved_tags ||
+ (q->elevator && nr > MAX_SCHED_RQ) ||
+ (!q->elevator && nr > set->queue_depth)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!blk_mq_is_shared_tags(set->flags) && q->elevator &&
+ nr > q->elevator->et->nr_requests) {
+ /*
+ * Tags will grow; allocate memory before freezing the queue
+ * to prevent deadlock.
+ */
+ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr);
+ if (!et) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ }
+
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
+ et = blk_mq_update_nr_requests(q, et, nr);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+
+unlock:
+ up_write(&set->update_nr_hwq_lock);
return ret;
}
@@ -620,6 +658,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
if (val < -1)
return -EINVAL;
+ /*
+ * Ensure that the queue is idled, in case the latency update
+ * ends up either enabling or disabling wbt completely. We can't
+ * have IO inflight if that happens.
+ */
memflags = blk_mq_freeze_queue(q);
rqos = wbt_rq_qos(q);
@@ -638,11 +681,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
if (wbt_get_min_lat(q) == val)
goto out;
- /*
- * Ensure that the queue is idled, in case the latency update
- * ends up either enabling or disabling wbt completely. We can't
- * have IO inflight if that happens.
- */
blk_mq_quiesce_queue(q);
mutex_lock(&disk->rqos_state_mutex);
@@ -847,7 +885,7 @@ static void blk_queue_release(struct kobject *kobj)
/* nothing to do here, all data is associated with the parent gendisk */
}
-static const struct kobj_type blk_queue_ktype = {
+const struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_queue_release,
@@ -875,15 +913,14 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
- kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
- goto out_put_queue_kobj;
+ return ret;
if (queue_is_mq(q)) {
ret = blk_mq_sysfs_register(disk);
if (ret)
- goto out_put_queue_kobj;
+ goto out_del_queue_kobj;
}
mutex_lock(&q->sysfs_lock);
@@ -903,9 +940,9 @@ int blk_register_queue(struct gendisk *disk)
if (queue_is_mq(q))
elevator_set_default(q);
- wbt_enable_default(disk);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
@@ -934,8 +971,8 @@ out_debugfs_remove:
mutex_unlock(&q->sysfs_lock);
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
-out_put_queue_kobj:
- kobject_put(&disk->queue_kobj);
+out_del_queue_kobj:
+ kobject_del(&disk->queue_kobj);
return ret;
}
@@ -986,5 +1023,4 @@ void blk_unregister_queue(struct gendisk *disk)
elevator_set_none(q);
blk_debugfs_remove(disk);
- kobject_put(&disk->queue_kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 397b6a410f9e..2c5b64b1a724 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bio_list_on_stack)))
- submit_bio_noacct_nocheck(bio);
+ submit_bio_noacct_nocheck(bio, false);
blk_finish_plug(&plug);
}
}
@@ -1327,17 +1327,13 @@ static int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- /*
- * Freeze queue before activating policy, to synchronize with IO path,
- * which is protected by 'q_usage_counter'.
- */
memflags = blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
q->td = td;
td->queue = q;
- /* activate policy */
+ /* activate policy, blk_throtl_activated() will return true */
ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
if (ret) {
q->td = NULL;
@@ -1846,12 +1842,15 @@ void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- if (!blk_throtl_activated(q))
+ /*
+ * blkg_destroy_all() has already deactivated the throtl policy; just
+ * check and free the throtl data.
+ */
+ if (!q->td)
return;
timer_delete_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
- blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
kfree(q->td);
}
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 3b27755bfbff..9d7a42c039a1 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -156,7 +156,13 @@ void blk_throtl_cancel_bios(struct gendisk *disk);
static inline bool blk_throtl_activated(struct request_queue *q)
{
- return q->td != NULL;
+ /*
+ * q->td guarantees that the blk-throttle module is already loaded,
+ * and the plid of blk-throttle is assigned.
+ * blkcg_policy_enabled() guarantees that the policy is activated
+ * in the request_queue.
+ */
+ return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl);
}
static inline bool blk_should_throtl(struct bio *bio)
@@ -164,11 +170,6 @@ static inline bool blk_should_throtl(struct bio *bio)
struct throtl_grp *tg;
int rw = bio_data_dir(bio);
- /*
- * This is called under bio_queue_enter(), and it's synchronized with
- * the activation of blk-throtl, which is protected by
- * blk_mq_freeze_queue().
- */
if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
return false;
@@ -194,7 +195,10 @@ static inline bool blk_should_throtl(struct bio *bio)
static inline bool blk_throtl_bio(struct bio *bio)
{
-
+ /*
+ * block throttling takes effect if the policy is activated
+ * in the bio's request_queue.
+ */
if (!blk_should_throtl(bio))
return false;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index a50d4cd55f41..eb8037bae0bd 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -85,8 +85,8 @@ struct rq_wb {
u64 sync_issue;
void *sync_cookie;
- unsigned long last_issue; /* last non-throttled issue */
- unsigned long last_comp; /* last non-throttled comp */
+ unsigned long last_issue; /* issue time of last read rq */
+ unsigned long last_comp; /* completion time of last read rq */
unsigned long min_lat_nsec;
struct rq_qos rqos;
struct rq_wait rq_wait[WBT_NUM_RWQ];
@@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq)
struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
- if (rwb->sync_cookie == rq) {
- rwb->sync_issue = 0;
- rwb->sync_cookie = NULL;
- }
+ if (wbt_is_read(rq)) {
+ if (rwb->sync_cookie == rq) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
- if (wbt_is_read(rq))
wb_timestamp(rwb, &rwb->last_comp);
+ }
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
__wbt_done(rqos, wbt_flags(rq));
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ef43aaca49f4..5e2a5788dc3b 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1286,14 +1286,14 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
+ bool prepared;
/*
* Submit the next plugged BIO. If we do not have any, clear
* the plugged flag.
*/
- spin_lock_irqsave(&zwplug->lock, flags);
-
again:
+ spin_lock_irqsave(&zwplug->lock, flags);
bio = bio_list_pop(&zwplug->bio_list);
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
@@ -1304,13 +1304,14 @@ again:
trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
bio->bi_iter.bi_sector, bio_sectors(bio));
- if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ if (!prepared) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
}
- spin_unlock_irqrestore(&zwplug->lock, flags);
-
bdev = bio->bi_bdev;
/*
diff --git a/block/blk.h b/block/blk.h
index 0a2eccf28ca4..170794632135 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,7 @@ struct elevator_tags;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
+extern const struct kobj_type blk_queue_ktype;
extern struct dentry *blk_debugfs_root;
struct blk_flush_queue {
@@ -40,6 +41,7 @@ struct blk_flush_queue {
struct list_head flush_queue[2];
unsigned long flush_data_in_flight;
struct request *flush_rq;
+ struct rcu_head rcu_head;
};
bool is_flush_rq(struct request *req);
@@ -53,7 +55,7 @@ bool blk_queue_start_drain(struct request_queue *q);
bool __blk_freeze_queue_start(struct request_queue *q,
struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
-void submit_bio_noacct_nocheck(struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio, bool split);
void bio_await_chain(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
@@ -614,6 +616,7 @@ extern const struct address_space_operations def_blk_aops;
int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
+int should_fail_bio(struct bio *bio);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
@@ -679,48 +682,6 @@ static inline ktime_t blk_time_get(void)
return ns_to_ktime(blk_time_get_ns());
}
-/*
- * From most significant bit:
- * 1 bit: reserved for other usage, see below
- * 12 bits: original size of bio
- * 51 bits: issue time of bio
- */
-#define BIO_ISSUE_RES_BITS 1
-#define BIO_ISSUE_SIZE_BITS 12
-#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
-#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
-#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
-#define BIO_ISSUE_SIZE_MASK \
- (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
-#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
-
-/* Reserved bit for blk-throtl */
-#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
-
-static inline u64 __bio_issue_time(u64 time)
-{
- return time & BIO_ISSUE_TIME_MASK;
-}
-
-static inline u64 bio_issue_time(struct bio_issue *issue)
-{
- return __bio_issue_time(issue->value);
-}
-
-static inline sector_t bio_issue_size(struct bio_issue *issue)
-{
- return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
-}
-
-static inline void bio_issue_init(struct bio_issue *issue,
- sector_t size)
-{
- size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
- issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
- (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) |
- ((u64)size << BIO_ISSUE_SIZE_SHIFT));
-}
-
void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file);
diff --git a/block/elevator.c b/block/elevator.c
index fe96c6f4753c..e2ebfbf107b3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -669,7 +669,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
lockdep_assert_held(&set->update_nr_hwq_lock);
if (strncmp(ctx->name, "none", 4)) {
- ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+ ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues,
+ blk_mq_default_nr_requests(set));
if (!ctx->et)
return -ENOMEM;
}
diff --git a/block/elevator.h b/block/elevator.h
index adc5c157e17e..c4d20155065e 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -37,7 +37,7 @@ struct elevator_mq_ops {
void (*exit_sched)(struct elevator_queue *);
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
- void (*depth_updated)(struct blk_mq_hw_ctx *);
+ void (*depth_updated)(struct request_queue *);
bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
diff --git a/block/fops.c b/block/fops.c
index 82451ac8ff25..c2c0396ea9ee 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
@@ -38,8 +39,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
struct iov_iter *iter)
{
- return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
- !bdev_iter_is_aligned(bdev, iter);
+ return (iocb->ki_pos | iov_iter_count(iter)) &
+ (bdev_logical_block_size(bdev) - 1);
}
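A quick worked example of the combined check above, assuming a 512-byte logical block size (mask 0x1ff); the position and length values are illustrative:

	/* ki_pos = 4096, iov_iter_count = 8192: (4096 | 8192) & 511 == 0 -> accepted       */
	/* ki_pos = 4096, iov_iter_count = 4100: (4096 | 4100) & 511 != 0 -> flagged invalid */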
#define DIO_INLINE_BIO_VECS 4
@@ -54,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
- ret = bio_iov_iter_get_pages(&bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -131,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio)
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
- if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
if (atomic_dec_and_test(&dio->ref)) {
@@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@@ -233,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
bio->bi_opf |= REQ_NOWAIT;
}
- if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
ret = bio_integrity_map_iter(bio, iocb->private);
if (unlikely(ret))
goto fail;
@@ -301,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
- if (iocb->ki_flags & IOCB_HAS_METADATA)
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
iocb->ki_complete(iocb, ret);
@@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
*/
bio_iov_bvec_set(bio, iter);
} else {
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev);
if (unlikely(ret))
goto out_bio_put;
}
@@ -422,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (likely(nr_pages <= BIO_MAX_VECS &&
+ !(iocb->ki_flags & IOCB_HAS_METADATA))) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
@@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (blk_get_integrity(bdev->bd_disk))
+ filp->f_mode |= FMODE_HAS_METADATA;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
if (ret)
diff --git a/block/genhd.c b/block/genhd.c
index c26733f6324b..9bbc38d12792 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1303,6 +1303,7 @@ static void disk_release(struct device *dev)
disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
+ kobject_put(&disk->queue_kobj);
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
@@ -1486,6 +1487,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
mutex_init(&disk->rqos_state_mutex);
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
return disk;
out_erase_part0:
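For context, a minimal sketch of the kobject pairing that the queue_kobj changes above follow; "parent" is a placeholder, and the real code uses disk->queue_kobj with blk_queue_ktype. kobject_init() at allocation pairs with the final kobject_put() in disk_release(), while kobject_add() at registration pairs with kobject_del() on unregister or on a registration error.

	/* allocation (__alloc_disk_node) */
	kobject_init(&disk->queue_kobj, &blk_queue_ktype);

	/* registration (blk_register_queue) */
	ret = kobject_add(&disk->queue_kobj, parent, "queue");
	if (ret)
		return ret;		/* no kobject_put() on this path anymore */

	/* unregistration, or the registration error path */
	kobject_del(&disk->queue_kobj);

	/* final teardown (disk_release) drops the allocation-time reference */
	kobject_put(&disk->queue_kobj);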
diff --git a/block/ioctl.c b/block/ioctl.c
index f7b0006ca45d..d7489a56b33c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev,
*/
memset(&geo, 0, sizeof(geo));
geo.start = get_start_sect(bdev);
- ret = disk->fops->getgeo(bdev, &geo);
+ ret = disk->fops->getgeo(disk, &geo);
if (ret)
return ret;
if (copy_to_user(argp, &geo, sizeof(geo)))
@@ -515,7 +515,7 @@ static int compat_hdio_getgeo(struct block_device *bdev,
* want to override it.
*/
geo.start = get_start_sect(bdev);
- ret = disk->fops->getgeo(bdev, &geo);
+ ret = disk->fops->getgeo(disk, &geo);
if (ret)
return ret;
@@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (bic->res == -EAGAIN && bic->nowait)
io_uring_cmd_issue_blocking(cmd);
else
- io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
+ io_uring_cmd_done(cmd, bic->res, issue_flags);
}
static void bio_cmd_bio_end_io(struct bio *bio)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 70cbc7b2deb4..18efd6ef2a2b 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -399,6 +399,14 @@ err:
return ERR_PTR(ret);
}
+static void kyber_depth_updated(struct request_queue *q)
+{
+ struct kyber_queue_data *kqd = q->elevator->elevator_data;
+
+ kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U;
+ blk_mq_set_min_shallow_depth(q, kqd->async_depth);
+}
+
static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct kyber_queue_data *kqd;
@@ -413,6 +421,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
eq->elevator_data = kqd;
q->elevator = eq;
+ kyber_depth_updated(q);
return 0;
}
@@ -440,15 +449,6 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
INIT_LIST_HEAD(&kcq->rq_list[i]);
}
-static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
-{
- struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
-
- kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
-}
-
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct kyber_hctx_data *khd;
@@ -493,7 +493,6 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
- kyber_depth_updated(hctx);
return 0;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b9b7cdf1d3c9..3e741d33142d 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -136,10 +136,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
struct request *rq, *res = NULL;
- if (!node)
- return NULL;
-
- rq = rb_entry_rq(node);
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -507,22 +503,12 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
}
/* Called by blk_mq_update_nr_requests(). */
-static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+static void dd_depth_updated(struct request_queue *q)
{
- struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
- struct blk_mq_tags *tags = hctx->sched_tags;
dd->async_depth = q->nr_requests;
-
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
-}
-
-/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
-static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- dd_depth_updated(hctx);
- return 0;
+ blk_mq_set_min_shallow_depth(q, 1);
}
static void dd_exit_sched(struct elevator_queue *e)
@@ -587,6 +573,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = eq;
+ dd_depth_updated(q);
return 0;
}
@@ -1048,7 +1035,6 @@ static struct elevator_type mq_deadline = {
.has_work = dd_has_work,
.init_sched = dd_init_sched,
.exit_sched = dd_exit_sched,
- .init_hctx = dd_init_hctx,
},
#ifdef CONFIG_BLK_DEBUG_FS
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 82d9c4c3fb41..631291fbb356 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state)
goto out_nolab;
/* set start if not filled by getgeo function e.g. virtblk */
geo->start = get_start_sect(bdev);
- if (disk->fops->getgeo(bdev, geo))
+ if (disk->fops->getgeo(disk, geo))
goto out_freeall;
if (!fn || fn(disk, info)) {
kfree(info);