Diffstat (limited to 'block')
39 files changed, 827 insertions, 668 deletions
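For reference, a minimal sketch (not part of the patch) of how a caller might use the new bio_submit_split_bioset() helper added in block/blk-merge.c below, mirroring the converted blk-crypto-fallback code; the wrapper name is hypothetical and assumes the caller already owns a suitable bio_set.

```c
#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Hypothetical wrapper: split @bio at @num_sectors using @bs. Per the new
 * helper's kernel-doc, the remainder (the original bio) is submitted by the
 * helper itself; the front split is returned for the caller to submit, or
 * NULL is returned after the original bio has been failed.
 */
static struct bio *example_split_front(struct bio *bio,
				       unsigned int num_sectors,
				       struct bio_set *bs)
{
	if (num_sectors >= bio_sectors(bio))
		return bio;	/* nothing to split off */

	return bio_submit_split_bioset(bio, num_sectors, bs);
}
```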
diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc06..810707cca970 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3bf76902f07f..4a8d3d96bfe4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5847,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, goto out; } - bfqq = kmem_cache_alloc_node(bfq_pool, - GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, bfqd->queue->node); if (bfqq) { @@ -7110,9 +7109,10 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +static void bfq_depth_updated(struct request_queue *q) { - unsigned int nr_requests = bfqd->queue->nr_requests; + struct bfq_data *bfqd = q->elevator->elevator_data; + unsigned int nr_requests = q->nr_requests; /* * In-word depths if no bfq_queue is being weight-raised: @@ -7144,21 +7144,8 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); -} - -static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -{ - bfq_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void bfq_exit_queue(struct elevator_queue *e) @@ -7370,6 +7357,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + bfq_depth_updated(q); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -7629,7 +7617,6 @@ static struct elevator_type iosched_bfq_mq = { .request_merged = bfq_request_merged, .has_work = bfq_has_work, .depth_updated = bfq_depth_updated, - .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, }, diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6b077ca937f6..bed26f1ec869 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -230,7 +230,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, } static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, - int nr_vecs, ssize_t bytes, ssize_t offset) + int nr_vecs, ssize_t bytes, ssize_t offset, + bool *is_p2p) { unsigned int nr_bvecs = 0; int i, j; @@ -251,6 +252,9 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, bytes -= next; } + if (is_pci_p2pdma_page(pages[i])) + *is_p2p = true; + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); offset = 0; nr_bvecs++; @@ -262,13 +266,13 @@ static unsigned 
int bvec_from_pages(struct bio_vec *bvec, struct page **pages, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + iov_iter_extraction_t extraction_flags = 0; size_t offset, bytes = iter->count; + bool copy, is_p2p = false; unsigned int nr_bvecs; int ret, nr_vecs; - bool copy; if (bio_integrity(bio)) return -EINVAL; @@ -285,16 +289,25 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) pages = NULL; } - copy = !iov_iter_is_aligned(iter, align, align); - ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); + copy = iov_iter_alignment(iter) & + blk_lim_dma_alignment_and_pad(&q->limits); + + if (blk_queue_pci_p2pdma(q)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, + extraction_flags, &offset); if (unlikely(ret < 0)) goto free_bvec; - nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset, + &is_p2p); if (pages != stack_pages) kvfree(pages); if (nr_bvecs > queue_max_integrity_segments(q)) copy = true; + if (is_p2p) + bio->bi_opf |= REQ_NOMERGE; if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes); diff --git a/block/bio.c b/block/bio.c index 3b371a5da159..3a1a848940dd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST @@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->nr--; put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } @@ -614,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); @@ -981,7 +985,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; @@ -1227,13 +1231,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; - /* - * Each segment in the iov is required to be a block size multiple. 
- * However, we may not be able to get the entire segment if it spans - * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the - * result to ensure the bio's total size is correct. The remainder of - * the iov data will be picked up in the next bio iteration. - */ size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); @@ -1241,18 +1238,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - - if (bio->bi_bdev) { - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); - size -= trim; - } - - if (unlikely(!size)) { - ret = -EFAULT; - goto out; - } - for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); @@ -1297,10 +1282,44 @@ out: return ret; } +/* + * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that + * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length + * for the next iteration. + */ +static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) +{ + size_t nbytes = bio->bi_iter.bi_size & len_align_mask; + + if (!nbytes) + return 0; + + iov_iter_revert(iter, nbytes); + bio->bi_iter.bi_size -= nbytes; + do { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (nbytes < bv->bv_len) { + bv->bv_len -= nbytes; + break; + } + + bio_release_page(bio, bv->bv_page); + bio->bi_vcnt--; + nbytes -= bv->bv_len; + } while (nbytes); + + if (!bio->bi_vcnt) + return -EFAULT; + return 0; +} + /** - * bio_iov_iter_get_pages - add user or kernel pages to a bio + * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1317,7 +1336,8 @@ out: * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; @@ -1336,9 +1356,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - return bio->bi_vcnt ? 
0 : ret; + if (bio->bi_vcnt) + return bio_iov_iter_align_down(bio, iter, len_align_mask); + return ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5936db7f8475..f93de34fe87d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -110,12 +110,6 @@ static struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -static bool blkcg_policy_enabled(struct request_queue *q, - const struct blkcg_policy *pol) -{ - return pol && test_bit(pol->plid, q->blkcg_pols); -} - static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, @@ -394,7 +388,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -883,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, disk = ctx->bdev->bd_disk; q = disk->queue; - /* - * blkcg_deactivate_policy() requires queue to be frozen, we can grab - * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). - */ - ret = blk_queue_enter(q, 0); - if (ret) - goto fail; - + /* Prevent concurrent with blkcg_deactivate_policy() */ + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -920,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); - new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } spin_lock_irq(&q->queue_lock); @@ -957,7 +945,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - blk_queue_exit(q); + mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; @@ -965,9 +953,8 @@ fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); -fail_exit_queue: - blk_queue_exit(q); -fail: +fail_exit: + mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). 
It isn't strictly necessary but queue @@ -1467,7 +1454,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1630,7 +1617,7 @@ retry: pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); } if (!pd) { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 81868ad86330..1cce3294634d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -370,11 +370,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) @@ -459,6 +454,12 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } +static inline bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) +{ + return pol && test_bit(pol->plid, q->blkcg_pols); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ @@ -491,7 +492,6 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } -static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } diff --git a/block/blk-core.c b/block/blk-core.c index fdac48aec5ef..14ae73eebe0d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -539,7 +539,7 @@ static inline void bio_check_ro(struct bio *bio) } } -static noinline int should_fail_bio(struct bio *bio) +int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; @@ -560,6 +560,8 @@ static inline int bio_check_eod(struct bio *bio) if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + if (!maxsector) + return -EIO; pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, @@ -725,10 +727,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -void submit_bio_noacct_nocheck(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); @@ -745,12 +746,16 @@ void submit_bio_noacct_nocheck(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. 
*/ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) + if (current->bio_list) { + if (split) + bio_list_add_head(¤t->bio_list[0], bio); + else + bio_list_add(¤t->bio_list[0], bio); + } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); - else + } else { __submit_bio_noacct(bio); + } } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, @@ -871,7 +876,7 @@ void submit_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); return; not_supported: diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..86b27f96051a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, - bio_src->bi_opf); + bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; @@ -222,18 +221,14 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) if (++i == BIO_MAX_VECS) break; } - if (num_sectors < bio_sectors(bio)) { - struct bio *split_bio; - split_bio = bio_split(bio, num_sectors, GFP_NOIO, - &crypto_bio_split); - if (IS_ERR(split_bio)) { - bio->bi_status = BLK_STS_RESOURCE; + if (num_sectors < bio_sectors(bio)) { + bio = bio_submit_split_bioset(bio, num_sectors, + &crypto_bio_split); + if (!bio) return false; - } - bio_chain(split_bio, bio); - submit_bio_noacct(bio); - *bio_ptr = split_bio; + + *bio_ptr = bio; } return true; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 056b8948369d..9b27963680dc 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -58,16 +58,14 @@ new_segment: int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { - struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct blk_integrity *bi; struct logical_block_metadata_cap meta_cap = {}; size_t usize = _IOC_SIZE(cmd); - if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || - _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || - _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || - _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) return -ENOIOCTLCMD; + bi = blk_get_integrity(bdev->bd_disk); if (!bi) goto out; @@ -122,64 +120,6 @@ out: NULL); } -/** - * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request to map - * @sglist: target scatterlist - * - * Description: Map the integrity vectors in request into a - * scatterlist. The scatterlist must be big enough to hold all - * elements. I.e. sized using blk_rq_count_integrity_sg() or - * rq->nr_integrity_segments. 
- */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) -{ - struct bio_vec iv, ivprv = { NULL }; - struct request_queue *q = rq->q; - struct scatterlist *sg = NULL; - struct bio *bio = rq->bio; - unsigned int segments = 0; - struct bvec_iter iter; - int prev = 0; - - bio_for_each_integrity_vec(iv, bio, iter) { - if (prev) { - if (!biovec_phys_mergeable(q, &ivprv, &iv)) - goto new_segment; - if (sg->length + iv.bv_len > queue_max_segment_size(q)) - goto new_segment; - - sg->length += iv.bv_len; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - sg_unmark_end(sg); - sg = sg_next(sg); - } - - sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); - segments++; - } - - prev = 1; - ivprv = iv; - } - - if (sg) - sg_mark_end(sg); - - /* - * Something must have been wrong if the figured number of segment - * is bigger than number of req's physical integrity segments - */ - BUG_ON(segments > rq->nr_integrity_segments); - BUG_ON(segments > queue_max_integrity_segments(q)); - return segments; -} -EXPORT_SYMBOL(blk_rq_map_integrity_sg); - int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5..d15918d7fabb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ out: } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..45bd18f68541 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it. 
*/ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && @@ -750,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { + struct request_queue *q = blkiolat->rqos.disk->queue; unsigned int memflags; memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; + if (enabled) + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q); + else + blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q); blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-lib.c b/block/blk-lib.c index 4c9f20a689f7..3030a772d3aa 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned int flags) { + struct folio *zero_folio = largest_zero_folio(); + while (nr_sects) { unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); struct bio *bio; @@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev, break; do { - unsigned int len, added; + unsigned int len; - len = min_t(sector_t, - PAGE_SIZE, nr_sects << SECTOR_SHIFT); - added = bio_add_page(bio, ZERO_PAGE(0), len, 0); - if (added < len) + len = min_t(sector_t, folio_size(zero_folio), + nr_sects << SECTOR_SHIFT); + if (!bio_add_folio(bio, zero_folio, len, 0)) break; - nr_sects -= added >> SECTOR_SHIFT; - sector += added >> SECTOR_SHIFT; + nr_sects -= len >> SECTOR_SHIFT; + sector += len >> SECTOR_SHIFT; } while (nr_sects); *biop = bio_chain_and_submit(*biop, bio); diff --git a/block/blk-map.c b/block/blk-map.c index 23e5d5ebe59e..165f2234f00f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); + bio_init_inline(bio, NULL, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; @@ -253,10 +253,11 @@ static void blk_mq_map_bio_put(struct bio *bio) static struct bio *blk_rq_map_bio_alloc(struct request *rq, unsigned int nr_vecs, gfp_t gfp_mask) { + struct block_device *bdev = rq->q->disk ? 
rq->q->disk->part0 : NULL; struct bio *bio; if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { - bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, + bio = bio_alloc_bioset(bdev, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; @@ -264,7 +265,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + bio_init_inline(bio, bdev, nr_vecs, req_op(rq)); } return bio; } @@ -326,7 +327,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + bio_init_inline(bio, NULL, nr_vecs, op); if (is_vmalloc_addr(data)) { bio->bi_private = data; if (!bio_add_vmalloc(bio, data, len)) { @@ -392,7 +393,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); + bio_init_inline(bio, NULL, nr_pages, op); while (len) { struct page *page; @@ -443,7 +444,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) int ret; /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..37864c5d287e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,34 +104,58 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/* + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * If succeed, the newly allocated bio representing the initial part will be + * returned, on failure NULL will be returned and original bio will fail. 
+ */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + + if (should_fail_bio(bio)) + bio_io_error(bio); + else if (!blk_throtl_bio(bio)) + submit_bio_noacct_nocheck(bio, true); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset); + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, @@ -279,25 +303,30 @@ static unsigned int bio_split_alignment(struct bio *bio, } /** - * bio_split_rw_at - check if and where to split a read/write bio + * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio + * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes) +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { + if (bv.bv_offset & lim->dma_alignment || + bv.bv_len & len_align_mask) + return -EINVAL; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -339,8 +368,16 @@ split: * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. + * + * It is possible to submit a bio that can't be split into a valid io: + * there may either be too many discontiguous vectors for the max + * segments limit, or contain virtual boundary gaps without having a + * valid block sized split. A zero byte result means one of those + * conditions occured. 
*/ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); + if (!bytes) + return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -350,7 +387,7 @@ split: bio_clear_polled(bio); return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw_at); +EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7ed3e71f2fc0..4896525b1c05 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -95,6 +95,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), + QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef..449950029872 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include <linux/blk-integrity.h> #include <linux/blk-mq-dma.h> #include "blk.h" @@ -10,29 +11,38 @@ struct phys_vec { u32 len; }; -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = bio_integrity(iter->bio)->bip_iter; + iter->bvecs = bio_integrity(iter->bio)->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; + } + return true; +} + +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -42,20 +52,16 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - } + if (!__blk_map_iter_next(iter)) + break; - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; @@ -125,6 +131,72 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static inline void blk_rq_map_iter_init(struct request *rq, + struct blk_map_iter *iter) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + *iter = 
(struct blk_map_iter) { + .bvecs = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } else if (bio) { + *iter = (struct blk_map_iter) { + .bio = bio, + .bvecs = bio->bi_io_vec, + .iter = bio->bi_iter, + }; + } else { + /* the internal flush request may not have bio attached */ + *iter = (struct blk_map_iter) {}; + } +} + +static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + unsigned int total_len) +{ + struct phys_vec vec; + + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); + iter->status = BLK_STS_OK; + + /* + * Grab the first segment ASAP because we'll need it to check for P2P + * transfers. + */ + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + if (iter->iter.is_integrity) + bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; + else + req->cmd_flags |= REQ_P2PDMA; + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; + } + + if (blk_can_dma_map_iova(req, dma_dev) && + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -150,43 +222,9 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter) { - unsigned int total_len = blk_rq_payload_bytes(req); - struct phys_vec vec; - - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; - memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); - iter->status = BLK_STS_OK; - - /* - * Grab the first segment ASAP because we'll need it to check for P2P - * transfers. - */ - if (!blk_map_iter_next(req, &iter->iter, &vec)) - return false; - - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. 
- */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } - } - - if (blk_can_dma_map_iova(req, dma_dev) && - dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) - return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); - return blk_dma_map_direct(req, dma_dev, iter, &vec); + blk_rq_map_iter_init(req, &iter->iter); + return blk_dma_map_iter_start(req, dma_dev, state, iter, + blk_rq_payload_bytes(req)); } EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); @@ -246,16 +284,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { - .bio = rq->bio, - }; + struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - + blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, @@ -275,3 +308,124 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +/** + * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment + * for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are + * provided by the caller and don't need to be initialized. @state needs to be + * stored for use at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr + * and the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +/** + * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for + * a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next integrity mapping after a previous call to + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description + * of the arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. 
+ */ +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request to map + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg() or + * rq->nr_integrity_segments. + */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct request_queue *q = rq->q; + struct scatterlist *sg = NULL; + struct bio *bio = rq->bio; + unsigned int segments = 0; + struct phys_vec vec; + + struct blk_map_iter iter = { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + + while (blk_map_iter_next(rq, &iter, &vec)) { + sg = blk_next_sg(&sg, sglist); + sg_set_page(sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + segments++; + } + + if (sg) + sg_mark_end(sg); + + /* + * Something must have been wrong if the figured number of segment + * is bigger than number of req's physical integrity segments + */ + BUG_ON(segments > rq->nr_integrity_segments); + BUG_ON(segments > queue_max_integrity_segments(q)); + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); +#endif diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e2ce4a28e6c9..d06bb137a743 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -454,7 +454,7 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table, } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues) + unsigned int nr_hw_queues, unsigned int nr_requests) { unsigned int nr_tags; int i; @@ -470,13 +470,8 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, nr_tags * sizeof(struct blk_mq_tags *), gfp); if (!et) return NULL; - /* - * Default to double of smaller one between hw queue_depth and - * 128, since we don't split into sync/async like the old code - * did. Additionally, this is a per-hw queue depth. - */ - et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, - BLKDEV_DEFAULT_RQ); + + et->nr_requests = nr_requests; et->nr_hw_queues = nr_hw_queues; if (blk_mq_is_shared_tags(set->flags)) { @@ -521,7 +516,8 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, * concurrently. 
*/ if (q->elevator) { - et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!et) goto out_unwind; if (xa_insert(et_table, q->id, et, gfp)) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index b554e1d55950..8e21a6b1415d 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -24,7 +24,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues); + unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); void blk_mq_free_sched_tags(struct elevator_tags *et, @@ -92,4 +92,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } +static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, + unsigned int depth) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) + sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, + depth); +} + #endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 24656980f443..58ec293373c6 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -34,7 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); @@ -150,9 +149,11 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) return; hctx_for_each_ctx(hctx, ctx, i) - kobject_del(&ctx->kobj); + if (ctx->kobj.state_in_sysfs) + kobject_del(&ctx->kobj); - kobject_del(&hctx->kobj); + if (hctx->kobj.state_in_sysfs) + kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d880c50629d6..c7a4d4b9cc87 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -8,6 +8,9 @@ */ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/kmemleak.h> #include <linux/delay.h> #include "blk.h" @@ -253,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; - unsigned long flags; - spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; - spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -297,15 +297,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. - * @q: Request queue to examine. + * @q: Request queue @hctx is associated with (@hctx->queue). * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. - * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) - * where rq is a pointer to a request. Return true to continue - * iterating tags, false to stop. - * @data: Will be passed as third argument to @fn. 
+ * @fn will be called as follows: @fn(rq, @data) where rq is a + * pointer to a request. Return %true to continue iterating tags; + * %false to stop. + * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ @@ -371,9 +371,9 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @data, - * @reserved) where rq is a pointer to a request. Return true - * to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @data) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ @@ -406,10 +406,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. @@ -424,10 +423,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where + * rq is a pointer to a request. Return true to continue iterating + * tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after @@ -437,7 +435,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; - int i, nr_tags; + int i, nr_tags, srcu_idx; + + srcu_idx = srcu_read_lock(&tagset->tags_srcu); nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; @@ -446,6 +446,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } + srcu_read_unlock(&tagset->tags_srcu, srcu_idx); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); @@ -483,11 +484,10 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request - * on @q. 
@fn will be called as follows: @fn(hctx, rq, @priv, - * reserved) where rq is a pointer to a request and hctx points - * to the hardware queue associated with the request. 'reserved' - * indicates whether or not @rq is a reserved request. - * @priv: Will be passed as third argument to @fn. + * on @q. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request and hctx points to the hardware queue + * associated with the request. + * @priv: Will be passed as second argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only @@ -496,6 +496,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { + int srcu_idx; + /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid @@ -504,6 +506,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; + srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu); if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; @@ -533,6 +536,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(hctx, q, btags, fn, priv, false); } } + srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx); blk_queue_exit(q); } @@ -562,6 +566,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + INIT_LIST_HEAD(&tags->page_list); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto out_free_tags; if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) @@ -576,63 +582,37 @@ out_free_tags: return NULL; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +static void blk_mq_free_tags_callback(struct rcu_head *head) { - sbitmap_queue_free(&tags->bitmap_tags); - sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); -} - -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tagsptr, unsigned int tdepth, - bool can_grow) -{ - struct blk_mq_tags *tags = *tagsptr; - - if (tdepth <= tags->nr_reserved_tags) - return -EINVAL; - - /* - * If we are allowed to grow beyond the original size, allocate - * a new set of tags before freeing the old one. - */ - if (tdepth > tags->nr_tags) { - struct blk_mq_tag_set *set = hctx->queue->tag_set; - struct blk_mq_tags *new; - - if (!can_grow) - return -EINVAL; - - /* - * We need some sort of upper limit, set it high enough that - * no valid use cases should require more. - */ - if (tdepth > MAX_SCHED_RQ) - return -EINVAL; + struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, + rcu_head); + struct page *page; + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); /* - * Only the sbitmap needs resizing since we allocated the max - * initially. + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). 
*/ - if (blk_mq_is_shared_tags(set->flags)) - return 0; + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + kfree(tags); +} - new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); - if (!new) - return -ENOMEM; +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) +{ + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); - blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); - *tagsptr = new; - } else { - /* - * Don't need (or can't) update reserved tags here, they - * remain static and should never need resizing. - */ - sbitmap_queue_resize(&tags->bitmap_tags, - tdepth - tags->nr_reserved_tags); + /* if tags pages is not allocated yet, free tags directly */ + if (list_empty(&tags->page_list)) { + kfree(tags); + return; } - return 0; + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) diff --git a/block/blk-mq.c b/block/blk-mq.c index b67d6c02eceb..09f579414161 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,6 +396,15 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { @@ -3168,6 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; @@ -3415,7 +3425,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; - unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized @@ -3439,22 +3448,12 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, } } } - - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&drv_tags->lock, flags); - spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3478,27 +3477,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs(). - */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). 
+ */ } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, @@ -3560,7 +3552,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, err_free_rqs: kfree(tags->rqs); err_free_tags: - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); return NULL; } @@ -3590,8 +3582,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - INIT_LIST_HEAD(&tags->page_list); - /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size @@ -3678,8 +3668,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) struct rq_iter_data data = { .hctx = hctx, }; + int srcu_idx; + srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); + return data.has_rq; } @@ -3899,7 +3893,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; - unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) @@ -3909,15 +3902,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); +} - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&tags->lock, flags); - spin_unlock_irqrestore(&tags->lock, flags); +static void blk_free_flush_queue_callback(struct rcu_head *head) +{ + struct blk_flush_queue *fq = + container_of(head, struct blk_flush_queue, rcu_head); + + blk_free_flush_queue(fq); } /* hctx->ctxs will be freed in queue's release handler */ @@ -3939,6 +3931,10 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, + blk_free_flush_queue_callback); + hctx->fq = NULL; + xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); @@ -3964,13 +3960,19 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; + + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); + if (!hctx->fq) + goto fail; + hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto fail; + goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) @@ -3987,6 +3989,9 @@ static int blk_mq_init_hctx(struct request_queue *q, exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + fail_free_fq: + blk_free_flush_queue(hctx->fq); + hctx->fq = NULL; fail: return -1; } @@ -4038,16 +4043,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); - if (!hctx->fq) - goto free_bitmap; - blk_mq_hctx_kobj_init(hctx); return hctx; 
- free_bitmap: - sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: @@ -4101,7 +4100,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); return NULL; } @@ -4129,7 +4128,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); } } @@ -4828,6 +4827,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); @@ -4836,7 +4838,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4865,6 +4867,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4910,6 +4914,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); @@ -4917,57 +4924,59 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *et, + unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; - int ret; unsigned long i; - if (WARN_ON_ONCE(!q->mq_freeze_depth)) - return -EINVAL; - - if (!set) - return -EINVAL; - - if (q->nr_requests == nr) - return 0; - blk_mq_quiesce_queue(q); - ret = 0; - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->tags) - continue; + if (blk_mq_is_shared_tags(set->flags)) { /* - * If we're using an MQ scheduler, just update the scheduler - * queue depth. This is similar to what the old code would do. + * Shared tags, for sched tags, we allocate max initially hence + * tags can't grow, see blk_mq_alloc_sched_tags(). */ - if (hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - } else { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } else if (!q->elevator) { + /* + * Non-shared hardware tags, nr is already checked from + * queue_requests_store() and tags can't grow. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; + sbitmap_queue_resize(&hctx->tags->bitmap_tags, + nr - hctx->tags->nr_reserved_tags); } - if (ret) - break; - if (q->elevator && q->elevator->type->ops.depth_updated) - q->elevator->type->ops.depth_updated(hctx); - } - if (!ret) { - q->nr_requests = nr; - if (blk_mq_is_shared_tags(set->flags)) { - if (q->elevator) - blk_mq_tag_update_sched_shared_tags(q); - else - blk_mq_tag_resize_shared_tags(set, nr); + } else if (nr <= q->elevator->et->nr_requests) { + /* Non-shared sched tags, and tags don't grow. 
*/ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, + nr - hctx->sched_tags->nr_reserved_tags); } + } else { + /* Non-shared sched tags, and tags grow */ + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = et->tags[i]; + old_et = q->elevator->et; + q->elevator->et = et; } - blk_mq_unquiesce_queue(q); + q->nr_requests = nr; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); - return ret; + blk_mq_unquiesce_queue(q); + return old_et; } /* @@ -5033,6 +5042,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, unsigned int memflags; int i; struct xarray elv_tbl, et_tbl; + bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); @@ -5056,9 +5066,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister_hctxs(q); } - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_freeze_queue_nomemsave(q); - /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done @@ -5068,6 +5075,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue_nomemsave(q); + queues_frozen = true; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto switch_back; @@ -5091,8 +5101,12 @@ fallback: } switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ - list_for_each_entry(q, &set->tag_list, tag_set_list) + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* switch_back expects queue to be frozen */ + if (!queues_frozen) + blk_mq_freeze_queue_nomemsave(q); blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); + } list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); diff --git a/block/blk-mq.h b/block/blk-mq.h index affb2e14b56e..af42dc018808 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -6,6 +6,7 @@ #include "blk-stat.h" struct blk_mq_tag_set; +struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; @@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *tags, + unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); @@ -59,7 +62,7 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags); +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, @@ -110,6 +113,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, } /* + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. 
+ */ +static inline unsigned int blk_mq_default_nr_requests( + struct blk_mq_tag_set *set) +{ + return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); +} + +/* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); @@ -162,7 +176,7 @@ struct blk_mq_alloc_data { struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); -void blk_mq_free_tags(struct blk_mq_tags *tags); +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, @@ -170,8 +184,6 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 848591fb3c57..654478dfbc20 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -2,8 +2,6 @@ #include "blk-rq-qos.h" -__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); - /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. @@ -319,8 +317,8 @@ void rq_qos_exit(struct request_queue *q) struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); - static_branch_dec(&block_rq_qos); } + blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); mutex_unlock(&q->rq_qos_mutex); } @@ -346,7 +344,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; - static_branch_inc(&block_rq_qos); + blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); @@ -377,6 +375,8 @@ void rq_qos_del(struct rq_qos *rqos) break; } } + if (!q->rq_qos) + blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); mutex_lock(&q->debugfs_mutex); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 39749f4066fb..b538f2c0febc 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -12,7 +12,6 @@ #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; -extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, @@ -113,43 +112,58 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && - !blk_rq_is_passthrough(rq)) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct 
request_queue *q, struct request *rq) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && - bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || - bio_flagged(bio, BIO_QOS_MERGED))) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); - } + struct request_queue *q; + + if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) && + !bio_flagged(bio, BIO_QOS_MERGED))) + return; + + q = bdev_get_queue(bio->bi_bdev); + + /* + * A BIO may carry BIO_QOS_* flags even if the associated request_queue + * does not have rq_qos enabled. This can happen with stacked block + * devices — for example, NVMe multipath, where it's possible that the + * bottom device has QoS enabled but the top device does not. Therefore, + * always verify that q->rq_qos is present and QoS is enabled before + * calling __rq_qos_done_bio(). + */ + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -158,14 +172,16 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -173,7 +189,8 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) + if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && + q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 07874e9b609f..54cffaae4df4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -56,6 +56,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; + lim->atomic_write_hw_max = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -157,16 +158,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_NONE: if (bi->pi_tuple_size) { - pr_warn("pi_tuple_size must be 0 when checksum type \ - is none\n"); + pr_warn("pi_tuple_size must be 0 when checksum type is none\n"); return -EINVAL; } break; case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) { - pr_warn("pi_tuple_size mismatch for T10 PI: expected \ - %zu, got %u\n", + pr_warn("pi_tuple_size mismatch for 
T10 PI: expected %zu, got %u\n", sizeof(struct t10_pi_tuple), bi->pi_tuple_size); return -EINVAL; @@ -174,8 +173,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) break; case BLK_INTEGRITY_CSUM_CRC64: if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) { - pr_warn("pi_tuple_size mismatch for CRC64 PI: \ - expected %zu, got %u\n", + pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n", sizeof(struct crc64_pi_tuple), bi->pi_tuple_size); return -EINVAL; @@ -226,6 +224,27 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) lim->atomic_write_hw_boundary >> SECTOR_SHIFT; } +/* + * Test whether any boundary is aligned with any chunk size. Stacked + * devices store any stripe size in t->chunk_sectors. + */ +static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors, + unsigned int boundary_sectors) +{ + if (!chunk_sectors || !boundary_sectors) + return true; + + if (boundary_sectors > chunk_sectors && + boundary_sectors % chunk_sectors) + return false; + + if (chunk_sectors > boundary_sectors && + chunk_sectors % boundary_sectors) + return false; + + return true; +} + static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; @@ -235,6 +254,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; + /* UINT_MAX indicates stacked limits in initial state */ + if (lim->atomic_write_hw_max == UINT_MAX) + goto unsupported; + if (!lim->atomic_write_hw_max) goto unsupported; @@ -262,20 +285,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (WARN_ON_ONCE(lim->atomic_write_hw_max > lim->atomic_write_hw_boundary)) goto unsupported; - /* - * A feature of boundary support is that it disallows bios to - * be merged which would result in a merged request which - * crosses either a chunk sector or atomic write HW boundary, - * even though chunk sectors may be just set for performance. - * For simplicity, disallow atomic writes for a chunk sector - * which is non-zero and smaller than atomic write HW boundary. - * Furthermore, chunk sectors must be a multiple of atomic - * write HW boundary. Otherwise boundary support becomes - * complicated. - * Devices which do not conform to these rules can be dealt - * with if and when they show up. - */ - if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) + + if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary( + lim->chunk_sectors, boundary_sectors))) goto unsupported; /* @@ -642,25 +654,6 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, return true; } -/* Check for valid boundary of first bottom device */ -static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, - struct queue_limits *b) -{ - /* - * Ensure atomic write boundary is aligned with chunk sectors. Stacked - * devices store chunk sectors in t->io_min. 
- */ - if (b->atomic_write_hw_boundary > t->io_min && - b->atomic_write_hw_boundary % t->io_min) - return false; - if (t->io_min > b->atomic_write_hw_boundary && - t->io_min % b->atomic_write_hw_boundary) - return false; - - t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; - return true; -} - static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { unsigned int chunk_bytes; @@ -698,13 +691,14 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) + if (!blk_valid_atomic_writes_boundary(t->chunk_sectors, + b->atomic_write_hw_boundary >> SECTOR_SHIFT)) return false; t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; t->atomic_write_hw_max = b->atomic_write_hw_max; + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } @@ -720,18 +714,14 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; - /* - * If atomic_write_hw_max is set, we have already stacked 1x bottom - * device, so check for compliance. - */ - if (t->atomic_write_hw_max) { + /* UINT_MAX indicates no stacking of bottom devices yet */ + if (t->atomic_write_hw_max == UINT_MAX) { + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + } else { if (!blk_stack_atomic_writes_tail(t, b)) goto unsupported; - return; } - - if (!blk_stack_atomic_writes_head(t, b)) - goto unsupported; blk_stack_atomic_writes_chunk_sectors(t); return; @@ -766,7 +756,8 @@ unsupported: int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { - unsigned int top, bottom, alignment, ret = 0; + unsigned int top, bottom, alignment; + int ret = 0; t->features |= (b->features & BLK_FEAT_INHERIT_MASK); @@ -972,6 +963,8 @@ bool queue_limits_stack_integrity(struct queue_limits *t, goto incompatible; if (ti->csum_type != bi->csum_type) goto incompatible; + if (ti->pi_tuple_size != bi->pi_tuple_size) + goto incompatible; if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; @@ -980,6 +973,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | (bi->flags & BLK_INTEGRITY_REF_TAG); ti->csum_type = bi->csum_type; + ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 396cded255ea..76c47fe9b8d6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -64,28 +64,66 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { - unsigned long nr; - int ret, err; - unsigned int memflags; struct request_queue *q = disk->queue; - - if (!queue_is_mq(q)) - return -EINVAL; + struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *et = NULL; + unsigned int memflags; + unsigned long nr; + int ret; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; - memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); + /* + * Serialize updating nr_requests with concurrent queue_requests_store() + * and switching elevator. 
+ */ + down_write(&set->update_nr_hwq_lock); + + if (nr == q->nr_requests) + goto unlock; + if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - err = blk_mq_update_nr_requests(disk->queue, nr); - if (err) - ret = err; + /* + * Switching elevator is protected by update_nr_hwq_lock: + * - read lock is held from elevator sysfs attribute; + * - write lock is held from updating nr_hw_queues; + * Hence it's safe to access q->elevator here with write lock held. + */ + if (nr <= set->reserved_tags || + (q->elevator && nr > MAX_SCHED_RQ) || + (!q->elevator && nr > set->queue_depth)) { + ret = -EINVAL; + goto unlock; + } + + if (!blk_mq_is_shared_tags(set->flags) && q->elevator && + nr > q->elevator->et->nr_requests) { + /* + * Tags will grow, allocate memory before freezing queue to + * prevent deadlock. + */ + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); + if (!et) { + ret = -ENOMEM; + goto unlock; + } + } + + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + et = blk_mq_update_nr_requests(q, et, nr); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + + if (et) + blk_mq_free_sched_tags(et, set); + +unlock: + up_write(&set->update_nr_hwq_lock); return ret; } @@ -620,6 +658,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. + */ memflags = blk_mq_freeze_queue(q); rqos = wbt_rq_qos(q); @@ -638,11 +681,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (wbt_get_min_lat(q) == val) goto out; - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. 
- */ blk_mq_quiesce_queue(q); mutex_lock(&disk->rqos_state_mutex); @@ -847,7 +885,7 @@ static void blk_queue_release(struct kobject *kobj) /* nothing to do here, all data is associated with the parent gendisk */ } -static const struct kobj_type blk_queue_ktype = { +const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, @@ -875,15 +913,14 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; int ret; - kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto out_put_queue_kobj; + return ret; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) - goto out_put_queue_kobj; + goto out_del_queue_kobj; } mutex_lock(&q->sysfs_lock); @@ -903,9 +940,9 @@ int blk_register_queue(struct gendisk *disk) if (queue_is_mq(q)) elevator_set_default(q); - wbt_enable_default(disk); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); @@ -934,8 +971,8 @@ out_debugfs_remove: mutex_unlock(&q->sysfs_lock); if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); -out_put_queue_kobj: - kobject_put(&disk->queue_kobj); +out_del_queue_kobj: + kobject_del(&disk->queue_kobj); return ret; } @@ -986,5 +1023,4 @@ void blk_unregister_queue(struct gendisk *disk) elevator_set_none(q); blk_debugfs_remove(disk); - kobject_put(&disk->queue_kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 397b6a410f9e..2c5b64b1a724 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); blk_finish_plug(&plug); } } @@ -1327,17 +1327,13 @@ static int blk_throtl_init(struct gendisk *disk) INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); - /* - * Freeze queue before activating policy, to synchronize with IO path, - * which is protected by 'q_usage_counter'. - */ memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; - /* activate policy */ + /* activate policy, blk_throtl_activated() will return true */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; @@ -1846,12 +1842,15 @@ void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - if (!blk_throtl_activated(q)) + /* + * blkg_destroy_all() already deactivate throtl policy, just check and + * free throtl data. + */ + if (!q->td) return; timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 3b27755bfbff..9d7a42c039a1 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -156,7 +156,13 @@ void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { - return q->td != NULL; + /* + * q->td guarantees that the blk-throttle module is already loaded, + * and the plid of blk-throttle is assigned. + * blkcg_policy_enabled() guarantees that the policy is activated + * in the request_queue. 
+ */ + return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl); } static inline bool blk_should_throtl(struct bio *bio) @@ -164,11 +170,6 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg; int rw = bio_data_dir(bio); - /* - * This is called under bio_queue_enter(), and it's synchronized with - * the activation of blk-throtl, which is protected by - * blk_mq_freeze_queue(). - */ if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; @@ -194,7 +195,10 @@ static inline bool blk_should_throtl(struct bio *bio) static inline bool blk_throtl_bio(struct bio *bio) { - + /* + * block throttling takes effect if the policy is activated + * in the bio's request_queue. + */ if (!blk_should_throtl(bio)) return false; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index a50d4cd55f41..eb8037bae0bd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -85,8 +85,8 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ + unsigned long last_issue; /* issue time of last read rq */ + unsigned long last_comp; /* completion time of last read rq */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; @@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq) struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { - if (rwb->sync_cookie == rq) { - rwb->sync_issue = 0; - rwb->sync_cookie = NULL; - } + if (wbt_is_read(rq)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } - if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); + } } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ef43aaca49f4..5e2a5788dc3b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1286,14 +1286,14 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) struct block_device *bdev; unsigned long flags; struct bio *bio; + bool prepared; /* * Submit the next plugged BIO. If we do not have any, clear * the plugged flag. 
*/ - spin_lock_irqsave(&zwplug->lock, flags); - again: + spin_lock_irqsave(&zwplug->lock, flags); bio = bio_list_pop(&zwplug->bio_list); if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; @@ -1304,13 +1304,14 @@ again: trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); - if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + prepared = blk_zone_wplug_prepare_bio(zwplug, bio); + spin_unlock_irqrestore(&zwplug->lock, flags); + + if (!prepared) { blk_zone_wplug_bio_io_error(zwplug, bio); goto again; } - spin_unlock_irqrestore(&zwplug->lock, flags); - bdev = bio->bi_bdev; /* diff --git a/block/blk.h b/block/blk.h index 0a2eccf28ca4..170794632135 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct elevator_tags; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +extern const struct kobj_type blk_queue_ktype; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { @@ -40,6 +41,7 @@ struct blk_flush_queue { struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; + struct rcu_head rcu_head; }; bool is_flush_rq(struct request *req); @@ -53,7 +55,7 @@ bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -void submit_bio_noacct_nocheck(struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio, bool split); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) @@ -614,6 +616,7 @@ extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); +int should_fail_bio(struct bio *bio); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ @@ -679,48 +682,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, 
struct file *bdev_file); diff --git a/block/elevator.c b/block/elevator.c index fe96c6f4753c..e2ebfbf107b3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -669,7 +669,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!ctx->et) return -ENOMEM; } diff --git a/block/elevator.h b/block/elevator.h index adc5c157e17e..c4d20155065e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -37,7 +37,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*depth_updated)(struct blk_mq_hw_ctx *); + void (*depth_updated)(struct request_queue *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25..c2c0396ea9ee 100644 --- a/block/fops.c +++ b/block/fops.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> @@ -38,8 +39,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || - !bdev_iter_is_aligned(bdev, iter); + return (iocb->ki_pos | iov_iter_count(iter)) & + (bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 @@ -54,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_pages(&bio, iter); + ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -131,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { @@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -233,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } bio->bi_opf |= REQ_NOWAIT; } - if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; @@ -301,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } - if (iocb->ki_flags & IOCB_HAS_METADATA) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); @@ -348,7 
+348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } @@ -422,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (likely(nr_pages <= BIO_MAX_VECS)) { + if (likely(nr_pages <= BIO_MAX_VECS && + !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); @@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + if (blk_get_integrity(bdev->bd_disk)) + filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) diff --git a/block/genhd.c b/block/genhd.c index c26733f6324b..9bbc38d12792 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1303,6 +1303,7 @@ static void disk_release(struct device *dev) disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); + kobject_put(&disk->queue_kobj); disk->queue->disk = NULL; blk_put_queue(disk->queue); @@ -1486,6 +1487,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, INIT_LIST_HEAD(&disk->slave_bdevs); #endif mutex_init(&disk->rqos_state_mutex); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); return disk; out_erase_part0: diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..d7489a56b33c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev, */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) @@ -515,7 +515,7 @@ static int compat_hdio_getgeo(struct block_device *bdev, * want to override it. 
*/ geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; @@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, 0, issue_flags); + io_uring_cmd_done(cmd, bic->res, issue_flags); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 70cbc7b2deb4..18efd6ef2a2b 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -399,6 +399,14 @@ err: return ERR_PTR(ret); } +static void kyber_depth_updated(struct request_queue *q) +{ + struct kyber_queue_data *kqd = q->elevator->elevator_data; + + kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; + blk_mq_set_min_shallow_depth(q, kqd->async_depth); +} + static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; @@ -413,6 +421,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) eq->elevator_data = kqd; q->elevator = eq; + kyber_depth_updated(q); return 0; } @@ -440,15 +449,6 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); -} - static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct kyber_hctx_data *khd; @@ -493,7 +493,6 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - kyber_depth_updated(hctx); return 0; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b9b7cdf1d3c9..3e741d33142d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -136,10 +136,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; - if (!node) - return NULL; - - rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -507,22 +503,12 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) } /* Called by blk_mq_update_nr_requests(). */ -static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +static void dd_depth_updated(struct request_queue *q) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; - - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ -static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - dd_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void dd_exit_sched(struct elevator_queue *e) @@ -587,6 +573,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + dd_depth_updated(q); return 0; } @@ -1048,7 +1035,6 @@ static struct elevator_type mq_deadline = { .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, - .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 82d9c4c3fb41..631291fbb356 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state) goto out_nolab; /* set start if not filled by getgeo function e.g. virtblk */ geo->start = get_start_sect(bdev); - if (disk->fops->getgeo(bdev, geo)) + if (disk->fops->getgeo(disk, geo)) goto out_freeall; if (!fn || fn(disk, info)) { kfree(info); |
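Note on the elevator hunks above (bfq, kyber and mq-deadline): ->depth_updated() now takes the request_queue rather than a hardware context, the per-hctx ->init_hctx() trampolines are removed, and ->init_sched() calls the hook once after setting q->elevator. The sketch below shows the resulting pattern for a scheduler; it is illustrative only, and the example_* names are hypothetical, not part of the patch.

struct example_sched_data {
	unsigned int async_depth;
};

static void example_depth_updated(struct request_queue *q)
{
	struct example_sched_data *esd = q->elevator->elevator_data;
	unsigned int nr_requests = q->nr_requests;

	/* recompute the async depth from the queue-wide request count */
	esd->async_depth = max(nr_requests * 3 / 4, 1U);
	/* the scheduler now sets its own minimum shallow depth */
	blk_mq_set_min_shallow_depth(q, esd->async_depth);
}

Wired up as .depth_updated in elevator_mq_ops and invoked from the scheduler's ->init_sched() (as the bfq_depth_updated(q) and dd_depth_updated(q) calls above do), the same hook is also run from blk_mq_update_nr_requests() whenever nr_requests changes.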
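The blk-rq-qos.h hunks replace the block_rq_qos static key with the QUEUE_FLAG_QOS_ENABLED queue flag, so every fast-path helper now uses the same two-step guard. A condensed sketch of that guard, using only symbols visible in the diff (the example_ name is hypothetical):

static inline bool example_rq_qos_enabled(struct request_queue *q)
{
	/*
	 * The queue flag is the cheap first-level check; q->rq_qos is
	 * still tested because, as the rq_qos_done_bio() comment above
	 * notes, a bio can carry BIO_QOS_* flags set by a stacked bottom
	 * device even when the top-level queue never enabled QoS.
	 */
	return unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
	       q->rq_qos;
}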
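For the new blk_mq_default_nr_requests() helper in blk-mq.h, the comment above gives the rule: twice the smaller of the hardware queue depth and BLKDEV_DEFAULT_RQ, per hardware queue. As a quick worked example (assuming BLKDEV_DEFAULT_RQ is still 128): a driver with queue_depth = 1024 gets a default scheduler depth of 2 * 128 = 256, while one with queue_depth = 31 gets 2 * 31 = 62. The block/elevator.c hunk shows elevator_change() passing this value to blk_mq_alloc_sched_tags().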