Diffstat (limited to 'block')
43 files changed, 2122 insertions, 1545 deletions
diff --git a/block/Kconfig b/block/Kconfig index df8973bc0539..15027963472d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK source "block/partitions/Kconfig" -config BLK_MQ_PCI - def_bool PCI - -config BLK_MQ_VIRTIO - bool - depends on VIRTIO - default y - config BLK_PM def_bool PM diff --git a/block/Makefile b/block/Makefile index 3a941dc0d27f..c65f4da93702 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,13 +5,12 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-merge.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \ + blk-mq-tag.o blk-mq-dma.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ disk-events.o blk-ia-ranges.o early-lookup.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/bdev.c b/block/bdev.c index 889ec6e002d7..b77ddd12dc06 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1335,7 +1335,8 @@ void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) generic_fill_statx_atomic_writes(stat, queue_atomic_write_unit_min_bytes(bd_queue), - queue_atomic_write_unit_max_bytes(bd_queue)); + queue_atomic_write_unit_max_bytes(bd_queue), + 0); } stat->blksize = bdev_io_min(bdev); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index abd80dc13562..50e51047e1fe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) */ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - struct bfq_io_cq *icq; - unsigned long flags; - if (!current->io_context) return NULL; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(q)); - spin_unlock_irqrestore(&q->queue_lock, flags); - - return icq; + return icq_to_bic(ioc_lookup_icq(q)); } /* @@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - int depth; - unsigned limit = data->q->nr_requests; - unsigned int act_idx; + unsigned int limit, act_idx; /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) { - depth = 0; - } else { - depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; - limit = (limit * depth) >> bfqd->full_depth_shift; - } + if (op_is_sync(opf) && !op_is_write(opf)) + limit = data->q->nr_requests; + else + limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { /* Fast path to check if bfqq is already allocated. */ @@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * available requests and thus starve other entities. 
*/ if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { - depth = 1; + limit = 1; break; } } + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); - if (depth) - data->shallow_depth = depth; + __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit); + + if (limit < data->q->nr_requests) + data->shallow_depth = limit; } static struct bfq_queue * @@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *free = NULL; - /* - * bfq_bic_lookup grabs the queue_lock: invoke it now and - * store its return value for later use, to avoid nesting - * queue_lock inside the bfqd->lock. We assume that the bic - * returned by bfq_bic_lookup does not go away before - * bfqd->lock is taken. - */ struct bfq_io_cq *bic = bfq_bic_lookup(q); + struct request *free = NULL; bool ret; spin_lock_irq(&bfqd->lock); @@ -5863,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, goto out; } - bfqq = kmem_cache_alloc_node(bfq_pool, - GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, bfqd->queue->node); if (bfqq) { @@ -7128,9 +7111,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) */ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int depth = 1U << bt->sb.shift; + unsigned int nr_requests = bfqd->queue->nr_requests; - bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -7142,13 +7124,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max(depth >> 1, 1U); + bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); + bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -7158,9 +7140,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * shortage. 
*/ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); + bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); + bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) @@ -7210,8 +7192,8 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); - clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); - wbt_enable_default(bfqd->queue->disk); + blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue); + set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags); kfree(bfqd); } @@ -7232,22 +7214,16 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->sched_data.bfq_class_idle_last_service = jiffies; } -static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct bfq_data *bfqd; - struct elevator_queue *eq; unsigned int i; struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { - kobject_put(&eq->kobj); + if (!bfqd) return -ENOMEM; - } + eq->elevator_data = bfqd; spin_lock_irq(&q->queue_lock); @@ -7397,7 +7373,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); - set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); + blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); @@ -7405,7 +7381,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) out_free: kfree(bfqd); - kobject_put(&eq->kobj); return -ENOMEM; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 687a3a7ba784..34a498e6b2a5 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data { */ bool saved_IO_bound; - u64 saved_io_start_time; - u64 saved_tot_idle_time; - /* * Same purpose as the previous fields for the values of the * field keeping the queue's belonging to a large burst @@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data { */ unsigned int saved_weight; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Similar to previous fields: save wr information. */ @@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data { unsigned long saved_last_wr_start_finish; unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + unsigned int saved_wr_cur_max_time; /* Save also injection state */ - u64 saved_last_serv_time_ns; unsigned int saved_inject_limit; unsigned long saved_decrease_time_jif; + u64 saved_last_serv_time_ns; /* candidate queue for a stable merge (due to close creation time) */ struct bfq_queue *stable_merge_bfqq; @@ -813,8 +813,7 @@ struct bfq_data { * Depth limits used in bfq_limit_depth (see comments on the * function) */ - unsigned int word_depths[2][2]; - unsigned int full_depth_shift; + unsigned int async_depths[2][2]; /* * Number of independent actuators. 
This is equal to 1 in diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 9c6657664792..687952f63bbb 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: - return bi->tuple_size == sizeof(struct crc64_pi_tuple); + return bi->metadata_size == sizeof(struct crc64_pi_tuple); case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: - return bi->tuple_size == sizeof(struct t10_pi_tuple); + return bi->metadata_size == sizeof(struct t10_pi_tuple); default: pr_warn_once("%s: unknown integrity checksum type:%d\n", __func__, bi->csum_type); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 43ef6bd06c85..6b077ca937f6 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -127,10 +127,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; - bool same_page = false; - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - &same_page)) { + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset)) { bip->bip_iter.bi_size += len; return len; } @@ -156,10 +157,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL(bio_integrity_add_page); static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, - int nr_vecs, unsigned int len, - unsigned int direction) + int nr_vecs, unsigned int len) { - bool write = direction == ITER_SOURCE; + bool write = op_is_write(bio_op(bio)); struct bio_integrity_payload *bip; struct iov_iter iter; void *buf; @@ -170,7 +170,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, return -ENOMEM; if (write) { - iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, nr_vecs, len); if (!copy_from_iter_full(buf, len, &iter)) { ret = -EFAULT; goto free_buf; @@ -266,7 +266,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; size_t offset, bytes = iter->count; - unsigned int direction, nr_bvecs; + unsigned int nr_bvecs; int ret, nr_vecs; bool copy; @@ -275,11 +275,6 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) return -E2BIG; - if (bio_data_dir(bio) == READ) - direction = ITER_DEST; - else - direction = ITER_SOURCE; - nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; @@ -302,8 +297,7 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) copy = true; if (copy) - ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, - direction); + ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes); else ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes); if (ret) diff --git a/block/bio.c b/block/bio.c index 4be592d37fb6..3b371a5da159 100644 --- a/block/bio.c +++ b/block/bio.c @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; + bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -652,13 +653,13 @@ static void bio_truncate(struct bio *bio, unsigned new_size) bio_for_each_segment(bv, bio, 
iter) { if (done + bv.bv_len > new_size) { - unsigned offset; + size_t offset; if (!truncated) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, bv.bv_offset + offset, + memzero_page(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } @@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { @@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, - unsigned int len, unsigned int off, bool *same_page) + unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -928,12 +930,8 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; - if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) - return false; - *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & - PAGE_MASK)); - if (!*same_page) { + if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) @@ -953,8 +951,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page) + struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); @@ -964,7 +961,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; - return bvec_try_merge_page(bv, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset); } /** @@ -983,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; @@ -990,6 +990,22 @@ void __bio_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(__bio_add_page); /** + * bio_add_virt_nofail - add data in the direct kernel mapping to a bio + * @bio: destination bio + * @vaddr: data to add + * @len: length of the data to add, may cross pages + * + * Add the data at @vaddr to @bio. The caller must have ensure a segment + * is available for the added data. No merging into an existing segment + * will be performed. 
+ */ +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) +{ + __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); +} +EXPORT_SYMBOL_GPL(bio_add_virt_nofail); + +/** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add @@ -1002,18 +1018,21 @@ EXPORT_SYMBOL_GPL(__bio_add_page); int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - bool same_page = false; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, &same_page)) { - bio->bi_iter.bi_size += len; - return len; + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_page(bv, page, len, offset)) { + bio->bi_iter.bi_size += len; + return len; + } } if (bio->bi_vcnt >= bio->bi_max_vecs) @@ -1058,6 +1077,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, } EXPORT_SYMBOL(bio_add_folio); +/** + * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio and return how many bytes were added. + * This may be less than the amount originally asked. Returns 0 if no data + * could be added to @bio. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) +{ + unsigned int offset = offset_in_page(vaddr); + + len = min(len, PAGE_SIZE - offset); + if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) + return 0; + if (op_is_write(bio_op(bio))) + flush_kernel_vmap_range(vaddr, len); + return len; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); + +/** + * bio_add_vmalloc - add a vmalloc region to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio. Return %true on success or %false if + * @bio does not have enough space for the payload. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. 
+ */ +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) +{ + do { + unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); + + if (!added) + return false; + vaddr += added; + len -= added; + } while (len); + + return true; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc); + void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; @@ -1088,27 +1162,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, - size_t offset) -{ - bool same_page = false; - - if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) - return -EIO; - - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - folio_page(folio, 0), len, offset, - &same_page)) { - bio->bi_iter.bi_size += len; - if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) - unpin_user_folio(folio, 1); - return 0; - } - bio_add_folio_nofail(bio, folio, len, offset); - return 0; -} - static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, @@ -1203,6 +1256,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); + unsigned int old_vcnt = bio->bi_vcnt; folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; @@ -1215,7 +1269,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); - bio_iov_add_folio(bio, folio, len, folio_offset); + if (!bio_add_folio(bio, folio, len, folio_offset)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + /* + * We're adding another fragment of a page that already + * was part of the last segment. Undo our pin as the + * page was pinned when an earlier fragment of it was + * added to the bio and __bio_release_pages expects a + * single pin per page. + */ + if (offset && bio->bi_vcnt == old_vcnt) + unpin_user_folio(folio, 1); + } offset = 0; } @@ -1301,6 +1371,36 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +/** + * bdev_rw_virt - synchronously read into / write from kernel mapping + * @bdev: block device to access + * @sector: sector to access + * @data: data to read/write + * @len: length in byte to read/write + * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) + * + * Performs synchronous I/O to @bdev for @data/@len. @data must be in + * the kernel direct mapping and not a vmalloc address. 
+ */ +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op) +{ + struct bio_vec bv; + struct bio bio; + int error; + + if (WARN_ON_ONCE(is_vmalloc_addr(data))) + return -EIO; + + bio_init(&bio, bdev, &bv, 1, op); + bio.bi_iter.bi_sector = sector; + bio_add_virt_nofail(&bio, data, len); + error = submit_bio_wait(&bio); + bio_uninit(&bio); + return error; +} +EXPORT_SYMBOL_GPL(bdev_rw_virt); + static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ce93706555c5..fe9ebd6a2e14 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -394,7 +394,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -1074,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) /* * For covering concurrent parent blkg update from blkg_release(). * - * When flushing from cgroup, cgroup_rstat_lock is always held, so - * this lock won't cause contention most of time. + * When flushing from cgroup, the subsystem rstat lock is always held, + * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); @@ -1144,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. 
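For illustration only (not taken from this series): a minimal sketch of a caller of the new bdev_rw_virt() helper, assuming a kmalloc'ed buffer and a single-sector read. For vmalloc buffers the series instead adds bio_add_vmalloc()/bio_add_vmalloc_chunk(), which handle the per-page lookup and cache flushing.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Synchronously read one sector into a direct-mapped (kmalloc'ed) buffer. */
static int demo_read_sector(struct block_device *bdev, sector_t sector, void *buf)
{
	/*
	 * buf must live in the kernel direct mapping; bdev_rw_virt() warns
	 * and returns -EIO for vmalloc addresses.
	 */
	return bdev_rw_virt(bdev, sector, buf, SECTOR_SIZE, REQ_OP_READ);
}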
* @@ -1253,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else - cgroup_rstat_flush(blkcg->css.cgroup); + css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -1467,7 +1467,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1630,7 +1630,7 @@ retry: pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); } if (!pd) { @@ -2243,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/block/blk-core.c b/block/blk-core.c index e8cc270a453f..fdac48aec5ef 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -381,7 +381,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) static void blk_rq_timed_out_timer(struct timer_list *t) { - struct request_queue *q = from_timer(q, t, timeout); + struct request_queue *q = timer_container_of(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } @@ -1018,7 +1018,7 @@ again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && - (end || part_in_flight(part))) + (end || bdev_count_inflight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index f154be0b575a..005c9157ffb3 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 94a155912bf1..81918f6e0cae 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -501,6 +501,7 @@ int blk_crypto_derive_sw_secret(struct block_device *bdev, blk_crypto_hw_exit(profile); return err; } +EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret); int blk_crypto_import_key(struct blk_crypto_profile *profile, const u8 *raw_key, size_t raw_key_size, @@ -520,6 +521,7 @@ int blk_crypto_import_key(struct blk_crypto_profile *profile, blk_crypto_hw_exit(profile); return ret; } +EXPORT_SYMBOL_GPL(blk_crypto_import_key); int blk_crypto_generate_key(struct blk_crypto_profile *profile, u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) @@ -537,6 +539,7 @@ int blk_crypto_generate_key(struct blk_crypto_profile *profile, blk_crypto_hw_exit(profile); return ret; } +EXPORT_SYMBOL_GPL(blk_crypto_generate_key); int blk_crypto_prepare_key(struct blk_crypto_profile *profile, const u8 *lt_key, size_t lt_key_size, @@ -556,6 +559,7 @@ int blk_crypto_prepare_key(struct blk_crypto_profile *profile, blk_crypto_hw_exit(profile); return ret; } +EXPORT_SYMBOL_GPL(blk_crypto_prepare_key); /** * blk_crypto_intersect_capabilities() - restrict supported 
crypto capabilities diff --git a/block/blk-integrity.c b/block/blk-integrity.c index a1678f0a9f81..056b8948369d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -13,6 +13,7 @@ #include <linux/scatterlist.h> #include <linux/export.h> #include <linux/slab.h> +#include <linux/t10-pi.h> #include "blk.h" @@ -54,6 +55,73 @@ new_segment: return segments; } +int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, + struct logical_block_metadata_cap __user *argp) +{ + struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct logical_block_metadata_cap meta_cap = {}; + size_t usize = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || + _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || + _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || + _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + return -ENOIOCTLCMD; + + if (!bi) + goto out; + + if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) + meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG; + meta_cap.lbmd_interval = 1 << bi->interval_exp; + meta_cap.lbmd_size = bi->metadata_size; + meta_cap.lbmd_pi_size = bi->pi_tuple_size; + meta_cap.lbmd_pi_offset = bi->pi_offset; + meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size; + if (meta_cap.lbmd_opaque_size && !bi->pi_offset) + meta_cap.lbmd_opaque_offset = bi->pi_tuple_size; + + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_NONE: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_NONE; + break; + case BLK_INTEGRITY_CSUM_IP: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_IP; + break; + case BLK_INTEGRITY_CSUM_CRC: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC16_T10DIF; + break; + case BLK_INTEGRITY_CSUM_CRC64: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC64_NVME; + break; + } + + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + meta_cap.lbmd_app_tag_size = 2; + + if (bi->flags & BLK_INTEGRITY_REF_TAG) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct crc64_pi_tuple, ref_tag); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct t10_pi_tuple, ref_tag); + break; + default: + break; + } + } + +out: + return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap), + NULL); +} + /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @rq: request to map @@ -117,13 +185,8 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, { int ret; struct iov_iter iter; - unsigned int direction; - if (op_is_write(req_op(rq))) - direction = ITER_DEST; - else - direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); + iov_iter_ubuf(&iter, rq_data_dir(rq), ubuf, bytes); ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; @@ -244,7 +307,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - if (!bi->tuple_size) + if (!bi->metadata_size) return sysfs_emit(page, "none\n"); return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index ce82770c72ab..9fda3906e5f5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_BLK_ICQ /** - * ioc_lookup_icq - lookup io_cq from ioc + * ioc_lookup_icq - lookup io_cq from ioc in io issue path * @q: the associated request_queue 
* * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called - * with @q->queue_lock held. + * from io issue path, either return NULL if current issue io to @q for the + * first time, or return a valid icq. */ struct io_cq *ioc_lookup_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq; - lockdep_assert_held(&q->queue_lock); - /* * icq's are indexed from @ioc using radix tree and hint pointer, - * both of which are protected with RCU. All removals are done - * holding both q and ioc locks, and we're holding q lock - if we - * find a icq which points to us, it's guaranteed to be valid. + * both of which are protected with RCU, io issue path ensures that + * both request_queue and current task are valid, the found icq + * is guaranteed to be valid until the io is done. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); @@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) task_unlock(current); } else { get_io_context(ioc); - - spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(q); - spin_unlock_irq(&q->queue_lock); } if (!icq) { diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 42c1e0b9a68f..2f8fdecdd7a9 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -658,7 +658,8 @@ static const struct rq_qos_ops blkcg_iolatency_ops = { static void blkiolatency_timer_fn(struct timer_list *t) { - struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); + struct blk_iolatency *blkiolat = timer_container_of(blkiolat, t, + timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; u64 now = blk_time_get_ns(); diff --git a/block/blk-map.c b/block/blk-map.c index d2f22744b3d1..23e5d5ebe59e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio) kfree(bio); } -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
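For illustration only: a userspace sketch querying the new FS_IOC_GETLBMD_CAP ioctl backed by blk_get_meta_cap() (added in block/blk-integrity.c above). The device path is an example and the UAPI definitions are assumed to be exported via <linux/fs.h>.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* assumed location of FS_IOC_GETLBMD_CAP */

int main(void)
{
	struct logical_block_metadata_cap cap = { 0 };
	int fd = open("/dev/nvme0n1", O_RDONLY);	/* example device */

	if (fd < 0 || ioctl(fd, FS_IOC_GETLBMD_CAP, &cap) < 0) {
		perror("FS_IOC_GETLBMD_CAP");
		return 1;
	}
	printf("%u metadata bytes per %u-byte block, PI size %u at offset %u\n",
	       cap.lbmd_size, cap.lbmd_interval, cap.lbmd_pi_size,
	       cap.lbmd_pi_offset);
	return 0;
}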
- */ -static struct bio *bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; + unsigned int nr_vecs = bio_add_max_vecs(data, len); struct bio *bio; - bio = bio_kmalloc(nr_pages, gfp_mask); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + if (is_vmalloc_addr(data)) { bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_page(bio, page, bytes, offset) < bytes) { - /* we don't support partial mappings */ + if (!bio_add_vmalloc(bio, data, len)) { bio_uninit(bio); kfree(bio); return ERR_PTR(-EINVAL); } - - data += bytes; - len -= bytes; - offset = 0; + } else { + bio_add_virt_nofail(bio, data, len); } - bio->bi_end_io = bio_map_kern_endio; return bio; } @@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio) /** * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes + * @op: bio/request operation * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. 
*/ -static struct bio *bio_copy_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask, int reading) +static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); while (len) { struct page *page; @@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (!page) goto cleanup; - if (!reading) + if (op_is_write(op)) memcpy(page_address(page), p, bytes); if (bio_add_page(bio, page, bytes, 0) < bytes) @@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, p += bytes; } - if (reading) { + if (op_is_write(op)) { + bio->bi_end_io = bio_copy_kern_endio; + } else { bio->bi_end_io = bio_copy_kern_endio_read; bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; } return bio; @@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (map_data) copy = true; - else if (blk_queue_may_bounce(q)) - copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (iov_iter_is_bvec(iter)) @@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests - * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data @@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * buffer is used. Can be called multiple times to append multiple * buffers. */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp_mask) { - int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; - if (len > (queue_max_hw_sectors(q) << 9)) + if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT)) return -EINVAL; if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || - blk_queue_may_bounce(q)) - bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf)) + bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask); else - bio = bio_map_kern(q, kbuf, len, gfp_mask); + bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); - ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { bio_uninit(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index fdd4efb54c6c..70d704615be5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,7 +7,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> -#include <linux/scatterlist.h> #include <linux/part_stat.h> #include <linux/blk-cgroup.h> @@ -226,27 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio, } /** - * get_max_segment_size() - maximum number of bytes to add as a single segment - * @lim: Request queue limits. 
- * @paddr: address of the range to add - * @len: maximum length available to add at @paddr - * - * Returns the maximum number of bytes of the range starting at @paddr that can - * be added to a single segment. - */ -static inline unsigned get_max_segment_size(const struct queue_limits *lim, - phys_addr_t paddr, unsigned int len) -{ - /* - * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 - * after having calculated the minimum. - */ - return min_t(unsigned long, len, - min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), - (unsigned long)lim->max_segment_size - 1) + 1); -} - -/** * bvec_split_segs - verify whether or not a bvec should be split in the middle * @lim: [in] queue limits to split based on * @bv: [in] bvec to examine @@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return nr_phys_segs; } -struct phys_vec { - phys_addr_t paddr; - u32 len; -}; - -static bool blk_map_iter_next(struct request *req, - struct req_iterator *iter, struct phys_vec *vec) -{ - unsigned int max_size; - struct bio_vec bv; - - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - - if (!iter->iter.bi_size) - return false; - - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); - vec->paddr = bvec_phys(&bv); - max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); - bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); - - /* - * If we are entirely done with this bi_io_vec entry, check if the next - * one could be merged into it. This typically happens when moving to - * the next bio, but some callers also don't pack bvecs tight. - */ - while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { - struct bio_vec next; - - if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - } - - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); - if (bv.bv_len + next.bv_len > max_size || - !biovec_phys_mergeable(req->q, &bv, &next)) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); - } - - vec->len = bv.bv_len; - return true; -} - -static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, - struct scatterlist *sglist) -{ - if (!*sg) - return sglist; - - /* - * If the driver previously mapped a shorter list, we could see a - * termination bit prematurely unless it fully inits the sg table - * on each mapping. We KNOW that there must be more entries here - * or the driver would be buggy, so force clear the termination bit - * to avoid doing a full sg_init_table() in drivers for each command. - */ - sg_unmark_end(*sg); - return sg_next(*sg); -} - -/* - * Map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries. 
- */ -int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, - struct scatterlist **last_sg) -{ - struct req_iterator iter = { - .bio = rq->bio, - }; - struct phys_vec vec; - int nsegs = 0; - - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - - while (blk_map_iter_next(rq, &iter, &vec)) { - *last_sg = blk_next_sg(last_sg, sglist); - sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, - offset_in_page(vec.paddr)); - nsegs++; - } - - if (*last_sg) - sg_mark_end(*last_sg); - - /* - * Something must have been wrong if the figured number of - * segment is bigger than number of req's physical segments - */ - WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); - - return nsegs; -} -EXPORT_SYMBOL(__blk_rq_map_sg); - static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { @@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q, if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; + if (req->bio->bi_write_stream != next->bio->bi_write_stream) + return NULL; if (req->bio->bi_ioprio != next->bio->bi_ioprio) return NULL; if (!blk_atomic_write_mergeable_rqs(req, next)) @@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; + if (rq->bio->bi_write_stream != bio->bi_write_stream) + return false; if (rq->bio->bi_ioprio != bio->bi_ioprio) return false; if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) @@ -1127,20 +998,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (!plug || rq_list_empty(&plug->mq_list)) return false; - rq_list_for_each(&plug->mq_list, rq) { - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; - break; - } + rq = plug->mq_list.tail; + if (rq->q == q) + return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK; + else if (!plug->multiple_queues) + return false; - /* - * Only keep iterating plug list for merges if we have multiple - * queues - */ - if (!plug->multiple_queues) - break; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q != q) + continue; + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; } return false; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 444798c5374f..705da074ad6c 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -12,16 +12,56 @@ #include <linux/cpu.h> #include <linux/group_cpus.h> #include <linux/device/bus.h> +#include <linux/sched/isolation.h> #include "blk.h" #include "blk-mq.h" +static unsigned int blk_mq_num_queues(const struct cpumask *mask, + unsigned int max_queues) +{ + unsigned int num; + + num = cpumask_weight(mask); + return min_not_zero(num, max_queues); +} + +/** + * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of possible CPUs. + */ +unsigned int blk_mq_num_possible_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_possible_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues); + +/** + * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. 
If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of online CPUs. + */ +unsigned int blk_mq_num_online_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_online_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_online_queues); + void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { const struct cpumask *masks; - unsigned int queue, cpu; + unsigned int queue, cpu, nr_masks; - masks = group_cpus_evenly(qmap->nr_queues); + masks = group_cpus_evenly(qmap->nr_queues, &nr_masks); if (!masks) { for_each_possible_cpu(cpu) qmap->mq_map[cpu] = qmap->queue_offset; @@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap) } for (queue = 0; queue < qmap->nr_queues; queue++) { - for_each_cpu(cpu, &masks[queue]) + for_each_cpu(cpu, &masks[queue % nr_masks]) qmap->mq_map[cpu] = qmap->queue_offset + queue; } kfree(masks); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3421b5521fe2..7ed3e71f2fc0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -93,6 +93,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(DISABLE_WBT_DEF), + QUEUE_FLAG_NAME(NO_ELV_SWITCH), }; #undef QUEUE_FLAG_NAME @@ -519,7 +521,7 @@ CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; - void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private; + void *data = debugfs_get_aux(m->file); return attr->show(data, m); } @@ -529,7 +531,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, { struct seq_file *m = file->private_data; const struct blk_mq_debugfs_attr *attr = m->private; - void *data = d_inode(file->f_path.dentry->d_parent)->i_private; + void *data = debugfs_get_aux(file); /* * Attributes that only implement .seq_ops are read-only and 'attr' is @@ -544,7 +546,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, static int blk_mq_debugfs_open(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; - void *data = d_inode(file->f_path.dentry->d_parent)->i_private; + void *data = debugfs_get_aux(file); struct seq_file *m; int ret; @@ -610,11 +612,9 @@ static void debugfs_create_files(struct dentry *parent, void *data, if (IS_ERR_OR_NULL(parent)) return; - d_inode(parent)->i_private = data; - for (; attr->name; attr++) - debugfs_create_file(attr->name, attr->mode, parent, - (void *)attr, &blk_mq_debugfs_fops); + debugfs_create_file_aux(attr->name, attr->mode, parent, + (void *)attr, data, &blk_mq_debugfs_fops); } void blk_mq_debugfs_register(struct request_queue *q) @@ -624,20 +624,9 @@ void blk_mq_debugfs_register(struct request_queue *q) debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); - /* - * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir - * didn't exist yet (because we don't know what to name the directory - * until the queue is registered to a gendisk). - */ - if (q->elevator && !q->sched_debugfs_dir) - blk_mq_debugfs_register_sched(q); - - /* Similarly, blk_mq_init_hctx() couldn't do this previously. 
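For illustration only: the newly exported blk_mq_num_possible_queues() simply clamps the possible-CPU count to the controller limit; a driver might size its queue map as below (the limit parameter is hypothetical).

#include <linux/blk-mq.h>

/* One hw queue per possible CPU, capped by what the hardware advertises. */
static unsigned int demo_nr_hw_queues(unsigned int hw_queue_limit)
{
	/* a limit of 0 means "no cap", per the helper's kernel-doc */
	return blk_mq_num_possible_queues(hw_queue_limit);
}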
*/ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); - if (q->elevator && !hctx->sched_debugfs_dir) - blk_mq_debugfs_register_sched_hctx(q, hctx); } if (q->rq_qos) { diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c new file mode 100644 index 000000000000..ad283017caef --- /dev/null +++ b/block/blk-mq-dma.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Christoph Hellwig + */ +#include <linux/blk-mq-dma.h> +#include "blk.h" + +struct phys_vec { + phys_addr_t paddr; + u32 len; +}; + +static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, + struct phys_vec *vec) +{ + unsigned int max_size; + struct bio_vec bv; + + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; + } + + if (!iter->iter.bi_size) + return false; + + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. + */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } + + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + + vec->len = bv.bv_len; + return true; +} + +/* + * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page + * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so + * we need to ensure our segments are aligned to this as well. + * + * Note that there is no point in using the slightly more complicated IOVA based + * path for single segment mappings. 
+ */ +static inline bool blk_can_dma_map_iova(struct request *req, + struct device *dma_dev) +{ + return !((queue_virt_boundary(req->q) + 1) & + dma_get_merge_boundary(dma_dev)); +} + +static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); + iter->len = vec->len; + return true; +} + +static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), + offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + if (dma_mapping_error(dma_dev, iter->addr)) { + iter->status = BLK_STS_RESOURCE; + return false; + } + iter->len = vec->len; + return true; +} + +static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + struct phys_vec *vec) +{ + enum dma_data_direction dir = rq_dma_dir(req); + unsigned int mapped = 0; + int error; + + iter->addr = state->addr; + iter->len = dma_iova_size(state); + + do { + error = dma_iova_link(dma_dev, state, vec->paddr, mapped, + vec->len, dir, 0); + if (error) + break; + mapped += vec->len; + } while (blk_map_iter_next(req, &iter->iter, vec)); + + error = dma_iova_sync(dma_dev, state, 0, mapped); + if (error) { + iter->status = errno_to_blk_status(error); + return false; + } + + return true; +} + +/** + * blk_rq_dma_map_iter_start - map the first DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the + * caller and don't need to be initialized. @state needs to be stored for use + * at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + unsigned int total_len = blk_rq_payload_bytes(req); + struct phys_vec vec; + + iter->iter.bio = req->bio; + iter->iter.iter = req->bio->bi_iter; + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); + iter->status = BLK_STS_OK; + + /* + * Grab the first segment ASAP because we'll need it to check for P2P + * transfers. + */ + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. 
+ */ + req->cmd_flags &= ~REQ_P2PDMA; + break; + default: + iter->status = BLK_STS_INVAL; + return false; + } + } + + if (blk_can_dma_map_iova(req, dma_dev) && + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); + +/** + * blk_rq_dma_map_iter_next - map the next DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next mapping after a previous call to + * blk_rq_dma_map_iter_start(). See there for a detailed description of the + * arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); + +static inline struct scatterlist * +blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. 
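For illustration only: the calling convention documented above for the new blk-mq DMA iterator, sketched as a driver mapping loop (the per-segment programming helper is hypothetical).

#include <linux/blk-mq-dma.h>

static blk_status_t demo_dma_map_rq(struct device *dma_dev, struct request *req,
				    struct dma_iova_state *state)
{
	struct blk_dma_iter iter;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;	/* BLK_STS_OK here just means nothing to map */

	do {
		/* hand one mapped segment (iter.addr / iter.len) to the hardware */
		demo_program_segment(iter.addr, iter.len);	/* hypothetical helper */
	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

	/* false from iter_next means either completion or an error left in iter.status */
	return iter.status;
}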
+ */ +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) +{ + struct req_iterator iter = { + .bio = rq->bio, + }; + struct phys_vec vec; + int nsegs = 0; + + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } + + if (*last_sg) + sg_mark_end(*last_sg); + + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); + + return nsegs; +} +EXPORT_SYMBOL(__blk_rq_map_sg); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 109611445d40..e2ce4a28e6c9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); - unsigned int count = 0; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } - count++; } list_splice_tail_init(rq_list, &hctx_list); dispatch: - return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); + return blk_mq_dispatch_rq_list(hctx, &hctx_list, false); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ @@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { - dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false); } if (busy) @@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) + if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true)) return 0; need_dispatch = true; } else { @@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + blk_mq_dispatch_rq_list(hctx, &rq_list, true); return 0; } @@ -376,68 +374,177 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) +/* called in queue's release handler, tagset has gone away */ +static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { - if (blk_mq_is_shared_tags(q->tag_set->flags)) { - hctx->sched_tags = q->sched_shared_tags; - return 0; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i; - hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, - q->nr_requests); + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = NULL; - if (!hctx->sched_tags) - 
return -ENOMEM; - return 0; + if (blk_mq_is_shared_tags(flags)) + q->sched_shared_tags = NULL; } -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) +void blk_mq_sched_reg_debugfs(struct request_queue *q) { - blk_mq_free_rq_map(queue->sched_shared_tags); - queue->sched_shared_tags = NULL; + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_sched(q); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } -/* called in queue's release handler, tagset has gone away */ -static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) +void blk_mq_sched_unreg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) { - if (!blk_mq_is_shared_tags(flags)) - blk_mq_free_rq_map(hctx->sched_tags); - hctx->sched_tags = NULL; - } + mutex_lock(&q->debugfs_mutex); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_unregister_sched_hctx(hctx); + blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); +} + +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set) +{ + unsigned long i; + + /* Shared tags are stored at index 0 in @tags. */ + if (blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); + else { + for (i = 0; i < et->nr_hw_queues; i++) + blk_mq_free_map_and_rqs(set, et->tags[i], i); } - if (blk_mq_is_shared_tags(flags)) - blk_mq_exit_sched_shared_tags(q); + kfree(et); } -static int blk_mq_init_sched_shared_tags(struct request_queue *queue) +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set) { - struct blk_mq_tag_set *set = queue->tag_set; + struct request_queue *q; + struct elevator_tags *et; + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. + */ + if (q->elevator) { + et = xa_load(et_table, q->id); + if (unlikely(!et)) + WARN_ON_ONCE(1); + else + blk_mq_free_sched_tags(et, set); + } + } +} + +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues) +{ + unsigned int nr_tags; + int i; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + if (blk_mq_is_shared_tags(set->flags)) + nr_tags = 1; + else + nr_tags = nr_hw_queues; + + et = kmalloc(sizeof(struct elevator_tags) + + nr_tags * sizeof(struct blk_mq_tags *), gfp); + if (!et) + return NULL; /* - * Set initial depth at max so that we don't need to reallocate for - * updating nr_requests. + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. */ - queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, - BLK_MQ_NO_HCTX_IDX, - MAX_SCHED_RQ); - if (!queue->sched_shared_tags) - return -ENOMEM; + et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, + BLKDEV_DEFAULT_RQ); + et->nr_hw_queues = nr_hw_queues; + + if (blk_mq_is_shared_tags(set->flags)) { + /* Shared tags are stored at index 0 in @tags. 
*/ + et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!et->tags[0]) + goto out; + } else { + for (i = 0; i < et->nr_hw_queues; i++) { + et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, + et->nr_requests); + if (!et->tags[i]) + goto out_unwind; + } + } - blk_mq_tag_update_sched_shared_tags(queue); + return et; +out_unwind: + while (--i >= 0) + blk_mq_free_map_and_rqs(set, et->tags[i], i); +out: + kfree(et); + return NULL; +} + +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues) +{ + struct request_queue *q; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. + */ + if (q->elevator) { + et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + if (!et) + goto out_unwind; + if (xa_insert(et_table, q->id, et, gfp)) + goto out_free_tags; + } + } return 0; +out_free_tags: + blk_mq_free_sched_tags(et, set); +out_unwind: + list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { + if (q->elevator) { + et = xa_load(et_table, q->id); + if (et) + blk_mq_free_sched_tags(et, set); + } + } + return -ENOMEM; } /* caller must have a reference to @e, will grab another one if successful */ -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et) { unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; @@ -445,56 +552,44 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - /* - * Default to double of smaller one between hw queue_depth and 128, - * since we don't split into sync/async like the old code did. - * Additionally, this is a per-hw queue depth. - */ - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_DEFAULT_RQ); + eq = elevator_alloc(q, e, et); + if (!eq) + return -ENOMEM; + + q->nr_requests = et->nr_requests; if (blk_mq_is_shared_tags(flags)) { - ret = blk_mq_init_sched_shared_tags(q); - if (ret) - return ret; + /* Shared tags are stored at index 0 in @et->tags. 
*/ + q->sched_shared_tags = et->tags[0]; + blk_mq_tag_update_sched_shared_tags(q); } queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); - if (ret) - goto err_free_map_and_rqs; + if (blk_mq_is_shared_tags(flags)) + hctx->sched_tags = q->sched_shared_tags; + else + hctx->sched_tags = et->tags[i]; } - ret = e->ops.init_sched(q, e); + ret = e->ops.init_sched(q, eq); if (ret) - goto err_free_map_and_rqs; - - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched(q); - mutex_unlock(&q->debugfs_mutex); + goto out; queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { - eq = q->elevator; - blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched_hctx(q, hctx); - mutex_unlock(&q->debugfs_mutex); } - return 0; -err_free_map_and_rqs: - blk_mq_sched_free_rqs(q); +out: blk_mq_sched_tags_teardown(q, flags); - + kobject_put(&eq->kobj); q->elevator = NULL; return ret; } @@ -527,10 +622,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched_hctx(hctx); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; @@ -538,12 +629,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) flags = hctx->flags; } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched(q); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); + set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1326526bb733..b554e1d55950 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues); +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues); +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set); + static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) diff --git a/block/blk-mq.c b/block/blk-mq.c index c2697db59109..b67d6c02eceb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct request *rq, void *priv) +static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; @@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv) return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part) +void blk_mq_in_driver_rw(struct block_device 
*part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - - return mi.inflight[0] + mi.inflight[1]; -} - -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]) -{ - struct mq_inflight mi = { .part = part }; - - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - inflight[0] = mi.inflight[0]; - inflight[1] = mi.inflight[1]; + blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, + &mi); + inflight[READ] = mi.inflight[READ]; + inflight[WRITE] = mi.inflight[WRITE]; } #ifdef CONFIG_LOCKDEP @@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; int ret; @@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; @@ -879,7 +883,8 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -978,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error, /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } @@ -2080,7 +2086,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, - unsigned int nr_budgets) + bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; @@ -2102,7 +2108,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; @@ -2111,12 +2117,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bd.rq = rq; bd.last = list_empty(list); - /* - * once the request is queued to lld, no need to cover the - * budget any more - */ - if (nr_budgets) - nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: @@ -2150,7 +2150,11 @@ out: ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); - if (nr_budgets) + /* + * If the caller allocated budgets, free the budgets of the + * requests that have not yet been passed to the block driver. 
+ */ + if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); @@ -2778,15 +2782,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_issue_directly(hctx, rq, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug) +static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(&plug->mq_list); + while ((rq = rq_list_pop(rqs))) { + bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2817,15 +2821,64 @@ out: blk_mq_commit_rqs(hctx, queued, false); } -static void __blk_mq_flush_plug_list(struct request_queue *q, - struct blk_plug *plug) +static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; - q->mq_ops->queue_rqs(&plug->mq_list); + q->mq_ops->queue_rqs(rqs); +} + +static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, + struct rq_list *queue_rqs) +{ + struct request *rq = rq_list_pop(rqs); + struct request_queue *this_q = rq->q; + struct request **prev = &rqs->head; + struct rq_list matched_rqs = {}; + struct request *last = NULL; + unsigned depth = 1; + + rq_list_add_tail(&matched_rqs, rq); + while ((rq = *prev)) { + if (rq->q == this_q) { + /* move rq from rqs to matched_rqs */ + *prev = rq->rq_next; + rq_list_add_tail(&matched_rqs, rq); + depth++; + } else { + /* leave rq in rqs */ + prev = &rq->rq_next; + last = rq; + } + } + + rqs->tail = last; + *queue_rqs = matched_rqs; + return depth; +} + +static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) +{ + struct request_queue *q = rq_list_peek(rqs)->q; + + trace_block_unplug(q, depth, true); + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole list in one go. + * We already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. 
+ */ + if (q->mq_ops->queue_rqs) { + blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); + if (rq_list_empty(rqs)) + return; + } + + blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); } -static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; @@ -2835,7 +2888,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) LIST_HEAD(list); do { - struct request *rq = rq_list_pop(&plug->mq_list); + struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; @@ -2848,9 +2901,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) } list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(&plug->mq_list)); + } while (!rq_list_empty(rqs)); - plug->mq_list = requeue_list; + *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); @@ -2870,9 +2923,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_put(&this_hctx->queue->q_usage_counter); } +static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) +{ + do { + struct rq_list queue_rqs; + unsigned depth; + + depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); + blk_mq_dispatch_queue_requests(&queue_rqs, depth); + while (!rq_list_empty(&queue_rqs)) + blk_mq_dispatch_list(&queue_rqs, false); + } while (!rq_list_empty(rqs)); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request *rq; unsigned int depth; /* @@ -2887,34 +2952,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) depth = plug->rq_count; plug->rq_count = 0; - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q; - - rq = rq_list_peek(&plug->mq_list); - q = rq->q; - trace_block_unplug(q, depth, true); - - /* - * Peek first request and see if we have a ->queue_rqs() hook. - * If we do, we can dispatch the whole plug list in one go. We - * already know at this point that all requests belong to the - * same queue, caller must ensure that's the case. - */ - if (q->mq_ops->queue_rqs) { - blk_mq_run_dispatch_ops(q, - __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(&plug->mq_list)) - return; + if (!plug->has_elevator && !from_schedule) { + if (plug->multiple_queues) { + blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); + return; } - blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug)); + blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return; } do { - blk_mq_dispatch_plug_list(plug, from_schedule); + blk_mq_dispatch_list(&plug->mq_list, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } @@ -2969,8 +3019,14 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, { struct blk_mq_alloc_data data = { .q = q, - .nr_tags = 1, + .flags = 0, + .shallow_depth = 0, .cmd_flags = bio->bi_opf, + .rq_flags = 0, + .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -3080,8 +3136,6 @@ void blk_mq_submit_bio(struct bio *bio) goto new_request; } - bio = blk_queue_bounce(bio, q); - /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. 
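The multi-queue plug flush above leans on blk_mq_extract_queue_requests() peeling every request that targets one queue out of a mixed rq_list in a single pass, unlinking matched entries in place through a pointer-to-pointer walk. The stand-alone userspace sketch below shows only that idiom; struct rq, struct rq_list and the helpers are simplified stand-ins, not the kernel's request types.

#include <stdio.h>
#include <stddef.h>

struct rq {
	int q;			/* stand-in for rq->q */
	struct rq *next;	/* stand-in for rq->rq_next */
};

struct rq_list {
	struct rq *head;
	struct rq *tail;
};

static void rq_list_add_tail(struct rq_list *l, struct rq *rq)
{
	rq->next = NULL;
	if (l->tail)
		l->tail->next = rq;
	else
		l->head = rq;
	l->tail = rq;
}

/* Move every entry matching the queue of the first entry into @matched. */
static unsigned extract_queue_requests(struct rq_list *rqs, struct rq_list *matched)
{
	struct rq *rq = rqs->head;
	int this_q = rq->q;
	struct rq **prev;
	struct rq *last = NULL;
	unsigned depth = 1;

	rqs->head = rq->next;
	prev = &rqs->head;
	matched->head = matched->tail = NULL;
	rq_list_add_tail(matched, rq);

	while ((rq = *prev)) {
		if (rq->q == this_q) {
			*prev = rq->next;	/* unlink in place, keep walking */
			rq_list_add_tail(matched, rq);
			depth++;
		} else {
			prev = &rq->next;	/* leave it in @rqs */
			last = rq;
		}
	}
	rqs->tail = last;
	return depth;
}

int main(void)
{
	struct rq r[5] = { {1}, {2}, {1}, {3}, {1} };
	struct rq_list all = {0}, matched = {0};
	unsigned depth;
	int i;

	for (i = 0; i < 5; i++)
		rq_list_add_tail(&all, &r[i]);

	depth = extract_queue_requests(&all, &matched);
	printf("matched %u requests for queue %d\n", depth, matched.head->q);
	return 0;
}

Because nothing is reallocated, the caller can keep calling the extraction helper on the remainder until the mixed list is empty, which is exactly how the per-queue dispatch loop above consumes the plug list.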
@@ -3117,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: if (rq) { @@ -4094,8 +4150,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; - mutex_lock(&q->elevator_lock); - queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4200,8 +4254,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - mutex_unlock(&q->elevator_lock); } /* @@ -4505,16 +4557,9 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, - struct request_queue *q, bool lock) + struct request_queue *q) { - if (lock) { - /* protect against switching io scheduler */ - mutex_lock(&q->elevator_lock); - __blk_mq_realloc_hw_ctxs(set, q); - mutex_unlock(&q->elevator_lock); - } else { - __blk_mq_realloc_hw_ctxs(set, q); - } + __blk_mq_realloc_hw_ctxs(set, q); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4546,7 +4591,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q, false); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4563,8 +4608,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); - blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); + blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: @@ -4784,6 +4829,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_free_srcu; } + init_rwsem(&set->update_nr_hwq_lock); + ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, @@ -4924,90 +4971,68 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } /* - * request_queue and elevator_type pair. - * It is just used by __blk_mq_update_nr_hw_queues to cache - * the elevator_type associated with a request_queue. - */ -struct blk_mq_qe_pair { - struct list_head node; - struct request_queue *q; - struct elevator_type *type; -}; - -/* - * Cache the elevator_type in qe pair list and switch the - * io scheduler to 'none' + * Switch back to the elevator type stored in the xarray. */ -static bool blk_mq_elv_switch_none(struct list_head *head, - struct request_queue *q) +static void blk_mq_elv_switch_back(struct request_queue *q, + struct xarray *elv_tbl, struct xarray *et_tbl) { - struct blk_mq_qe_pair *qe; - - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); - if (!qe) - return false; - - /* Accessing q->elevator needs protection from ->elevator_lock. */ - mutex_lock(&q->elevator_lock); - - if (!q->elevator) { - kfree(qe); - goto unlock; - } + struct elevator_type *e = xa_load(elv_tbl, q->id); + struct elevator_tags *t = xa_load(et_tbl, q->id); - INIT_LIST_HEAD(&qe->node); - qe->q = q; - qe->type = q->elevator->type; - /* keep a reference to the elevator module as we'll switch back */ - __elevator_get(qe->type); - list_add(&qe->node, head); - elevator_disable(q); -unlock: - mutex_unlock(&q->elevator_lock); + /* The elv_update_nr_hw_queues unfreezes the queue. 
*/ + elv_update_nr_hw_queues(q, e, t); - return true; + /* Drop the reference acquired in blk_mq_elv_switch_none. */ + if (e) + elevator_put(e); } -static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, - struct request_queue *q) +/* + * Stores the elevator type in the xarray and sets the current elevator to + * none, using q->id as the index into the xarray. + */ +static int blk_mq_elv_switch_none(struct request_queue *q, + struct xarray *elv_tbl) { - struct blk_mq_qe_pair *qe; + int ret = 0; - list_for_each_entry(qe, head, node) - if (qe->q == q) - return qe; + lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); - return NULL; -} + /* + * Accessing q->elevator without holding q->elevator_lock is safe here + * because we're called from nr_hw_queue update which is protected by + * set->update_nr_hwq_lock in the writer context. So, scheduler update/ + * switch code (which acquires the same lock in the reader context) + * can't run concurrently. + */ + if (q->elevator) { -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - struct elevator_type *t; + ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); + if (WARN_ON_ONCE(ret)) + return ret; - qe = blk_lookup_qe_pair(head, q); - if (!qe) - return; - t = qe->type; - list_del(&qe->node); - kfree(qe); + /* + * Before we switch elevator to 'none', take a reference to + * the elevator module so that while nr_hw_queue update is + * running, no one can remove the elevator module. We drop + * that reference later when we switch the elevator back. + */ + __elevator_get(q->elevator->type); - mutex_lock(&q->elevator_lock); - elevator_switch(q, t); - /* drop the reference acquired in blk_mq_elv_switch_none */ - elevator_put(t); - mutex_unlock(&q->elevator_lock); + elevator_set_none(q); + } + return ret; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; - LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; + struct xarray elv_tbl, et_tbl; lockdep_assert_held(&set->tag_list_lock); @@ -5019,6 +5044,18 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, return; memflags = memalloc_noio_save(); + + xa_init(&et_tbl); + if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) + goto out_memalloc_restore; + + xa_init(&elv_tbl); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_debugfs_unregister_hctxs(q); + blk_mq_sysfs_unregister_hctxs(q); + } + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); @@ -5028,21 +5065,16 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, * updating the new sw to hw queue mappings.
*/ list_for_each_entry(q, &set->tag_list, tag_set_list) - if (!blk_mq_elv_switch_none(&head, q)) + if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back; - list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister_hctxs(q); - } - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) - goto reregister; + goto switch_back; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_realloc_hw_ctxs(set, q, true); + __blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -5057,19 +5089,22 @@ fallback: } blk_mq_map_swqueue(q); } +switch_back: + /* The blk_mq_elv_switch_back unfreezes queue for us. */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); -reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); - } -switch_back: - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(&head, q); + blk_mq_remove_hw_queues_cpuhp(q); + blk_mq_add_hw_queues_cpuhp(q); + } - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue_nomemrestore(q); + xa_destroy(&elv_tbl); + xa_destroy(&et_tbl); +out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ @@ -5079,9 +5114,11 @@ switch_back: void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { + down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); + up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 3011a78cf16a..affb2e14b56e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, - unsigned int); + bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -246,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]); +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 95982bc46ba1..848591fb3c57 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -2,6 +2,8 @@ #include "blk-rq-qos.h" +__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); + /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. 
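The block_rq_qos static key defined above keeps every rq-qos hook behind a patched-out branch until a policy is actually attached, so queues with no rq-qos policy pay only a NOP on the hot path while the q->rq_qos pointer check still guards correctness. Below is a minimal sketch of that jump-label pattern; it is not the blk-rq-qos code itself, the example_* names are invented, and it assumes a kernel build context.

#include <linux/jump_label.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_FALSE(example_feature_key);
static void *example_policy;		/* stands in for q->rq_qos */

static void example_attach_policy(void *policy)
{
	example_policy = policy;
	/* Patches the hot-path branch from a NOP into a jump. */
	static_branch_inc(&example_feature_key);
}

static void example_detach_policy(void)
{
	example_policy = NULL;
	/* Reverts to a NOP once the enable count drops back to zero. */
	static_branch_dec(&example_feature_key);
}

static void example_hot_path(void)
{
	/* Costs only a NOP while no policy has ever been attached. */
	if (static_branch_unlikely(&example_feature_key) && example_policy)
		pr_info("policy callback would run here\n");
}

The counted static_branch_inc()/static_branch_dec() variants are used rather than enable/disable so the branch only drops back to a NOP after the last policy on the last queue has been torn down, which matches how rq_qos_add() and rq_qos_exit() bump and drop the key in the hunks that follow.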
@@ -317,6 +319,7 @@ void rq_qos_exit(struct request_queue *q) struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); + static_branch_dec(&block_rq_qos); } mutex_unlock(&q->rq_qos_mutex); } @@ -343,6 +346,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; + static_branch_inc(&block_rq_qos); blk_mq_unfreeze_queue(q, memflags); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 37245c97ee61..39749f4066fb 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -12,6 +12,7 @@ #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; +extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, @@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos && !blk_rq_is_passthrough(rq)) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && + !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { - if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + if (static_branch_unlikely(&block_rq_qos) && + bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) @@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio) static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 4817e7ca03f8..07874e9b609f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -14,6 +14,8 @@ #include <linux/jiffies.h> #include <linux/gfp.h> #include <linux/dma-mapping.h> +#include <linux/t10-pi.h> +#include <linux/crc64.h> #include "blk.h" #include "blk-rq-qos.h" @@ -50,6 +52,8 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; 
lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_hw_wzeroes_unmap_sectors = UINT_MAX; + lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; } @@ -58,16 +62,24 @@ EXPORT_SYMBOL(blk_set_stacking_limits); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { + u64 io_opt = lim->io_opt; + /* * For read-ahead of large files to be effective, we need to read ahead - * at least twice the optimal I/O size. + * at least twice the optimal I/O size. For rotational devices that do + * not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O + * size to avoid falling back to the (rather inefficient) small default + * read-ahead size. * * There is no hardware limitation for the read-ahead size and the user * might have increased the read-ahead size through sysfs, so don't ever * decrease it. */ + if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL)) + io_opt = (u64)lim->max_sectors << SECTOR_SHIFT; + bdi->ra_pages = max3(bdi->ra_pages, - lim->io_opt * 2 / PAGE_SIZE, + io_opt * 2 >> PAGE_SHIFT, VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } @@ -114,7 +126,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; - if (!bi->tuple_size) { + if (!bi->metadata_size) { if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { pr_warn("invalid PI settings.\n"); @@ -124,11 +136,6 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } - if (lim->features & BLK_FEAT_BOUNCE_HIGH) { - pr_warn("no bounce buffer support for integrity metadata\n"); - return -EINVAL; - } - if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; @@ -140,6 +147,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return -EINVAL; } + if (bi->pi_tuple_size > bi->metadata_size) { + pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n", + bi->pi_tuple_size, + bi->metadata_size); + return -EINVAL; + } + + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_NONE: + if (bi->pi_tuple_size) { + pr_warn("pi_tuple_size must be 0 when checksum type \ + is none\n"); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for T10 PI: expected \ + %zu, got %u\n", + sizeof(struct t10_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for CRC64 PI: \ + expected %zu, got %u\n", + sizeof(struct crc64_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + } + if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); @@ -186,6 +229,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; + unsigned int atomic_write_hw_max_sectors = + lim->atomic_write_hw_max >> SECTOR_SHIFT; if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; @@ -207,6 +252,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) lim->atomic_write_hw_max)) goto unsupported; + if (WARN_ON_ONCE(lim->chunk_sectors && + atomic_write_hw_max_sectors > 
lim->chunk_sectors)) + goto unsupported; + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { @@ -271,8 +320,12 @@ int blk_validate_limits(struct queue_limits *lim) pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); return -EINVAL; } - if (lim->physical_block_size < lim->logical_block_size) + if (lim->physical_block_size < lim->logical_block_size) { lim->physical_block_size = lim->logical_block_size; + } else if (!is_power_of_2(lim->physical_block_size)) { + pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size); + return -EINVAL; + } /* * The minimum I/O size defaults to the physical block size unless @@ -338,15 +391,28 @@ int blk_validate_limits(struct queue_limits *lim) if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; + if (lim->max_hw_wzeroes_unmap_sectors && + lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors) + return -EINVAL; + lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors, + lim->max_user_wzeroes_unmap_sectors); + lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + /* + * When discard is not supported, discard_granularity should be reported + * as 0 to userspace. + */ + if (lim->max_discard_sectors) + lim->discard_granularity = + max(lim->discard_granularity, lim->physical_block_size); + else + lim->discard_granularity = 0; + if (!lim->max_discard_segments) lim->max_discard_segments = 1; - if (lim->discard_granularity < lim->physical_block_size) - lim->discard_granularity = lim->physical_block_size; - /* * By default there is no limit on the segment boundary alignment, * but if there is one it can't be smaller than the page size as @@ -423,10 +489,11 @@ int blk_set_default_limits(struct queue_limits *lim) { /* * Most defaults are set by capping the bounds in blk_validate_limits, - * but max_user_discard_sectors is special and needs an explicit - * initialization to the max value here. + * but these limits are special and need an explicit initialization to + * the max value here. */ lim->max_user_discard_sectors = UINT_MAX; + lim->max_user_wzeroes_unmap_sectors = UINT_MAX; return blk_validate_limits(lim); } @@ -594,41 +661,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, return true; } - -/* Check stacking of first bottom device */ -static bool blk_stack_atomic_writes_head(struct queue_limits *t, - struct queue_limits *b) +static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) - return false; + unsigned int chunk_bytes; - if (t->io_min <= SECTOR_SIZE) { - /* No chunk sectors, so use bottom device values directly */ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; - t->atomic_write_hw_max = b->atomic_write_hw_max; - return true; - } + if (!t->chunk_sectors) + return; + + /* + * If chunk sectors is so large that its value in bytes overflows + * UINT_MAX, then just shift it down so it definitely will fit. + * We don't support atomic writes of such a large size anyway. + */ + if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes)) + chunk_bytes = t->chunk_sectors; /* * Find values for limits which work for chunk size. * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk - * size (t->io_min), as chunk size is not restricted to a power-of-2. 
+ * size, as the chunk size is not restricted to a power-of-2. * So we need to find highest power-of-2 which works for the chunk * size. - * As an example scenario, we could have b->unit_max = 16K and - * t->io_min = 24K. For this case, reduce t->unit_max to a value - * aligned with both limits, i.e. 8K in this example. + * As an example scenario, we could have t->unit_max = 16K and + * t->chunk_sectors = 24KB. For this case, reduce t->unit_max to a + * value aligned with both limits, i.e. 8K in this example. */ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - while (t->io_min % t->atomic_write_hw_unit_max) - t->atomic_write_hw_unit_max /= 2; + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + max_pow_of_two_factor(chunk_bytes)); - t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min, t->atomic_write_hw_unit_max); - t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); + t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes); +} +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) +{ + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; + + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; return true; } @@ -656,6 +732,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_stack_atomic_writes_head(t, b)) goto unsupported; + blk_stack_atomic_writes_chunk_sectors(t); return; unsupported: @@ -713,6 +790,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); + t->max_user_wzeroes_unmap_sectors = + min(t->max_user_wzeroes_unmap_sectors, + b->max_user_wzeroes_unmap_sectors); + t->max_hw_wzeroes_unmap_sectors = + min(t->max_hw_wzeroes_unmap_sectors, + b->max_hw_wzeroes_unmap_sectors); + t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, b->max_hw_zone_append_sectors); @@ -784,7 +868,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } /* chunk_sectors a multiple of the physical block size? 
*/ - if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) { t->chunk_sectors = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; @@ -880,7 +964,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, return true; if (ti->flags & BLK_INTEGRITY_STACKED) { - if (ti->tuple_size != bi->tuple_size) + if (ti->metadata_size != bi->metadata_size) goto incompatible; if (ti->interval_exp != bi->interval_exp) goto incompatible; @@ -896,7 +980,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | (bi->flags & BLK_INTEGRITY_REF_TAG); ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; + ti->metadata_size = bi->metadata_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; ti->tag_size = bi->tag_size; diff --git a/block/blk-stat.c b/block/blk-stat.c index 46449da856f8..682a8ddb1173 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -76,7 +76,7 @@ void blk_stat_add(struct request *rq, u64 now) static void blk_stat_timer_fn(struct timer_list *t) { - struct blk_stat_callback *cb = from_timer(cb, t, timer); + struct blk_stat_callback *cb = timer_container_of(cb, t, timer); unsigned int bucket; int cpu; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1f9b45b0b9ee..4a7f1a349998 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -134,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) +QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) @@ -159,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) @@ -203,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk, return 0; } +static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) +{ + unsigned long max_zeroes_bytes, max_hw_zeroes_bytes; + ssize_t ret; + + ret = queue_var_store(&max_zeroes_bytes, page, count); + if (ret < 0) + return ret; + + max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT; + if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes) + return -EINVAL; + + lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT; + return 0; +} + static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) @@ -488,6 +510,8 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RO_ENTRY(queue_max_write_streams, 
"max_write_streams"); +QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); @@ -510,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors, + "write_zeroes_unmap_max_hw_bytes"); +QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors, + "write_zeroes_unmap_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); @@ -560,7 +588,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->elevator_lock); + mutex_lock(&disk->rqos_state_mutex); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -573,7 +601,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->elevator_lock); + mutex_unlock(&disk->rqos_state_mutex); return ret; } @@ -593,7 +621,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, return -EINVAL; memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -618,11 +645,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, */ blk_mq_quiesce_queue(q); + mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); out: - mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; @@ -642,6 +670,8 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, + &queue_max_write_streams_entry.attr, + &queue_write_stream_granularity_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -656,6 +686,8 @@ static struct attribute *queue_attrs[] = { &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, + &queue_max_hw_wzeroes_unmap_sectors_entry.attr, + &queue_max_wzeroes_unmap_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, @@ -815,7 +847,7 @@ static void blk_queue_release(struct kobject *kobj) /* nothing to do here, all data is associated with the parent gendisk */ } -static const struct kobj_type blk_queue_ktype = { +const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, @@ -843,15 +875,14 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; int ret; - kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto out_put_queue_kobj; + return ret; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) - goto out_put_queue_kobj; + goto out_del_queue_kobj; } mutex_lock(&q->sysfs_lock); @@ -869,18 +900,11 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_unregister_ia_ranges; - 
mutex_lock(&q->elevator_lock); - if (q->elevator) { - ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->elevator_lock); - goto out_crypto_sysfs_unregister; - } - } - wbt_enable_default(disk); - mutex_unlock(&q->elevator_lock); + if (queue_is_mq(q)) + elevator_set_default(q); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); @@ -902,8 +926,6 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_crypto_sysfs_unregister: - blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -911,8 +933,8 @@ out_debugfs_remove: mutex_unlock(&q->sysfs_lock); if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); -out_put_queue_kobj: - kobject_put(&disk->queue_kobj); +out_del_queue_kobj: + kobject_del(&disk->queue_kobj); return ret; } @@ -951,10 +973,6 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->elevator_lock); - elv_unregister_queue(q); - mutex_unlock(&q->elevator_lock); - mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); @@ -963,5 +981,8 @@ void blk_unregister_queue(struct gendisk *disk) kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); + if (queue_is_mq(q)) + elevator_set_none(q); + blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d6dd2e047874..397b6a410f9e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio) static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); - bio_list_init(&qn->bios); + bio_list_init(&qn->bios_bps); + bio_list_init(&qn->bios_iops); qn->tg = tg; } @@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to - * @queued: the service_queue->queued[] list @qn belongs to + * @sq: the service_queue @qn belongs to * - * Add @bio to @qn and put @qn on @queued if it's not already on. + * Add @bio to @qn and put @qn on @sq->queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, - struct list_head *queued) + struct throtl_service_queue *sq) { - bio_list_add(&qn->bios, bio); + bool rw = bio_data_dir(bio); + + /* + * Split bios have already been throttled by bps, so they are + * directly queued into the iops path. + */ + if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) || + bio_flagged(bio, BIO_BPS_THROTTLED)) { + bio_list_add(&qn->bios_iops, bio); + sq->nr_queued_iops[rw]++; + } else { + bio_list_add(&qn->bios_bps, bio); + sq->nr_queued_bps[rw]++; + } + if (list_empty(&qn->node)) { - list_add_tail(&qn->node, queued); + list_add_tail(&qn->node, &sq->queued[rw]); blkg_get(tg_to_blkg(qn->tg)); } } @@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek + * + * Always take a bio from the head of the iops queue first. 
If the queue is + * empty, we then take it from the bps queue to maintain the overall idea of + * fetching bios from the head. */ static struct bio *throtl_peek_queued(struct list_head *queued) { @@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_peek(&qn->bios); + bio = bio_list_peek(&qn->bios_iops); + if (!bio) + bio = bio_list_peek(&qn->bios_bps); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list - * @queued: the qnode list to pop a bio from + * @sq: the service_queue to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put + * @rw: read/write * - * Pop the first bio from the qnode list @queued. After popping, the first - * qnode is removed from @queued if empty or moved to the end of @queued so - * that the popping order is round-robin. + * Pop the first bio from the qnode list @sq->queued. Note that we firstly + * focus on the iops list because bios are ultimately dispatched from it. + * After popping, the first qnode is removed from @sq->queued if empty or moved + * to the end of @sq->queued so that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ -static struct bio *throtl_pop_queued(struct list_head *queued, - struct throtl_grp **tg_to_put) +static struct bio *throtl_pop_queued(struct throtl_service_queue *sq, + struct throtl_grp **tg_to_put, bool rw) { + struct list_head *queued = &sq->queued[rw]; struct throtl_qnode *qn; struct bio *bio; @@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued, return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_pop(&qn->bios); + bio = bio_list_pop(&qn->bios_iops); + if (bio) { + sq->nr_queued_iops[rw]--; + } else { + bio = bio_list_pop(&qn->bios_bps); + if (bio) + sq->nr_queued_bps[rw]--; + } WARN_ON_ONCE(!bio); - if (bio_list_empty(&qn->bios)) { + if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; @@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { + if (!time_before(tg->slice_end[rw], jiffy_end)) + return; + throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", @@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } +static unsigned int sq_queued(struct throtl_service_queue *sq, int type) +{ + return sq->nr_queued_bps[type] + sq->nr_queued_iops[type]; +} + static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { @@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } +static long long throtl_trim_bps(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u64 bps_limit = tg_bps_limit(tg, rw); + long long bytes_trim; + + if (bps_limit == U64_MAX) + return 0; + + /* Need to consider the case of bytes_allowed overflow. 
*/ + bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed); + if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) { + bytes_trim = tg->bytes_disp[rw]; + tg->bytes_disp[rw] = 0; + } else { + tg->bytes_disp[rw] -= bytes_trim; + } + + return bytes_trim; +} + +static int throtl_trim_iops(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u32 iops_limit = tg_iops_limit(tg, rw); + int io_trim; + + if (iops_limit == UINT_MAX) + return 0; + + /* Need to consider the case of io_allowed overflow. */ + io_trim = calculate_io_allowed(iops_limit, time_elapsed); + if (io_trim <= 0 || tg->io_disp[rw] < io_trim) { + io_trim = tg->io_disp[rw]; + tg->io_disp[rw] = 0; + } else { + tg->io_disp[rw] -= io_trim; + } + + return io_trim; +} + /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { @@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * one extra slice is preserved for deviation. */ time_elapsed -= tg->td->throtl_slice; - bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed); - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); - if (bytes_trim <= 0 && io_trim <= 0) + bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); + io_trim = throtl_trim_iops(tg, rw, time_elapsed); + if (!bytes_trim && !io_trim) return; - if ((long long)tg->bytes_disp[rw] >= bytes_trim) - tg->bytes_disp[rw] -= bytes_trim; - else - tg->bytes_disp[rw] = 0; - - if ((int)tg->io_disp[rw] >= io_trim) - tg->io_disp[rw] -= io_trim; - else - tg->io_disp[rw] = 0; - tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, @@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw, unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); + long long bytes_allowed; + int io_allowed; + + /* + * If the queue is empty, carryover handling is not needed. In such cases, + * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch + * of subsequent bios. The same handling applies when the previous BPS/IOPS + * limit was set to max. + */ + if (sq_queued(&tg->service_queue, rw) == 0) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + return; + } /* * If config is updated while bios are still throttled, calculate and - * accumulate how many bytes/ios are waited across changes. And - * carryover_bytes/ios will be used to calculate new wait time under new - * configuration. + * accumulate how many bytes/ios are waited across changes. And use the + * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which + * will be used to calculate new wait time under new configuration. + * And we need to consider the case of bytes/io_allowed overflow. 
*/ - if (bps_limit != U64_MAX) - *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - - tg->bytes_disp[rw]; - if (iops_limit != UINT_MAX) - *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - - tg->io_disp[rw]; - tg->bytes_disp[rw] -= *bytes; - tg->io_disp[rw] -= *ios; + if (bps_limit != U64_MAX) { + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); + if (bytes_allowed > 0) + *bytes = bytes_allowed - tg->bytes_disp[rw]; + } + if (iops_limit != UINT_MAX) { + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); + if (io_allowed > 0) + *ios = io_allowed - tg->io_disp[rw]; + } + + tg->bytes_disp[rw] = -*bytes; + tg->io_disp[rw] = -*ios; } static void tg_update_carryover(struct throtl_grp *tg) @@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg) long long bytes[2] = {0}; int ios[2] = {0}; - if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); - if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); - /* see comments in struct throtl_grp for meaning of these fields. */ + /* see comments in struct throtl_grp for meaning of carryover. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } @@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - if (iops_limit == UINT_MAX) { - return 0; - } - jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ @@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { - return 0; - } - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ @@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); - if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + /* Need to consider the case of bytes_allowed overflow. */ + if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + || bytes_allowed < 0) return 0; /* Calc approx time to dispatch */ @@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, return jiffy_wait; } +static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) +{ + unsigned int bio_size = throtl_bio_data_size(bio); + + /* Charge the bio to the group */ + if (!bio_flagged(bio, BIO_BPS_THROTTLED) && + !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { + bio_set_flag(bio, BIO_TG_BPS_THROTTLED); + tg->bytes_disp[bio_data_dir(bio)] += bio_size; + } +} + +static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) +{ + bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); + tg->io_disp[bio_data_dir(bio)]++; +} + /* - * Returns whether one can dispatch a bio or not. 
Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched + * If previous slice expired, start a new one otherwise renew/extend existing + * slice to make sure it is at least throtl_slice interval long since now. New + * slice is started only for empty throttle group. If there is queued bio, that + * means there should be an active slice and it should be extended instead. */ -static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) +static void tg_update_slice(struct throtl_grp *tg, bool rw) +{ + if (throtl_slice_used(tg, rw) && + sq_queued(&tg->service_queue, rw) == 0) + throtl_start_new_slice(tg, rw, true); + else + throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); +} + +static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); + unsigned long bps_wait; + + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || + bio_flagged(bio, BIO_BPS_THROTTLED) || + bio_flagged(bio, BIO_TG_BPS_THROTTLED)) + return 0; + + tg_update_slice(tg, rw); + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + throtl_extend_slice(tg, rw, jiffies + bps_wait); + + return bps_wait; +} + +static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); u32 iops_limit = tg_iops_limit(tg, rw); + unsigned long iops_wait; + + if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING) + return 0; + + tg_update_slice(tg, rw); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + throtl_extend_slice(tg, rw, jiffies + iops_wait); + + return iops_wait; +} + +/* + * Returns approx number of jiffies to wait before this bio is with-in IO rate + * and can be moved to other queue or dispatched. + */ +static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + unsigned long wait; /* * Currently whole state machine of group depends on first bio @@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * this function with a different bio if there are other bios * queued. */ - BUG_ON(tg->service_queue.nr_queued[rw] && + BUG_ON(sq_queued(&tg->service_queue, rw) && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); - /* If tg->bps = -1, then BW is unlimited */ - if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || - tg->flags & THROTL_TG_CANCELING) { - if (wait) - *wait = 0; - return true; - } + wait = tg_dispatch_bps_time(tg, bio); + if (wait != 0) + return wait; /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. New slice is started only for empty throttle group. - * If there is queued bio, that means there should be an active - * slice and it should be extended instead. + * Charge bps here because @bio will be directly placed into the + * iops queue afterward. 
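+ * The BIO_TG_BPS_THROTTLED flag set by throtl_charge_bps_bio() makes a + * later tg_dispatch_bps_time() call return 0, and throtl_charge_iops_bio() + * clears it again, so the bytes are never charged twice.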
*/ - if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw, true); - else { - if (time_before(tg->slice_end[rw], - jiffies + tg->td->throtl_slice)) - throtl_extend_slice(tg, rw, - jiffies + tg->td->throtl_slice); - } - - bps_wait = tg_within_bps_limit(tg, bio, bps_limit); - iops_wait = tg_within_iops_limit(tg, bio, iops_limit); - if (bps_wait + iops_wait == 0) { - if (wait) - *wait = 0; - return true; - } - - max_wait = max(bps_wait, iops_wait); - - if (wait) - *wait = max_wait; - - if (time_before(tg->slice_end[rw], jiffies + max_wait)) - throtl_extend_slice(tg, rw, jiffies + max_wait); - - return false; -} - -static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) -{ - bool rw = bio_data_dir(bio); - unsigned int bio_size = throtl_bio_data_size(bio); + throtl_charge_bps_bio(tg, bio); - /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->bytes_disp[rw] += bio_size; - - tg->io_disp[rw]++; + return tg_dispatch_iops_time(tg, bio); } /** @@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ - if (!sq->nr_queued[rw]) + if (sq_queued(sq, rw) == 0) tg->flags |= THROTL_TG_WAS_EMPTY; - throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + throtl_qnode_add_bio(bio, qn, sq); + + /* + * Since we have split the queues, when the iops queue is + * previously empty and a new @bio is added into the first @qn, + * we also need to update the @tg->disptime. + */ + if (bio_flagged(bio, BIO_BPS_THROTTLED) && + bio == throtl_peek_queued(&sq->queued[rw])) + tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; - sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; - unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + unsigned long read_wait = -1, write_wait = -1, min_wait, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) - tg_may_dispatch(tg, bio, &read_wait); + read_wait = tg_dispatch_time(tg, bio); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) - tg_may_dispatch(tg, bio, &write_wait); + write_wait = tg_dispatch_time(tg, bio); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; @@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg) /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; + tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, @@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. 
*/ - bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); - sq->nr_queued[rw]--; + bio = throtl_pop_queued(sq, &tg_to_put, rw); - throtl_charge_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to @@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], - &parent_sq->queued[rw]); + parent_sq); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } @@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, READ); nr_reads++; @@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; @@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) + if (sq_queued(sq, READ) || sq_queued(sq, WRITE)) tg_update_disptime(tg); else throtl_dequeue_tg(tg); @@ -1013,7 +1125,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) */ static void throtl_pending_timer_fn(struct timer_list *t) { - struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); + struct throtl_service_queue *sq = timer_container_of(sq, t, + pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; @@ -1037,9 +1150,11 @@ again: dispatched = false; while (true) { + unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ); + unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE); + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", - sq->nr_queued[READ] + sq->nr_queued[WRITE], - sq->nr_queued[READ], sq->nr_queued[WRITE]); + bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w); ret = throtl_select_dispatch(sq); if (ret) { @@ -1061,7 +1176,8 @@ again: if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ @@ -1101,7 +1217,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) - while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + while ((bio = throtl_pop_queued(td_sq, NULL, rw))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); @@ -1606,11 +1722,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk) static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { - /* throtl is FIFO - if bios are already queued, should queue */ - if (tg->service_queue.nr_queued[rw]) + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * For a split bio, we need to specifically distinguish whether the + * iops queue is empty. 
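+ * Such a bio has already had its bytes accounted, so only the iops budget + * matters: it can be dispatched directly only when no other bio waits in + * the iops queue and the iops limit allows it.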
+ */ + if (bio_flagged(bio, BIO_BPS_THROTTLED)) + return sq->nr_queued_iops[rw] == 0 && + tg_dispatch_iops_time(tg, bio) == 0; + + /* + * Throtl is FIFO - if bios are already queued, should queue. + * If the bps queue is empty and @bio is within the bps limit, charge + * bps here for direct placement into the iops queue. + */ + if (sq_queued(&tg->service_queue, rw)) { + if (sq->nr_queued_bps[rw] == 0 && + tg_dispatch_bps_time(tg, bio) == 0) + throtl_charge_bps_bio(tg, bio); + return false; + } - return tg_may_dispatch(tg, bio, NULL); + return tg_dispatch_time(tg, bio) == 0; } bool __blk_throtl_bio(struct bio *bio) @@ -1631,7 +1766,7 @@ bool __blk_throtl_bio(struct bio *bio) while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); /* * We need to trim slice even when bios are not being @@ -1654,7 +1789,8 @@ bool __blk_throtl_bio(struct bio *bio) * control algorithm is adaptive, and extra IO bytes * will be throttled for paying the debt */ - throtl_charge_bio(tg, bio); + throtl_charge_bps_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); } else { /* if above limits, break to queue */ break; @@ -1680,7 +1816,7 @@ bool __blk_throtl_bio(struct bio *bio) tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), - sq->nr_queued[READ], sq->nr_queued[WRITE]); + sq_queued(sq, READ), sq_queued(sq, WRITE)); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); @@ -1688,11 +1824,13 @@ bool __blk_throtl_bio(struct bio *bio) /* * Update @tg's dispatch time and force schedule dispatch if @tg - * was empty before @bio. The forced scheduling isn't likely to - * cause undue delay as @bio is likely to be dispatched directly if - * its @tg's disptime is not in the future. + * was empty before @bio, or the iops queue is empty and @bio will + * add to. The forced scheduling isn't likely to cause undue + * delay as @bio is likely to be dispatched directly if its @tg's + * disptime is not in the future. */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } diff --git a/block/blk-throttle.h b/block/blk-throttle.h index f9f8666891ab..3b27755bfbff 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -29,7 +29,8 @@ */ struct throtl_qnode { struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ + struct bio_list bios_bps; /* queued bios for bps limit */ + struct bio_list bios_iops; /* queued bios for iops limit */ struct throtl_grp *tg; /* tg this qnode belongs to */ }; @@ -41,7 +42,8 @@ struct throtl_service_queue { * children throtl_grp's. 
*/ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ + unsigned int nr_queued_bps[2]; /* number of queued bps bios */ + unsigned int nr_queued_iops[2]; /* number of queued iops bios */ /* * RB tree of active children throtl_grp's, which are sorted by @@ -54,9 +56,14 @@ struct throtl_service_queue { }; enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + /* + * The sq's iops queue is empty, and a bio is about to be enqueued + * to the first qnode's bios_iops list. + */ + THROTL_TG_IOPS_WAS_EMPTY = 1 << 2, + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; struct throtl_grp { @@ -102,19 +109,16 @@ struct throtl_grp { /* IOPS limits */ unsigned int iops[2]; - /* Number of bytes dispatched in current slice */ - int64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - int io_disp[2]; - /* - * The following two fields are updated when new configuration is - * submitted while some bios are still throttled, they record how many - * bytes/ios are waited already in previous configuration, and they will - * be used to calculate wait time under new configuration. + * Number of bytes/bio's dispatched in current slice. + * When new configuration is submitted while some bios are still throttled, + * first calculate the carryover: the amount of bytes/IOs already waited + * under the previous configuration. Then, [bytes/io]_disp are represented + * as the negative of the carryover, and they will be used to calculate the + * wait time under the new configuration. 
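+ * For example, if 800 bytes were allowed under the old limit but only 500 + * were dispatched while bios sat queued, the carryover is 300 and + * bytes_disp becomes -300: the first 300 bytes under the new limit are + * effectively already paid for.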
*/ - long long carryover_bytes[2]; - int carryover_ios[2]; + int64_t bytes_disp[2]; + int io_disp[2]; unsigned long last_check_time; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f1754d07f7e0..eb8037bae0bd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -37,7 +37,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_SWAP = 4, /* write, from swap_writepage() */ + WBT_SWAP = 4, /* write, from swap_writeout() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ @@ -85,8 +85,8 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ + unsigned long last_issue; /* issue time of last read rq */ + unsigned long last_comp; /* completion time of last read rq */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; @@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq) struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { - if (rwb->sync_cookie == rq) { - rwb->sync_issue = 0; - rwb->sync_cookie = NULL; - } + if (wbt_is_read(rq)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } - if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); + } } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); @@ -704,8 +705,9 @@ void wbt_enable_default(struct gendisk *disk) struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); - if (q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + mutex_lock(&disk->rqos_state_mutex); + + if (blk_queue_disable_wbt(q)) enable = false; /* Throttling already enabled? */ @@ -713,8 +715,10 @@ void wbt_enable_default(struct gendisk *disk) if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + mutex_unlock(&disk->rqos_state_mutex); return; } + mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) @@ -774,11 +778,13 @@ void wbt_disable_default(struct gendisk *disk) struct rq_wb *rwb; if (!rqos) return; + mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } + mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8f15d1aa6eb8..ef43aaca49f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -17,6 +17,8 @@ #include <linux/refcount.h> #include <linux/mempool.h> +#include <trace/events/block.h> + #include "blk.h" #include "blk-mq-sched.h" #include "blk-mq-debugfs.h" @@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev) struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); + trace_blkdev_zone_mgmt(&bio, 0); return submit_bio_wait(&bio); } @@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, cond_resched(); } + trace_blkdev_zone_mgmt(bio, nr_sectors); ret = submit_bio_wait(bio); bio_put(bio); @@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, * at the tail of the list to preserve the sequential write order. 
*/ bio_list_add(&zwplug->bio_list, bio); + trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; @@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; - if (!bdev->bd_disk->zone_wplugs_hash) - return false; - - /* - * If the BIO already has the plugging flag set, then it was already - * handled through this path and this is a submission from the zone - * plug bio submit work. - */ - if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) - return false; - - /* - * We do not need to do anything special for empty flush BIOs, e.g - * BIOs such as issued by blkdev_issue_flush(). The is because it is - * the responsibility of the user to first wait for the completion of - * write operations for flush to have any effect on the persistence of - * the written data. - */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* @@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_unlock_irqrestore(&zwplug->lock, flags); } +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + bio->bi_iter.bi_sector = rq->__sector; + trace_blk_zone_append_update_request_bio(rq); +} + void blk_zone_write_plug_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -1225,6 +1227,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= REQ_OP_ZONE_APPEND; + bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); } /* @@ -1298,6 +1301,9 @@ again: goto put_zwplug; } + trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { blk_zone_wplug_bio_io_error(zwplug, bio); goto again; @@ -1306,7 +1312,6 @@ again: spin_unlock_irqrestore(&zwplug->lock, flags); bdev = bio->bi_bdev; - submit_bio_noacct_nocheck(bio); /* * blk-mq devices will reuse the extra reference on the request queue @@ -1314,8 +1319,12 @@ again: * path for BIO-based devices will not do that. So drop this extra * reference here. */ - if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) + if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { + bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); + } else { + blk_mq_submit_bio(bio); + } put_zwplug: /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ diff --git a/block/blk.h b/block/blk.h index 594eeba7b949..46f566f9b126 100644 --- a/block/blk.h +++ b/block/blk.h @@ -12,6 +12,16 @@ #include "blk-crypto-internal.h" struct elevator_type; +struct elevator_tags; + +/* + * Default upper limit for the software max_sectors limit used for regular I/Os. + * This can be increased through sysfs. + * + * This should not be confused with the max_hw_sector limit that is entirely + * controlled by the block device driver, usually based on hardware limits. 
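+ * With SECTOR_SHIFT == 9, SZ_4M >> SECTOR_SHIFT is 8192 sectors, i.e. a + * 4 MiB default for max_sectors.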
+ */ +#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT) #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 @@ -19,6 +29,7 @@ struct elevator_type; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +extern const struct kobj_type blk_queue_ktype; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { @@ -103,8 +114,7 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page); + struct page *page, unsigned len, unsigned offset); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) @@ -322,11 +332,10 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -int elevator_switch(struct request_queue *q, struct elevator_type *new_e); -void elevator_disable(struct request_queue *q); -void elevator_exit(struct request_queue *q); -int elv_register_queue(struct request_queue *q, bool uevent); -void elv_unregister_queue(struct request_queue *q); +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t); +void elevator_set_default(struct request_queue *q); +void elevator_set_none(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -407,6 +416,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, } } +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @paddr: address of the range to add + * @len: maximum length available to add at @paddr + * + * Returns the maximum number of bytes of the range starting at @paddr that can + * be added to a single segment. + */ +static inline unsigned get_max_segment_size(const struct queue_limits *lim, + phys_addr_t paddr, unsigned int len) +{ + /* + * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. 
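+ * For example, with a 64K boundary (seg_boundary_mask == 0xffff) and a + * physical address ending in 0xf000, mask - (mask & paddr) is 0x0fff, so + * at most 0x1000 bytes are returned for this segment.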
+ */ + return min_t(unsigned long, len, + min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), + (unsigned long)lim->max_segment_size - 1) + 1); +} + int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -421,7 +451,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); -unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { @@ -443,23 +472,6 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); - -static inline bool blk_queue_may_bounce(struct request_queue *q) -{ - return IS_ENABLED(CONFIG_BOUNCE) && - (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && - max_low_pfn >= max_pfn; -} - -static inline struct bio *blk_queue_bounce(struct bio *bio, - struct request_queue *q) -{ - if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) - return __blk_queue_bounce(bio, q); - return bio; -} - #ifdef CONFIG_BLK_DEV_ZONED void disk_init_zone_resources(struct gendisk *disk); void disk_free_zone_resources(struct gendisk *disk); @@ -467,23 +479,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } -void blk_zone_write_plug_bio_merged(struct bio *bio); -void blk_zone_write_plug_init_request(struct request *rq); -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline bool blk_req_bio_is_zone_append(struct request *rq, + struct bio *bio) { - /* - * For zone append requests, the request sector indicates the location - * at which the BIO data was written. Return this value to the BIO - * issuer through the BIO iter sector. - * For plugged zone writes, which include emulated zone append, we need - * the original BIO sector so that blk_zone_write_plug_bio_endio() can - * lookup the zone write plug. 
- */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || - bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) - bio->bi_iter.bi_sector = rq->__sector; + return req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { @@ -516,14 +520,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } +static inline bool blk_req_bio_is_zone_append(struct request *req, + struct bio *bio) +{ + return false; +} static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline void blk_zone_append_update_request_bio(struct request *rq, + struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) diff --git a/block/bounce.c b/block/bounce.c deleted file mode 100644 index 09a9616cf209..000000000000 --- a/block/bounce.c +++ /dev/null @@ -1,267 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/gfp.h> -#include <linux/bio-integrity.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <linux/printk.h> -#include <asm/tlbflush.h> - -#include <trace/events/block.h> -#include "blk.h" -#include "blk-cgroup.h" - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool; - -static void init_bounce_bioset(void) -{ - static bool bounce_bs_setup; - int ret; - - if (bounce_bs_setup) - return; - - ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - BUG_ON(ret); - - ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); - BUG_ON(ret); - bounce_bs_setup = true; -} - -static __init int init_emergency_pool(void) -{ - int ret; - -#ifndef CONFIG_MEMORY_HOTPLUG - if (max_pfn <= max_low_pfn) - return 0; -#endif - - ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); - BUG_ON(ret); - pr_info("pool size: %d pages\n", POOL_SIZE); - - init_bounce_bioset(); - return 0; -} - -__initcall(init_emergency_pool); - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - struct bio_vec tovec, fromvec; - struct bvec_iter iter; - /* - * The bio of @from is created by bounce, so we can iterate - * its bvec from start to end, but the @from->bi_iter can't be - * trusted because it might be changed by splitting. 
- */ - struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; - - bio_for_each_segment(tovec, to, iter) { - fromvec = bio_iter_iovec(from, from_iter); - if (tovec.bv_page != fromvec.bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + - tovec.bv_offset); - } - bio_advance_iter(from, &from_iter, tovec.bv_len); - } -} - -static void bounce_end_io(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, orig_vec; - struct bvec_iter orig_iter = bio_orig->bi_iter; - struct bvec_iter_all iter_all; - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, iter_all) { - orig_vec = bio_iter_iovec(bio_orig, orig_iter); - if (bvec->bv_page != orig_vec.bv_page) { - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, &page_pool); - } - bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); - } - - bio_orig->bi_status = bio->bi_status; - bio_endio(bio_orig); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio) -{ - bounce_end_io(bio); -} - -static void bounce_end_io_read(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - - if (!bio->bi_status) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio); -} - -static struct bio *bounce_clone_bio(struct bio *bio_src) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_VECS biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_VECS. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work. 
- */ - bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), - bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); - bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - bio_clone_blkg_association(bio, bio_src); - - return bio; - -err_put: - bio_put(bio); - return NULL; -} - -struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q) -{ - struct bio *bio; - int rw = bio_data_dir(bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i = 0, bytes = 0; - bool bounce = false; - int sectors; - - bio_for_each_segment(from, bio_orig, iter) { - if (i++ < BIO_MAX_VECS) - bytes += from.bv_len; - if (PageHighMem(from.bv_page)) - bounce = true; - } - if (!bounce) - return bio_orig; - - /* - * Individual bvecs might not be logical block aligned. Round down - * the split size so that each bio is properly block size aligned, - * even if we do not use the full hardware limits. - */ - sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> - SECTOR_SHIFT; - if (sectors < bio_sectors(bio_orig)) { - bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split); - bio_chain(bio, bio_orig); - submit_bio_noacct(bio_orig); - bio_orig = bio; - } - bio = bounce_clone_bio(bio_orig); - - /* - * Bvec table can't be updated by bio_for_each_segment_all(), - * so retrieve bvec from the table directly. This way is safe - * because the 'bio' is single-page bvec. 
- */ - for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *bounce_page; - - if (!PageHighMem(to->bv_page)) - continue; - - bounce_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(bounce_page, NR_BOUNCE); - - if (rw == WRITE) { - flush_dcache_page(to->bv_page); - memcpy_from_bvec(page_address(bounce_page), to); - } - to->bv_page = bounce_page; - } - - trace_block_bio_bounce(bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - else - bio->bi_end_io = bounce_end_io_write; - - bio->bi_private = bio_orig; - return bio; -} diff --git a/block/elevator.c b/block/elevator.c index b4d08026b02c..fe96c6f4753c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -45,6 +45,19 @@ #include "blk-wbt.h" #include "blk-cgroup.h" +/* Holding context data for changing elevator */ +struct elv_change_ctx { + const char *name; + bool no_uevent; + + /* for unregistering old elevator */ + struct elevator_queue *old; + /* for registering new elevator */ + struct elevator_queue *new; + /* holds sched tags data */ + struct elevator_tags *et; +}; + static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -121,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name) static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, - struct elevator_type *e) + struct elevator_type *e, struct elevator_tags *et) { struct elevator_queue *eq; @@ -134,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->et = et; return eq; } -EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { @@ -148,18 +161,17 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q) +static void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->elevator_lock); + ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); - - kobject_put(&e->kobj); } static inline void __elv_rqhash_del(struct request *rq) @@ -412,14 +424,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->show) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? entry->show(e, page) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->show(e, page); mutex_unlock(&e->sysfs_lock); return error; } @@ -430,14 +443,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->store) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? 
entry->store(e, page, length) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->store(e, page, length); mutex_unlock(&e->sysfs_lock); return error; } @@ -452,13 +466,12 @@ static const struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q, bool uevent) +static int elv_register_queue(struct request_queue *q, + struct elevator_queue *e, + bool uevent) { - struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->elevator_lock); - error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { const struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -472,20 +485,25 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); + /* + * Sched is initialized, it is ready to export it via + * debugfs + */ + blk_mq_sched_reg_debugfs(q); set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } -void elv_unregister_queue(struct request_queue *q) +static void elv_unregister_queue(struct request_queue *q, + struct elevator_queue *e) { - struct elevator_queue *e = q->elevator; - - lockdep_assert_held(&q->elevator_lock); - if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + /* unexport via debugfs before exiting sched */ + blk_mq_sched_unreg_debugfs(q); } } @@ -548,159 +566,224 @@ void elv_unregister(struct elevator_type *e) EXPORT_SYMBOL_GPL(elv_unregister); /* - * For single queue devices, default to using mq-deadline. If we have multiple - * queues or mq-deadline is not available, default to "none". - */ -static struct elevator_type *elevator_get_default(struct request_queue *q) -{ - if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) - return NULL; - - if (q->nr_hw_queues != 1 && - !blk_mq_is_shared_tags(q->tag_set->flags)) - return NULL; - - return elevator_find_get("mq-deadline"); -} - -/* - * Use the default elevator settings. If the chosen elevator initialization - * fails, fall back to the "none" elevator (no elevator). - */ -void elevator_init_mq(struct request_queue *q) -{ - struct elevator_type *e; - unsigned int memflags; - int err; - - WARN_ON_ONCE(blk_queue_registered(q)); - - if (unlikely(q->elevator)) - return; - - e = elevator_get_default(q); - if (!e) - return; - - /* - * We are called before adding disk, when there isn't any FS I/O, - * so freezing queue plus canceling dispatch work is enough to - * drain any dispatch activities originated from passthrough - * requests, then no need to quiesce queue which may add long boot - * latency, especially when lots of disks are involved. - * - * Disk isn't added yet, so verifying queue lock only manually. - */ - memflags = blk_mq_freeze_queue(q); - - blk_mq_cancel_work_sync(q); - - err = blk_mq_init_sched(q, e); - - blk_mq_unfreeze_queue(q, memflags); - - if (err) { - pr_warn("\"%s\" elevator initialization failed, " - "falling back to \"none\"\n", e->elevator_name); - } - - elevator_put(e); -} - -/* * Switch to new_e io scheduler. * * If switching fails, we are most likely running out of memory and not able * to restore the old io scheduler, so leaving the io scheduler being none. 
*/ -int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) { - unsigned int memflags; - int ret; + struct elevator_type *new_e = NULL; + int ret = 0; + WARN_ON_ONCE(q->mq_freeze_depth == 0); lockdep_assert_held(&q->elevator_lock); - memflags = blk_mq_freeze_queue(q); + if (strncmp(ctx->name, "none", 4)) { + new_e = elevator_find_get(ctx->name); + if (!new_e) + return -EINVAL; + } + blk_mq_quiesce_queue(q); if (q->elevator) { - elv_unregister_queue(q); + ctx->old = q->elevator; elevator_exit(q); } - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out_unfreeze; - - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out_unfreeze; + if (new_e) { + ret = blk_mq_init_sched(q, new_e, ctx->et); + if (ret) + goto out_unfreeze; + ctx->new = q->elevator; + } else { + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", ctx->name); out_unfreeze: blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q, memflags); if (ret) { pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", new_e->elevator_name); } + if (new_e) + elevator_put(new_e); return ret; } -void elevator_disable(struct request_queue *q) +static void elv_exit_and_release(struct request_queue *q) { - unsigned int memflags; - - lockdep_assert_held(&q->elevator_lock); + struct elevator_queue *e; + unsigned memflags; memflags = blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - elv_unregister_queue(q); + mutex_lock(&q->elevator_lock); + e = q->elevator; elevator_exit(q); - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - blk_add_trace_msg(q, "elv switch: none"); - - blk_mq_unquiesce_queue(q); + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + if (e) { + blk_mq_free_sched_tags(e->et, q->tag_set); + kobject_put(&e->kobj); + } +} + +static int elevator_change_done(struct request_queue *q, + struct elv_change_ctx *ctx) +{ + int ret = 0; + + if (ctx->old) { + bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, + &ctx->old->flags); + + elv_unregister_queue(q, ctx->old); + blk_mq_free_sched_tags(ctx->old->et, q->tag_set); + kobject_put(&ctx->old->kobj); + if (enable_wbt) + wbt_enable_default(q->disk); + } + if (ctx->new) { + ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); + if (ret) + elv_exit_and_release(q); + } + return ret; } /* * Switch this queue to the given IO scheduler. 
*/ -static int elevator_change(struct request_queue *q, const char *elevator_name) +static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) { - struct elevator_type *e; - int ret; + unsigned int memflags; + struct blk_mq_tag_set *set = q->tag_set; + int ret = 0; - /* Make sure queue is not in the middle of being removed */ - if (!blk_queue_registered(q)) - return -ENOENT; + lockdep_assert_held(&set->update_nr_hwq_lock); - if (!strncmp(elevator_name, "none", 4)) { - if (q->elevator) - elevator_disable(q); - return 0; + if (strncmp(ctx->name, "none", 4)) { + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + if (!ctx->et) + return -ENOMEM; } - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) - return 0; + memflags = blk_mq_freeze_queue(q); + /* + * May be called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + * + * Disk isn't added yet, so verifying queue lock only manually. + */ + blk_mq_cancel_work_sync(q); + mutex_lock(&q->elevator_lock); + if (!(q->elevator && elevator_match(q->elevator->type, ctx->name))) + ret = elevator_switch(q, ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + if (!ret) + ret = elevator_change_done(q, ctx); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (ctx->et && !ctx->new) + blk_mq_free_sched_tags(ctx->et, set); + + return ret; +} + +/* + * The I/O scheduler depends on the number of hardware queues, this forces a + * reattachment when nr_hw_queues changes. + */ +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct elv_change_ctx ctx = {}; + int ret = -ENODEV; + + WARN_ON_ONCE(q->mq_freeze_depth == 0); + + if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { + ctx.name = e->elevator_name; + ctx.et = t; - e = elevator_find_get(elevator_name); + mutex_lock(&q->elevator_lock); + /* force to reattach elevator after nr_hw_queue is updated */ + ret = elevator_switch(q, &ctx); + mutex_unlock(&q->elevator_lock); + } + blk_mq_unfreeze_queue_nomemrestore(q); + if (!ret) + WARN_ON_ONCE(elevator_change_done(q, &ctx)); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (t && !ctx.new) + blk_mq_free_sched_tags(t, set); +} + +/* + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). + */ +void elevator_set_default(struct request_queue *q) +{ + struct elv_change_ctx ctx = { + .name = "mq-deadline", + .no_uevent = true, + }; + int err; + struct elevator_type *e; + + /* now we allow to switch elevator */ + blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); + + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return; + + /* + * For single queue devices, default to using mq-deadline. If we + * have multiple queues or mq-deadline is not available, default + * to "none". 
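+ * A tag set shared across hardware queues takes this path as well, via + * the blk_mq_is_shared_tags() check below.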
+ */ + e = elevator_find_get(ctx.name); if (!e) - return -EINVAL; - ret = elevator_switch(q, e); + return; + + if ((q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(q->tag_set->flags))) { + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", + ctx.name, err); + } elevator_put(e); - return ret; } -static void elv_iosched_load_module(char *elevator_name) +void elevator_set_none(struct request_queue *q) +{ + struct elv_change_ctx ctx = { + .name = "none", + }; + int err; + + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("%s: set none elevator failed %d\n", __func__, err); +} + +static void elv_iosched_load_module(const char *elevator_name) { struct elevator_type *found; @@ -716,10 +799,14 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; - char *name; + struct elv_change_ctx ctx = {}; int ret; - unsigned int memflags; struct request_queue *q = disk->queue; + struct blk_mq_tag_set *set = q->tag_set; + + /* Make sure queue is not in the middle of being removed */ + if (!blk_queue_registered(q)) + return -ENOENT; /* * If the attribute needs to load a module, do it before freezing the @@ -727,24 +814,25 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, * queue is the one for the device storing the module file. */ strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); + ctx.name = strstrip(elevator_name); - elv_iosched_load_module(name); + elv_iosched_load_module(ctx.name); - memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); - ret = elevator_change(q, name); - if (!ret) - ret = count; - mutex_unlock(&q->elevator_lock); - blk_mq_unfreeze_queue(q, memflags); + down_read(&set->update_nr_hwq_lock); + if (!blk_queue_no_elv_switch(q)) { + ret = elevator_change(q, &ctx); + if (!ret) + ret = count; + } else { + ret = -ENOENT; + } + up_read(&set->update_nr_hwq_lock); return ret; } ssize_t elv_iosched_show(struct gendisk *disk, char *name) { struct request_queue *q = disk->queue; - struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; @@ -753,7 +841,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); - cur = eq->type; + cur = q->elevator->type; } spin_lock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index e4e44dfac503..adc5c157e17e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -23,8 +23,17 @@ enum elv_merge { struct blk_mq_alloc_data; struct blk_mq_hw_ctx; +struct elevator_tags { + /* num. 
of hardware queues for which tags are allocated */ + unsigned int nr_hw_queues; + /* depth used while allocating tags */ + unsigned int nr_requests; + /* shared tag is stored at index 0 */ + struct blk_mq_tags *tags[]; +}; + struct elevator_mq_ops { - int (*init_sched)(struct request_queue *, struct elevator_type *); + int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); @@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); struct elevator_queue { struct elevator_type *type; + struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; @@ -121,7 +131,8 @@ struct elevator_queue }; #define ELEVATOR_FLAG_REGISTERED 0 -#define ELEVATOR_FLAG_DISABLE_WBT 1 +#define ELEVATOR_FLAG_DYING 1 +#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface @@ -151,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); -extern struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *); +struct elevator_queue *elevator_alloc(struct request_queue *, + struct elevator_type *, struct elevator_tags *); /* * Helper functions. @@ -182,4 +193,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) +void blk_mq_sched_reg_debugfs(struct request_queue *q); +void blk_mq_sched_unreg_debugfs(struct request_queue *q); + #endif /* _ELEVATOR_H */ diff --git a/block/fops.c b/block/fops.c index 82b672d15ea4..82451ac8ff25 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; @@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; + if (iov_iter_rw(iter) == WRITE) { + u16 max_write_streams = bdev_max_write_streams(bdev); + + if (iocb->ki_write_stream) { + if (iocb->ki_write_stream > max_write_streams) + return -EINVAL; + } else if (max_write_streams) { + enum rw_hint write_hint = + file_inode(iocb->ki_filp)->i_write_hint; + + /* + * Just use the write hint as write stream for block + * device writes. 
This assumes no file system is + * mounted that would use the streams differently. + */ + if (write_hint <= max_write_streams) + iocb->ki_write_stream = write_hint; + } + } + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) @@ -451,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); - err = write_cache_pages(mapping, wbc, block_write_full_folio, - blkdev_get_block); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) + err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; @@ -472,18 +496,21 @@ static void blkdev_readahead(struct readahead_control *rac) mpage_readahead(rac, blkdev_get_block); } -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int blkdev_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int blkdev_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); @@ -513,30 +540,42 @@ static void blkdev_readahead(struct readahead_control *rac) iomap_readahead(rac, &blkdev_iomap_ops); } -static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) +static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { - loff_t isize = i_size_read(inode); + loff_t isize = i_size_read(wpc->inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; - return blkdev_iomap_begin(inode, offset, isize - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) { + int error; + + error = blkdev_iomap_begin(wpc->inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); + if (error) + return error; + } + + return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops blkdev_writeback_ops = { - .map_blocks = blkdev_map_blocks, + .writeback_range = blkdev_writeback_range, + .writeback_submit = iomap_ioend_writeback_submit, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct iomap_writepage_ctx wpc = { }; + struct iomap_writepage_ctx wpc = { + .inode = mapping->host, + .wbc = wbc, + .ops = &blkdev_writeback_ops + }; - return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); + return iomap_writepages(&wpc); } const struct address_space_operations def_blk_aops = { @@ -687,7 +726,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t 
blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { - return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL, + NULL); } /* @@ -817,7 +857,7 @@ reexpand: #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) @@ -826,11 +866,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; + unsigned int flags; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the device does not enable the + * unmap write zeroes operation. + */ + if ((mode & FALLOC_FL_WRITE_ZEROES) && + !bdev_write_zeroes_unmap_sectors(bdev)) + return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); @@ -853,48 +901,46 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); - /* - * Invalidate the page cache, including dirty pages, for valid - * de-allocate mode calls to fallocate(). - */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOFALLBACK); + flags = BLKDEV_ZERO_NOFALLBACK; + break; + case FALLOC_FL_WRITE_ZEROES: + flags = 0; break; default: error = -EOPNOTSUPP; + goto fail; } + /* + * Invalidate the page cache, including dirty pages, for valid + * de-allocate mode calls to fallocate(). 
+ */ + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +static int blkdev_mmap_prepare(struct vm_area_desc *desc) { - struct inode *bd_inode = bdev_file_inode(file); + struct file *file = desc->file; - if (bdev_read_only(I_BDEV(bd_inode))) - return generic_file_readonly_mmap(file, vma); + if (bdev_read_only(I_BDEV(bdev_file_inode(file)))) + return generic_file_readonly_mmap_prepare(desc); - return generic_file_mmap(file, vma); + return generic_file_mmap_prepare(desc); } const struct file_operations def_blk_fops = { @@ -904,7 +950,7 @@ const struct file_operations def_blk_fops = { .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, - .mmap = blkdev_mmap, + .mmap_prepare = blkdev_mmap_prepare, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT diff --git a/block/genhd.c b/block/genhd.c index c2bd86cd09de..9bbc38d12792 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -125,37 +125,50 @@ static void part_stat_read_all(struct block_device *part, } } -unsigned int part_in_flight(struct block_device *part) +static void bdev_count_inflight_rw(struct block_device *part, + unsigned int inflight[2], bool mq_driver) { - unsigned int inflight = 0; + int write = 0; + int read = 0; int cpu; + if (mq_driver) { + blk_mq_in_driver_rw(part, inflight); + return; + } + for_each_possible_cpu(cpu) { - inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + - part_stat_local_read_cpu(part, in_flight[1], cpu); + read += part_stat_local_read_cpu(part, in_flight[READ], cpu); + write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu); } - if ((int)inflight < 0) - inflight = 0; - return inflight; + /* + * While iterating all CPUs, some IOs may be issued from a CPU already + * traversed and complete on a CPU that has not yet been traversed, + * causing the inflight number to be negative. + */ + inflight[READ] = read > 0 ? read : 0; + inflight[WRITE] = write > 0 ? write : 0; } -static void part_in_flight_rw(struct block_device *part, - unsigned int inflight[2]) +/** + * bdev_count_inflight - get the number of inflight IOs for a block device. + * + * @part: the block device. + * + * Inflight here means started IO accounting, from bdev_start_io_acct() for + * bio-based block device, and from blk_account_io_start() for rq-based block + * device. + */ +unsigned int bdev_count_inflight(struct block_device *part) { - int cpu; + unsigned int inflight[2] = {0}; - inflight[0] = 0; - inflight[1] = 0; - for_each_possible_cpu(cpu) { - inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); - inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); - } - if ((int)inflight[0] < 0) - inflight[0] = 0; - if ((int)inflight[1] < 0) - inflight[1] = 0; + bdev_count_inflight_rw(part, inflight, false); + + return inflight[READ] + inflight[WRITE]; } +EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. 
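The clamping in bdev_count_inflight_rw() above is easier to see with a small stand-alone model of the per-CPU accounting. The sketch below is illustrative only and is not part of the patch: NR_CPUS, the in_flight array and sum_inflight() are invented names that mimic how a completion accounted on one CPU, whose matching start lands on a CPU the reader has already summed, makes an unsynchronized total go transiently negative and forces the clamp to zero.

#include <stdio.h>

#define NR_CPUS 4

/* Toy per-CPU counters: in_flight[cpu][0] = reads, in_flight[cpu][1] = writes. */
static long in_flight[NR_CPUS][2];

/* Sum one direction over all CPUs and clamp a transiently negative result. */
static unsigned int sum_inflight(int rw)
{
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += in_flight[cpu][rw];
	return sum > 0 ? sum : 0;
}

int main(void)
{
	/*
	 * Model the state a racing reader can observe: the completion was
	 * accounted on CPU 0, but the matching start was accounted on a CPU
	 * the reader had already summed, so only the decrement is visible.
	 */
	in_flight[0][0] = -1;

	/* Prints 0 instead of the huge value an unclamped unsigned sum would give. */
	printf("reads in flight: %u\n", sum_inflight(0));
	return 0;
}

The kernel code does the same separately for READ and WRITE, and for rq-based devices asks the blk-mq driver-side counters via blk_mq_in_driver_rw() instead of walking the per-CPU statistics.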
@@ -389,19 +402,35 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) return ret; } -/** - * add_disk_fwnode - add disk information to kernel list with fwnode - * @parent: parent device for the disk - * @disk: per-device partitioning information - * @groups: Additional per-device sysfs groups - * @fwnode: attached disk fwnode - * - * This function registers the partitioning information in @disk - * with the kernel. Also attach a fwnode to the disk device. - */ -int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - struct fwnode_handle *fwnode) +static void add_disk_final(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + /* Make sure the first partition scan will be proceed */ + if (get_capacity(disk) && disk_has_partscan(disk)) + set_bit(GD_NEED_PART_SCAN, &disk->state); + + bdev_add(disk->part0, ddev->devt); + if (get_capacity(disk)) + disk_scan_partitions(disk, BLK_OPEN_READ); + + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + } + + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); + disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); +} + +static int __add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); @@ -416,12 +445,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, */ if (disk->fops->submit_bio || disk->fops->poll_bio) return -EINVAL; - - /* - * Initialize the I/O scheduler code and pick a default one if - * needed. - */ - elevator_init_mq(disk->queue); } else { if (!disk->fops->submit_bio) return -EINVAL; @@ -438,7 +461,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) - goto out_exit_elevator; + goto out; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -448,14 +471,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) - goto out_exit_elevator; + goto out; } else { if (WARN_ON(disk->minors)) - goto out_exit_elevator; + goto out; ret = blk_alloc_ext_minor(); if (ret < 0) - goto out_exit_elevator; + goto out; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } @@ -516,21 +539,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; - - /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && disk_has_partscan(disk)) - set_bit(GD_NEED_PART_SCAN, &disk->state); - - bdev_add(disk->part0, ddev->devt); - if (get_capacity(disk)) - disk_scan_partitions(disk, BLK_OPEN_READ); - - /* - * Announce the disk and partitions after all partitions are - * created. 
(for hidden disks uevents remain suppressed forever) - */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not @@ -539,10 +547,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - - blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); - disk_add_events(disk); - set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: @@ -564,12 +568,46 @@ out_device_del: out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); -out_exit_elevator: - if (disk->queue->elevator) { - mutex_lock(&disk->queue->elevator_lock); - elevator_exit(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); +out: + return ret; +} + +/** + * add_disk_fwnode - add disk information to kernel list with fwnode + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * @fwnode: attached disk fwnode + * + * This function registers the partitioning information in @disk + * with the kernel. Also attach a fwnode to the disk device. + */ +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + int ret; + + if (queue_is_mq(disk->queue)) { + set = disk->queue->tag_set; + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + ret = __add_disk(parent, disk, groups, fwnode); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } else { + ret = __add_disk(parent, disk, groups, fwnode); } + + /* + * add_disk_final() doesn't need to read `nr_hw_queues`, so move it out + * of the read lock `set->update_nr_hwq_lock` to avoid an unnecessary + * lock dependency on `disk->open_mutex` from partition scanning. + */ + if (!ret) + add_disk_final(disk); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ -652,26 +690,7 @@ void blk_mark_disk_dead(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); -/** - * del_gendisk - remove the gendisk - * @disk: the struct gendisk to remove - * - * Removes the gendisk and all its associated resources. This deletes the - * partitions associated with the gendisk, and unregisters the associated - * request_queue. - * - * This is the counter to the respective __device_add_disk() call. - * - * The final removal of the struct gendisk happens when its refcount reaches 0 - * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. - * - * Drivers exist which depend on the release of the gendisk to be synchronous, - * it should not be deferred. 
- * - * Context: can sleep - */ -void del_gendisk(struct gendisk *disk) +static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; @@ -743,14 +762,7 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); - blk_mq_quiesce_queue(q); - if (q->elevator) { - mutex_lock(&q->elevator_lock); - elevator_exit(q); - mutex_unlock(&q->elevator_lock); - } rq_qos_exit(q); - blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests @@ -764,6 +776,55 @@ void del_gendisk(struct gendisk *disk) if (start_drain) blk_unfreeze_release_lock(q); } + +static void disable_elv_switch(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + WARN_ON_ONCE(!queue_is_mq(q)); + + down_write(&set->update_nr_hwq_lock); + blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); + up_write(&set->update_nr_hwq_lock); +} + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + + if (!queue_is_mq(disk->queue)) { + __del_gendisk(disk); + } else { + set = disk->queue->tag_set; + + disable_elv_switch(disk->queue); + + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + __del_gendisk(disk); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } +} EXPORT_SYMBOL(del_gendisk); /** @@ -1005,7 +1066,7 @@ ssize_t part_stat_show(struct device *dev, struct disk_stats stat; unsigned int inflight; - inflight = part_in_flight(bdev); + inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1042,19 +1103,21 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } +/* + * Show the number of IOs issued to driver. 
+ * For bio-based device, started from bdev_start_io_acct(); + * For rq-based device, started from blk_mq_start_request(); + */ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); - unsigned int inflight[2]; + unsigned int inflight[2] = {0}; - if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, bdev, inflight); - else - part_in_flight_rw(bdev, inflight); + bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); - return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); + return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } static ssize_t disk_capability_show(struct device *dev, @@ -1240,6 +1303,7 @@ static void disk_release(struct device *dev) disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); + kobject_put(&disk->queue_kobj); disk->queue->disk = NULL; blk_put_queue(disk->queue); @@ -1307,7 +1371,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - inflight = part_in_flight(hd); + inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); @@ -1422,6 +1486,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif + mutex_init(&disk->rqos_state_mutex); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); return disk; out_erase_part0: diff --git a/block/ioctl.c b/block/ioctl.c index e472cc1030c6..f7b0006ca45d 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -13,6 +13,7 @@ #include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/io_uring/cmd.h> +#include <linux/blk-integrity.h> #include <uapi/linux/blkdev.h> #include "blk.h" #include "blk-crypto-internal.h" @@ -644,7 +645,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); default: - return -ENOIOCTLCMD; + return blk_get_meta_cap(bdev, cmd, argp); } } diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 0f0f8452609a..70cbc7b2deb4 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -157,10 +157,7 @@ struct kyber_queue_data { */ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; - /* - * Async request percentage, converted to per-word depth for - * sbitmap_get_shallow(). - */ + /* Number of allowed async requests. 
*/ unsigned int async_depth; struct kyber_cpu_latency __percpu *cpu_latency; @@ -276,7 +273,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd, static void kyber_timer_fn(struct timer_list *t) { - struct kyber_queue_data *kqd = from_timer(kqd, t, timer); + struct kyber_queue_data *kqd = timer_container_of(kqd, t, timer); unsigned int sched_domain; int cpu; bool bad = false; @@ -402,20 +399,13 @@ err: return ERR_PTR(ret); } -static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) +static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; kqd = kyber_queue_data_alloc(q); - if (IS_ERR(kqd)) { - kobject_put(&eq->kobj); + if (IS_ERR(kqd)) return PTR_ERR(kqd); - } blk_stat_enable_accounting(q); @@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags.sb.shift; - - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 754f6b7415cd..b9b7cdf1d3c9 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -488,20 +488,6 @@ unlock: } /* - * 'depth' is a number in the range 1..INT_MAX representing a number of - * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since - * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). - * Values larger than q->nr_requests have the same effect as q->nr_requests. - */ -static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) -{ - struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; - const unsigned int nrr = hctx->queue->nr_requests; - - return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; -} - -/* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ @@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); + data->shallow_depth = dd->async_depth; } /* Called by blk_mq_update_nr_requests(). */ @@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; - struct elevator_queue *eq; enum dd_prio prio; - int ret = -ENOMEM; - - eq = elevator_alloc(q, e); - if (!eq) - return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) - goto put_eq; + return -ENOMEM; eq->elevator_data = dd; @@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) q->elevator = eq; return 0; - -put_eq: - kobject_put(&eq->kobj); - return ret; } /* @@ -715,7 +691,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } /* - * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). 
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, diff --git a/block/t10-pi.c b/block/t10-pi.c index 851db518ee5e..0c4ed9702146 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter, pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -125,7 +125,7 @@ next: static void t10_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -299,7 +299,7 @@ next: static void ext_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; |