diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/badblocks.c | 5 | ||||
-rw-r--r-- | block/bdev.c | 17 | ||||
-rw-r--r-- | block/bfq-iosched.c | 35 | ||||
-rw-r--r-- | block/bfq-iosched.h | 3 | ||||
-rw-r--r-- | block/bio-integrity.c | 30 | ||||
-rw-r--r-- | block/bio.c | 15 | ||||
-rw-r--r-- | block/blk-cgroup.c | 23 | ||||
-rw-r--r-- | block/blk-core.c | 27 | ||||
-rw-r--r-- | block/blk-merge.c | 54 | ||||
-rw-r--r-- | block/blk-mq-cpumap.c | 36 | ||||
-rw-r--r-- | block/blk-mq.c | 88 | ||||
-rw-r--r-- | block/blk-mq.h | 8 | ||||
-rw-r--r-- | block/blk-settings.c | 28 | ||||
-rw-r--r-- | block/blk-sysfs.c | 12 | ||||
-rw-r--r-- | block/blk-throttle.c | 15 | ||||
-rw-r--r-- | block/blk-zoned.c | 109 | ||||
-rw-r--r-- | block/blk.h | 3 | ||||
-rw-r--r-- | block/bounce.c | 2 | ||||
-rw-r--r-- | block/elevator.c | 3 | ||||
-rw-r--r-- | block/fops.c | 21 | ||||
-rw-r--r-- | block/genhd.c | 22 | ||||
-rw-r--r-- | block/ioctl.c | 6 | ||||
-rw-r--r-- | block/kyber-iosched.c | 9 | ||||
-rw-r--r-- | block/mq-deadline.c | 29 | ||||
-rw-r--r-- | block/partitions/efi.c | 2 | ||||
-rw-r--r-- | block/partitions/ldm.h | 2 | ||||
-rw-r--r-- | block/partitions/mac.c | 18 |
27 files changed, 384 insertions, 238 deletions
diff --git a/block/badblocks.c b/block/badblocks.c index db4ec8b9b2a8..a9709771a101 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -1349,14 +1349,15 @@ re_check: len = sectors; update_sectors: + /* This situation should never happen */ + WARN_ON(sectors < len); + s += len; sectors -= len; if (sectors > 0) goto re_check; - WARN_ON(sectors < 0); - if (unacked_badblocks > 0) rv = -1; else if (acked_badblocks > 0) diff --git a/block/bdev.c b/block/bdev.c index 738e3c8457e7..e7daca6565ea 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -168,9 +168,26 @@ int set_blocksize(struct file *file, int size) /* Don't change the size if it is same as current */ if (inode->i_blkbits != blksize_bits(size)) { + /* + * Flush and truncate the pagecache before we reconfigure the + * mapping geometry because folio sizes are variable now. If a + * reader has already allocated a folio whose size is smaller + * than the new min_order but invokes readahead after the new + * min_order becomes visible, readahead will think there are + * "zero" blocks per folio and crash. Take the inode and + * invalidation locks to avoid racing with + * read/write/fallocate. + */ + inode_lock(inode); + filemap_invalidate_lock(inode->i_mapping); + sync_blockdev(bdev); + kill_bdev(bdev); + inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); + filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); } return 0; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index cad16c163611..68359e1b92e2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -701,17 +701,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - int depth; - unsigned limit = data->q->nr_requests; - unsigned int act_idx; + unsigned int limit, act_idx; /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) { - depth = 0; - } else { - depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; - limit = (limit * depth) >> bfqd->full_depth_shift; - } + if (op_is_sync(opf) && !op_is_write(opf)) + limit = data->q->nr_requests; + else + limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { /* Fast path to check if bfqq is already allocated. */ @@ -725,14 +721,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * available requests and thus starve other entities. */ if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { - depth = 1; + limit = 1; break; } } + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); - if (depth) - data->shallow_depth = depth; + __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit); + + if (limit < data->q->nr_requests) + data->shallow_depth = limit; } static struct bfq_queue * @@ -7128,9 +7126,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) */ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int depth = 1U << bt->sb.shift; + unsigned int nr_requests = bfqd->queue->nr_requests; - bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -7142,13 +7139,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max(depth >> 1, 1U); + bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); + bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -7158,9 +7155,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); + bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); + bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 687a3a7ba784..31217f196f4f 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -813,8 +813,7 @@ struct bfq_data { * Depth limits used in bfq_limit_depth (see comments on the * function) */ - unsigned int word_depths[2][2]; - unsigned int full_depth_shift; + unsigned int async_depths[2][2]; /* * Number of independent actuators. This is equal to 1 in diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 88e3ad73c385..456026c4a3c9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -104,31 +104,28 @@ err: } EXPORT_SYMBOL(bio_integrity_alloc); -static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, - bool dirty) +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs) { int i; - for (i = 0; i < nr_vecs; i++) { - if (dirty && !PageCompound(bv[i].bv_page)) - set_page_dirty_lock(bv[i].bv_page); + for (i = 0; i < nr_vecs; i++) unpin_user_page(bv[i].bv_page); - } } static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { - unsigned short nr_vecs = bip->bip_max_vcnt - 1; - struct bio_vec *copy = &bip->bip_vec[1]; - size_t bytes = bip->bip_iter.bi_size; - struct iov_iter iter; + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; + size_t bytes = bounce_bvec->bv_len; + struct iov_iter orig_iter; int ret; - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); WARN_ON_ONCE(ret != bytes); - bio_integrity_unpin_bvec(copy, nr_vecs, true); + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs); } /** @@ -148,8 +145,7 @@ void bio_integrity_unmap_user(struct bio *bio) return; } - bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, - bio_data_dir(bio) == READ); + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt); } /** @@ -235,7 +231,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, } if (write) - bio_integrity_unpin_bvec(bvec, nr_vecs, false); + bio_integrity_unpin_bvec(bvec, nr_vecs); else memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); @@ -361,7 +357,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, return 0; release_pages: - bio_integrity_unpin_bvec(bvec, nr_bvecs, false); + bio_integrity_unpin_bvec(bvec, nr_bvecs); free_bvec: if (bvec != stack_vec) kfree(bvec); diff --git a/block/bio.c b/block/bio.c index ac4d77c88932..094a5adf79d2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -77,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); @@ -611,7 +611,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_vecs > UIO_MAXIOV) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } @@ -1156,9 +1156,10 @@ EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { + unsigned long nr = off / PAGE_SIZE; + WARN_ON_ONCE(len > UINT_MAX); - WARN_ON_ONCE(off > UINT_MAX); - __bio_add_page(bio, &folio->page, len, off); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); @@ -1179,9 +1180,11 @@ EXPORT_SYMBOL_GPL(bio_add_folio_nofail); bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { - if (len > UINT_MAX || off > UINT_MAX) + unsigned long nr = off / PAGE_SIZE; + + if (len > UINT_MAX) return false; - return bio_add_page(bio, &folio->page, len, off) > 0; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 45a395862fbc..643d6bf66522 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1138,6 +1138,7 @@ static void blkcg_fill_root_iostats(void) blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) @@ -1724,27 +1725,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* - * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy - * without pd_alloc_fn/pd_free_fn can't be activated. - */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1755,8 +1756,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; diff --git a/block/blk-core.c b/block/blk-core.c index 4f791a3114a1..c7b6c1f76359 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -629,8 +629,14 @@ static void __submit_bio(struct bio *bio) blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); } @@ -805,12 +811,6 @@ void submit_bio_noacct(struct bio *bio) } } - if (!(q->limits.features & BLK_FEAT_POLL) && - (bio->bi_opf & REQ_POLLED)) { - bio_clear_polled(bio); - goto not_supported; - } - switch (bio_op(bio)) { case REQ_OP_READ: break; @@ -935,7 +935,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) + if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); @@ -956,7 +956,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) } else { struct gendisk *disk = q->disk; - if (disk && disk->fops->poll_bio) + if ((q->limits.features & BLK_FEAT_POLL) && disk && + disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); @@ -1120,8 +1121,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) return; plug->cur_ktime = 0; - plug->mq_list = NULL; - plug->cached_rq = NULL; + rq_list_init(&plug->mq_list); + rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; @@ -1217,7 +1218,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ - if (unlikely(!rq_list_empty(plug->cached_rq))) + if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5baa950f34fe..7ddd7dd23dda 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -864,12 +864,13 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; - /* Don't merge requests with different write hints. */ - if (req->write_hint != next->write_hint) - return NULL; - - if (req->ioprio != next->ioprio) - return NULL; + if (req->bio && next->bio) { + /* Don't merge requests with different write hints. */ + if (req->bio->bi_write_hint != next->bio->bi_write_hint) + return NULL; + if (req->bio->bi_ioprio != next->bio->bi_ioprio) + return NULL; + } if (!blk_atomic_write_mergeable_rqs(req, next)) return NULL; @@ -998,12 +999,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - /* Don't merge requests with different write hints. */ - if (rq->write_hint != bio->bi_write_hint) - return false; - - if (rq->ioprio != bio_prio(bio)) - return false; + if (rq->bio) { + /* Don't merge requests with different write hints. */ + if (rq->bio->bi_write_hint != bio->bi_write_hint) + return false; + if (rq->bio->bi_ioprio != bio->bi_ioprio) + return false; + } if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) return false; @@ -1175,23 +1177,23 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, struct blk_plug *plug = current->plug; struct request *rq; - if (!plug || rq_list_empty(plug->mq_list)) + if (!plug || rq_list_empty(&plug->mq_list)) return false; - rq_list_for_each(&plug->mq_list, rq) { - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; - break; - } + rq = plug->mq_list.tail; + if (rq->q == q) + return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK; + else if (!plug->multiple_queues) + return false; - /* - * Only keep iterating plug list for merges if we have multiple - * queues - */ - if (!plug->multiple_queues) - break; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q != q) + continue; + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; } return false; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9638b25fd521..444798c5374f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/group_cpus.h> +#include <linux/device/bus.h> #include "blk.h" #include "blk-mq.h" @@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) return NUMA_NO_NODE; } + +/** + * blk_mq_map_hw_queues - Create CPU to hardware queue mapping + * @qmap: CPU to hardware queue map + * @dev: The device to map queues + * @offset: Queue offset to use for the device + * + * Create a CPU to hardware queue mapping in @qmap. The struct bus_type + * irq_get_affinity callback will be used to retrieve the affinity. + */ +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset) + +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!dev->bus->irq_get_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = dev->bus->irq_get_affinity(dev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return; + +fallback: + blk_mq_map_queues(qmap); +} +EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e76651e786d..e1bca29dc358 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -506,7 +506,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add(data->cached_rq, rq); + rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) @@ -515,7 +515,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; - return rq_list_pop(data->cached_rq); + return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) @@ -612,7 +612,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, .flags = flags, .cmd_flags = opf, .nr_tags = plug->nr_ios, - .cached_rq = &plug->cached_rq, + .cached_rqs = &plug->cached_rqs, }; struct request *rq; @@ -637,14 +637,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; - if (rq_list_empty(plug->cached_rq)) { + if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; @@ -653,7 +653,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; - plug->cached_rq = rq_list_next(rq); + rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, 0); } @@ -830,7 +830,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } @@ -870,7 +870,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* @@ -1386,8 +1386,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); + rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } @@ -2655,7 +2654,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; - rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, @@ -2781,7 +2779,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug) blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); + bool last = rq_list_empty(&plug->mq_list); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2824,8 +2822,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); @@ -2839,12 +2836,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { - rq_list_add_tail(&requeue_lastp, rq); + rq_list_add_tail(&requeue_list, rq); continue; } - list_add(&rq->queuelist, &list); + list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); @@ -2899,19 +2896,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug)); - if (rq_list_empty(plug->mq_list)) + if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_plug_list(plug, from_schedule); - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2976,7 +2973,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; + data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); @@ -2999,7 +2996,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && @@ -3013,14 +3010,14 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (rq_list_pop(&plug->cached_rqs) != rq) + WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ - plug->cached_rq = rq_list_next(rq); rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, 0); @@ -3092,14 +3089,21 @@ void blk_mq_submit_bio(struct bio *bio) } /* - * Device reconfiguration may change logical block size, so alignment - * check has to be done with queue usage counter held + * Device reconfiguration may change logical block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } + if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; @@ -3110,8 +3114,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: if (!rq) { @@ -3302,8 +3308,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - rq->write_hint = rq_src->write_hint; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -4325,12 +4330,6 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static bool blk_mq_can_poll(struct blk_mq_tag_set *set) -{ - return set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues; -} - struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { @@ -4341,7 +4340,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; - if (blk_mq_can_poll(set)) + if (set->nr_maps > HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); @@ -5029,8 +5028,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - struct queue_limits lim; - blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { @@ -5044,13 +5041,6 @@ fallback: set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } - lim = queue_limits_start_update(q); - if (blk_mq_can_poll(set)) - lim.features |= BLK_FEAT_POLL; - else - lim.features &= ~BLK_FEAT_POLL; - if (queue_limits_commit_update(q, &lim) < 0) - pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } @@ -5110,9 +5100,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - - return blk_hctx_poll(q, hctx, iob, flags); + if (!blk_mq_can_poll(q)) + return 0; + return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index f4ac1af77a26..a80d3b3105f9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -155,7 +155,7 @@ struct blk_mq_alloc_data { /* allocate multiple requests/tags in one go */ unsigned int nr_tags; - struct request **cached_rq; + struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -451,4 +451,10 @@ do { \ #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ +static inline bool blk_mq_can_poll(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_POLL) && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues; +} + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 7abf034089cd..9ae3eee4b5ae 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. + * + * There is no hardware limitation for the read-ahead size and the user + * might have increased the read-ahead size through sysfs, so don't ever + * decrease it. */ - bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->ra_pages = max3(bdi->ra_pages, + lim->io_opt * 2 / PAGE_SIZE, + VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } @@ -118,6 +124,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } + if (lim->features & BLK_FEAT_BOUNCE_HIGH) { + pr_warn("no bounce buffer support for integrity metadata\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; @@ -309,12 +320,19 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + /* + * When discard is not supported, discard_granularity should be reported + * as 0 to userspace. + */ + if (lim->max_discard_sectors) + lim->discard_granularity = + max(lim->discard_granularity, lim->physical_block_size); + else + lim->discard_granularity = 0; + if (!lim->max_discard_segments) lim->max_discard_segments = 1; - if (lim->discard_granularity < lim->physical_block_size) - lim->discard_granularity = lim->physical_block_size; - /* * By default there is no limit on the segment boundary alignment, * but if there is one it can't be smaller than the page size as @@ -605,7 +623,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } /* chunk_sectors a multiple of the physical block size? */ - if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) { t->chunk_sectors = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 207577145c54..6a38f312e385 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -256,10 +256,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ !!(disk->queue->limits.features & _feature)); \ } -QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); +static ssize_t queue_poll_show(struct gendisk *disk, char *page) +{ + if (queue_is_mq(disk->queue)) + return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + return sysfs_emit(page, "%u\n", + !!(disk->queue->limits.features & BLK_FEAT_POLL)); +} + static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) @@ -806,6 +813,8 @@ out_unregister_ia_ranges: out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); + if (queue_is_mq(q)) + blk_mq_sysfs_unregister(disk); out_put_queue_kobj: kobject_put(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); @@ -859,4 +868,5 @@ void blk_unregister_queue(struct gendisk *disk) mutex_unlock(&q->sysfs_dir_lock); blk_debugfs_remove(disk); + kobject_put(&disk->queue_kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2c4192e12efa..6b82fcbd7e77 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1593,13 +1593,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) return tg_may_dispatch(tg, bio, NULL); } -static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) -{ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) - tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); - tg->carryover_ios[rw]--; -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -1636,10 +1629,12 @@ bool __blk_throtl_bio(struct bio *bio) /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. - * Debts are handled by carryover_bytes/ios while - * calculating wait time. + * + * Charge and dispatch directly, and our throttle + * control algorithm is adaptive, and extra IO bytes + * will be throttled for paying the debt */ - tg_dispatch_in_debt(tg, bio, rw); + throtl_charge_bio(tg, bio); } else { /* if above limits, break to queue */ break; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 767bcbce74fa..24c80078ca44 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -347,6 +347,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, op = REQ_OP_ZONE_RESET; /* Invalidate the page cache, including dirty pages. */ + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); ret = blkdev_truncate_zone_range(bdev, mode, &zrange); if (ret) @@ -368,8 +369,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: - if (cmd == BLKRESETZONE) + if (cmd == BLKRESETZONE) { filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); + } return ret; } @@ -427,13 +430,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } -static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, - sector_t sector) +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); @@ -454,6 +458,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, return NULL; } +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = @@ -518,6 +531,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk, zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } @@ -607,6 +621,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } @@ -1055,6 +1074,47 @@ plug: return true; } +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must aborts the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there is no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1071,25 +1131,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; - if (!bdev->bd_disk->zone_wplugs_hash) - return false; - - /* - * If the BIO already has the plugging flag set, then it was already - * handled through this path and this is a submission from the zone - * plug bio submit work. - */ - if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) - return false; - - /* - * We do not need to do anything special for empty flush BIOs, e.g - * BIOs such as issued by blkdev_issue_flush(). The is because it is - * the responsibility of the user to first wait for the completion of - * write operations for flush to have any effect on the persistence of - * the written data. - */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* @@ -1111,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: - if (!bdev_emulates_zone_append(bdev)) + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); return false; + } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: @@ -1178,6 +1222,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= REQ_OP_ZONE_APPEND; + bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); } /* @@ -1259,7 +1304,6 @@ again: spin_unlock_irqrestore(&zwplug->lock, flags); bdev = bio->bi_bdev; - submit_bio_noacct_nocheck(bio); /* * blk-mq devices will reuse the extra reference on the request queue @@ -1267,8 +1311,12 @@ again: * path for BIO-based devices will not do that. So drop this extra * reference here. */ - if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) + if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { + bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); + } else { + blk_mq_submit_bio(bio); + } put_zwplug: /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ @@ -1299,6 +1347,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, { unsigned int i; + atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); @@ -1353,6 +1402,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) } } + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; @@ -1570,11 +1620,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, } /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. */ - if (!disk->zone_wplugs_hash) + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); diff --git a/block/blk.h b/block/blk.h index 1426f9c28197..e91012247ff2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -482,7 +482,8 @@ static inline void blk_zone_update_request_bio(struct request *rq, * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) + if (req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) bio->bi_iter.bi_sector = rq->__sector; } void blk_zone_write_plug_bio_endio(struct bio *bio); diff --git a/block/bounce.c b/block/bounce.c index 0d898cd5ec49..09a9616cf209 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -41,8 +41,6 @@ static void init_bounce_bioset(void) ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); diff --git a/block/elevator.c b/block/elevator.c index 43ba4ab1ada7..1f76e9efd771 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -752,7 +752,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, ssize_t elv_iosched_show(struct gendisk *disk, char *name) { struct request_queue *q = disk->queue; - struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; @@ -763,7 +762,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); - cur = eq->type; + cur = q->elevator->type; } spin_lock(&elv_list_lock); diff --git a/block/fops.c b/block/fops.c index 13a67940d040..d4b1d942f270 100644 --- a/block/fops.c +++ b/block/fops.c @@ -721,7 +721,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { + /* + * Take i_rwsem and invalidate_lock to avoid racing with + * set_blocksize changing i_blkbits/folio order and punching + * out the pagecache. + */ + inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); + inode_unlock_shared(bd_inode); } if (ret > 0) @@ -732,6 +739,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; @@ -758,16 +766,23 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); - if (ret >= 0) { + if (ret > 0) { iocb->ki_pos += ret; count -= ret; } - iov_iter_revert(to, count - iov_iter_count(to)); + if (ret != -EIOCBQUEUED) + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } + /* + * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize + * changing i_blkbits/folio order and punching out the pagecache. + */ + inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); + inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) @@ -810,6 +825,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; + inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); /* @@ -842,6 +858,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, fail: filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); return error; } diff --git a/block/genhd.c b/block/genhd.c index 8645cf3b0816..99344f53c789 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -778,7 +778,7 @@ static ssize_t disk_badblocks_store(struct device *dev, } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD -void blk_request_module(dev_t devt) +static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -788,14 +788,26 @@ void blk_request_module(dev_t devt) if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); - return; + return true; } } mutex_unlock(&major_names_lock); + return false; +} + +void blk_request_module(dev_t devt) +{ + int error; + + if (blk_probe_dev(devt)) + return; - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); + error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); + /* Make old-style 2.4 aliases work */ + if (error > 0) + error = request_module("block-major-%d", MAJOR(devt)); + if (!error) + blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ diff --git a/block/ioctl.c b/block/ioctl.c index 6554b728bae6..919066b4bb49 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -141,6 +141,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (err) return err; + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) @@ -173,6 +174,7 @@ out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -198,12 +200,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, end > bdev_nr_bytes(bdev)) return -EINVAL; + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -235,6 +239,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, return -EINVAL; /* Invalidate the page cache, including dirty pages */ + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) @@ -245,6 +250,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, fail: filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4155594aefc6..ccfefa6a3669 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -157,10 +157,7 @@ struct kyber_queue_data { */ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; - /* - * Async request percentage, converted to per-word depth for - * sbitmap_get_shallow(). - */ + /* Number of allowed async requests. */ unsigned int async_depth; struct kyber_cpu_latency __percpu *cpu_latency; @@ -454,10 +451,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags.sb.shift; - - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index acdc28756d9d..19473a9b5044 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -488,20 +488,6 @@ unlock: } /* - * 'depth' is a number in the range 1..INT_MAX representing a number of - * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since - * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). - * Values larger than q->nr_requests have the same effect as q->nr_requests. - */ -static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) -{ - struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; - const unsigned int nrr = hctx->queue->nr_requests; - - return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; -} - -/* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ @@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); + data->shallow_depth = dd->async_depth; } /* Called by blk_mq_update_nr_requests(). */ @@ -685,10 +671,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; - if (!rq->elv.priv[0]) { + if (!rq->elv.priv[0]) per_prio->stats.inserted++; - rq->elv.priv[0] = (void *)(uintptr_t)1; - } + rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; @@ -753,18 +738,14 @@ static void dd_prepare_request(struct request *rq) */ static void dd_finish_request(struct request *rq) { - struct request_queue *q = rq->q; - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(rq); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; + struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ - if (rq->elv.priv[0]) + if (per_prio) atomic_inc(&per_prio->stats.completed); } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 5e9be13a56a8..7acba66eed48 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) out[size] = 0; while (i < size) { - u8 c = le16_to_cpu(in[i]) & 0xff; + u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index e259180c8914..aa3bd050d8cd 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Part of the Linux-NTFS project. * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d68..b02530d98629 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. + */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, §); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", |