diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 22:02:48 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 22:02:48 +0300 |
commit | 582cd91f69de8e44857cb610ebca661dac8656b7 (patch) | |
tree | 0d680db02a5c236ee87b408b3f13ce33ebaca907 | |
parent | bd018bbaa58640da786d4289563e71c5ef3938c7 (diff) | |
parent | f885056a48ccf4ad4332def91e973f3993fa8695 (diff) | |
download | linux-582cd91f69de8e44857cb610ebca661dac8656b7.tar.xz |
Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:
"Another nice round of removing more code than what is added, mostly
due to Christoph's relentless pursuit of tech debt removal/cleanups.
This pull request contains:
- Two series of BFQ improvements (Paolo, Jan, Jia)
- Block iov_iter improvements (Pavel)
- bsg error path fix (Pan)
- blk-mq scheduler improvements (Jan)
- -EBUSY discard fix (Jan)
- bvec allocation improvements (Ming, Christoph)
- bio allocation and init improvements (Christoph)
- Store bdev pointer in bio instead of gendisk + partno (Christoph)
- Block trace point cleanups (Christoph)
- hard read-only vs read-only split (Christoph)
- Block based swap cleanups (Christoph)
- Zoned write granularity support (Damien)
- Various fixes/tweaks (Chunguang, Guoqing, Lei, Lukas, Huhai)"
* tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block: (104 commits)
mm: simplify swapdev_block
sd_zbc: clear zone resources for non-zoned case
block: introduce blk_queue_clear_zone_settings()
zonefs: use zone write granularity as block size
block: introduce zone_write_granularity limit
block: use blk_queue_set_zoned in add_partition()
nullb: use blk_queue_set_zoned() to setup zoned devices
nvme: cleanup zone information initialization
block: document zone_append_max_bytes attribute
block: use bi_max_vecs to find the bvec pool
md/raid10: remove dead code in reshape_request
block: mark the bio as cloned in bio_iov_bvec_set
block: set BIO_NO_PAGE_REF in bio_iov_bvec_set
block: remove a layer of indentation in bio_iov_iter_get_pages
block: turn the nr_iovecs argument to bio_alloc* into an unsigned short
block: remove the 1 and 4 vec bvec_slabs entries
block: streamline bvec_alloc
block: factor out a bvec_alloc_gfp helper
block: move struct biovec_slab to bio.c
block: reuse BIO_INLINE_VECS for integrity bvecs
...
148 files changed, 1343 insertions, 1608 deletions
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index 36771a131b56..ddb867e0185b 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -40,6 +40,8 @@ normal code doesn't have to deal with bi_bvec_done. There is a lower level advance function - bvec_iter_advance() - which takes a pointer to a biovec, not a bio; this is used by the bio integrity code. +As of 5.12 bvec segments with zero bv_len are not supported. + What's all this get us? ======================= diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 2638d3446b79..4dc7f0d499a8 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -261,6 +261,12 @@ For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES is not supported. +zone_append_max_bytes (RO) +-------------------------- +This is the maximum number of bytes that can be written to a sequential +zone of a zoned block device using a zone append write operation +(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices. + zoned (RO) ---------- This indicates if the device is a zoned block device and the zone model of the @@ -273,4 +279,11 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC do not support zone commands, they will be treated as regular block devices and zoned will report "none". +zone_write_granularity (RO) +--------------------------- +This indicates the alignment constraint, in bytes, for write operations in +sequential zones of zoned block devices (devices with a zoned attributed +that reports "host-managed" or "host-aware"). This value is always 0 for +regular block devices. + Jens Axboe <jens.axboe@oracle.com>, February 2009 diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 81c05baa8312..35ed01a5fbc9 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 867036aa90b8..1f8cf8e10b34 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -865,3 +865,19 @@ no matter what. Everything is handled by the caller. clone_private_mount() returns a longterm mount now, so the proper destructor of its result is kern_unmount() or kern_unmount_array(). + +--- + +**mandatory** + +zero-length bvec segments are disallowed, they must be filtered out before +passed on to an iterator. + +--- + +**mandatory** + +For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but +uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and +page references stay until I/O has completed, i.e. until ->ki_complete() has +been called or returned with non -EIOCBQUEUED code. diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 92d26c812441..ba808543161a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -61,7 +61,7 @@ struct nfhd_device { static blk_qc_t nfhd_submit_bio(struct bio *bio) { - struct nfhd_device *dev = bio->bi_disk->private_data; + struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; int dir, len, shift; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 3447556d276d..fc09be7b1347 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -103,7 +103,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, static blk_qc_t simdisk_submit_bio(struct bio *bio) { - struct simdisk *dev = bio->bi_disk->private_data; + struct simdisk *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; sector_t sector = bio->bi_iter.bi_sector; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e81d1052091..b398dde53af9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -158,7 +158,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -1024,9 +1023,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, else bfq_clear_bfqq_IO_bound(bfqq); + bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns; + bfqq->inject_limit = bic->saved_inject_limit; + bfqq->decrease_time_jif = bic->saved_decrease_time_jif; + bfqq->entity.new_weight = bic->saved_weight; bfqq->ttime = bic->saved_ttime; + bfqq->io_start_time = bic->saved_io_start_time; + bfqq->tot_idle_time = bic->saved_tot_idle_time; bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; @@ -1647,6 +1653,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +static bool bfq_better_to_idle(struct bfq_queue *bfqq); + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1671,15 +1679,19 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense). + * - is linked to a bfq_io_cq (it is not shared in any sense), + * - has a default weight (otherwise we assume the user wanted + * to control its weight explicitly) */ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0; - *interactive = !in_burst && idle_for_long_time; + bfqq->dispatched == 0 && + bfqq->entity.new_weight == 40; + *interactive = !in_burst && idle_for_long_time && + bfqq->entity.new_weight == 40; wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && @@ -1717,17 +1729,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - if (bfqd->low_latency) { if (unlikely(time_is_after_jiffies(bfqq->split_time))) /* wraparound */ @@ -1755,10 +1756,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_add_bfqq_busy(bfqd, bfqq); /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In particular, we care only about two - * cases. The first is that bfqq has to recover a service - * hole, as explained in the comments on + * Expire in-service queue if preemption may be needed for + * guarantees or throughput. As for guarantees, we care + * explicitly about two cases. The first is that bfqq has to + * recover a service hole, as explained in the comments on * bfq_bfqq_update_budg_for_activation(), i.e., that * bfqq_wants_to_preempt is true. However, if bfqq does not * carry time-critical I/O, then bfqq's bandwidth is less @@ -1785,11 +1786,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * timestamps of the in-service queue would need to be * updated, and this operation is quite costly (see the * comments on bfq_bfqq_update_budg_for_activation()). + * + * As for throughput, we ask bfq_better_to_idle() whether we + * still need to plug I/O dispatching. If bfq_better_to_idle() + * says no, then plugging is not needed any longer, either to + * boost throughput or to perserve service guarantees. Then + * the best option is to stop plugging I/O, as not doing so + * would certainly lower throughput. We may end up in this + * case if: (1) upon a dispatch attempt, we detected that it + * was better to plug I/O dispatch, and to wait for a new + * request to arrive for the currently in-service queue, but + * (2) this switch of bfqq to busy changes the scenario. */ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || + !bfq_better_to_idle(bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); @@ -1861,6 +1874,138 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } +static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) +{ + u64 tot_io_time = now_ns - bfqq->io_start_time; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) + bfqq->tot_idle_time += + now_ns - bfqq->ttime.last_end_request; + + if (unlikely(bfq_bfqq_just_created(bfqq))) + return; + + /* + * Must be busy for at least about 80% of the time to be + * considered I/O bound. + */ + if (bfqq->tot_idle_time * 5 > tot_io_time) + bfq_clear_bfqq_IO_bound(bfqq); + else + bfq_mark_bfqq_IO_bound(bfqq); + + /* + * Keep an observation window of at most 200 ms in the past + * from now. + */ + if (tot_io_time > 200 * NSEC_PER_MSEC) { + bfqq->io_start_time = now_ns - (tot_io_time>>1); + bfqq->tot_idle_time >>= 1; + } +} + +/* + * Detect whether bfqq's I/O seems synchronized with that of some + * other queue, i.e., whether bfqq, after remaining empty, happens to + * receive new I/O only right after some I/O request of the other + * queue has been completed. We call waker queue the other queue, and + * we assume, for simplicity, that bfqq may have at most one waker + * queue. + * + * A remarkable throughput boost can be reached by unconditionally + * injecting the I/O of the waker queue, every time a new + * bfq_dispatch_request happens to be invoked while I/O is being + * plugged for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth and latency for + * bfqq. Note that these same results may be achieved with the general + * injection mechanism, but less effectively. For details on this + * aspect, see the comments on the choice of the queue for injection + * in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a queue Q is deemed + * as a waker queue for bfqq if, for three consecutive times, bfqq + * happens to become non empty right after a request of Q has been + * completed. In particular, on the first time, Q is tentatively set + * as a candidate waker queue, while on the third consecutive time + * that Q is detected, the field waker_bfqq is set to Q, to confirm + * that Q is a waker queue for bfqq. These detection steps are + * performed only if bfqq has a long think time, so as to make it more + * likely that bfqq's I/O is actually being blocked by a + * synchronization. This last filter, plus the above three-times + * requirement, make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner throughput can be + * boosted by injecting I/O from the waker queue. Fortunately, + * detection is likely to be actually fast, for the following + * reasons. While blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at least equal to 1 + * (see the comments in bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served during the very + * first I/O-plugging time interval for bfqq. This triggers the first + * step of the detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be confirmed no later than + * during the next I/O-plugging interval for bfqq. + * + * ISSUE + * + * On queue merging all waker information is lost. + */ +static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + u64 now_ns) +{ + if (!bfqd->last_completed_rq_bfqq || + bfqd->last_completed_rq_bfqq == bfqq || + bfq_bfqq_has_short_ttime(bfqq) || + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + return; + + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq) { + /* + * First synchronization detected with a + * candidate waker queue, or with a different + * candidate waker queue from the current one. + */ + bfqq->tentative_waker_bfqq = + bfqd->last_completed_rq_bfqq; + bfqq->num_waker_detections = 1; + } else /* Same tentative waker queue detected again */ + bfqq->num_waker_detections++; + + if (bfqq->num_waker_detections == 3) { + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1868,117 +2013,14 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; + u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - /* - * Detect whether bfqq's I/O seems synchronized with - * that of some other queue, i.e., whether bfqq, after - * remaining empty, happens to receive new I/O only - * right after some I/O request of the other queue has - * been completed. We call waker queue the other - * queue, and we assume, for simplicity, that bfqq may - * have at most one waker queue. - * - * A remarkable throughput boost can be reached by - * unconditionally injecting the I/O of the waker - * queue, every time a new bfq_dispatch_request - * happens to be invoked while I/O is being plugged - * for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth - * and latency for bfqq. Note that these same results - * may be achieved with the general injection - * mechanism, but less effectively. For details on - * this aspect, see the comments on the choice of the - * queue for injection in bfq_select_queue(). - * - * Turning back to the detection of a waker queue, a - * queue Q is deemed as a waker queue for bfqq if, for - * two consecutive times, bfqq happens to become non - * empty right after a request of Q has been - * completed. In particular, on the first time, Q is - * tentatively set as a candidate waker queue, while - * on the second time, the flag - * bfq_bfqq_has_waker(bfqq) is set to confirm that Q - * is a waker queue for bfqq. These detection steps - * are performed only if bfqq has a long think time, - * so as to make it more likely that bfqq's I/O is - * actually being blocked by a synchronization. This - * last filter, plus the above two-times requirement, - * make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner - * throughput can be boosted by injecting I/O from the - * waker queue. Fortunately, detection is likely to be - * actually fast, for the following reasons. While - * blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at - * least equal to 1 (see the comments in - * bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served - * during the very first I/O-plugging time interval - * for bfqq. This triggers the first step of the - * detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be - * confirmed no later than during the next - * I/O-plugging interval for bfqq. - */ - if (bfqd->last_completed_rq_bfqq && - !bfq_bfqq_has_short_ttime(bfqq) && - ktime_get_ns() - bfqd->last_completion < - 200 * NSEC_PER_USEC) { - if (bfqd->last_completed_rq_bfqq != bfqq && - bfqd->last_completed_rq_bfqq != - bfqq->waker_bfqq) { - /* - * First synchronization detected with - * a candidate waker queue, or with a - * different candidate waker queue - * from the current one. - */ - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. - * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. - */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - - bfq_clear_bfqq_has_waker(bfqq); - } else if (bfqd->last_completed_rq_bfqq == - bfqq->waker_bfqq && - !bfq_bfqq_has_waker(bfqq)) { - /* - * synchronization with waker_bfqq - * seen for the second time - */ - bfq_mark_bfqq_has_waker(bfqq); - } - } + bfq_check_waker(bfqd, bfqq, now_ns); /* * Periodically reset inject limit, to make sure that @@ -2047,6 +2089,9 @@ static void bfq_add_request(struct request *rq) } } + if (bfq_bfqq_sync(bfqq)) + bfq_update_io_intensity(bfqq, now_ns); + elv_rb_add(&bfqq->sort_list, rq); /* @@ -2352,6 +2397,24 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { + /* + * If bfqq has been enjoying interactive weight-raising, then + * reset soft_rt_next_start. We do it for the following + * reason. bfqq may have been conveying the I/O needed to load + * a soft real-time application. Such an application actually + * exhibits a soft real-time I/O pattern after it finishes + * loading, and finally starts doing its job. But, if bfqq has + * been receiving a lot of bandwidth so far (likely to happen + * on a fast device), then soft_rt_next_start now contains a + * high value that. So, without this reset, bfqq would be + * prevented from being possibly considered as soft_rt for a + * very long time. + */ + + if (bfqq->wr_cur_max_time != + bfqq->bfqd->bfq_wr_rt_max_time) + bfqq->soft_rt_next_start = jiffies; + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -2686,10 +2749,16 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; + bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bic->saved_inject_limit = bfqq->inject_limit; + bic->saved_decrease_time_jif = bfqq->decrease_time_jif; + bic->saved_weight = bfqq->entity.orig_weight; bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_io_start_time = bfqq->io_start_time; + bic->saved_tot_idle_time = bfqq->tot_idle_time; bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && @@ -2712,6 +2781,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_wr_coeff = bfqq->wr_coeff; bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_service_from_wr = bfqq->service_from_wr; bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } @@ -2937,6 +3007,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, } bfqd->in_service_queue = bfqq; + bfqd->in_serv_last_pos = 0; } /* @@ -3442,20 +3513,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. + * + * However, as an additional mitigation for this problem, we preserve + * plugging for a special symmetric case that may suddenly turn into + * asymmetric: the case where only bfqq is busy. In this case, not + * expiring bfqq does not cause any harm to any other queues in terms + * of service guarantees. In contrast, it avoids the following unlucky + * sequence of events: (1) bfqq is expired, (2) a new queue with a + * lower weight than bfqq becomes busy (or more queues), (3) the new + * queue is served until a new request arrives for bfqq, (4) when bfqq + * is finally served, there are so many requests of the new queue in + * the drive that the pending requests for bfqq take a lot of time to + * be served. In particular, event (2) may case even already + * dispatched requests of bfqq to be delayed, inside the drive. So, to + * avoid this series of events, the scenario is preventively declared + * as asymmetric also if bfqq is the only busy queues */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + int tot_busy_queues = bfq_tot_busy_queues(bfqd); + /* No point in idling for bfqq if it won't get requests any longer */ if (unlikely(!bfqq_process_refs(bfqq))) return false; return (bfqq->wr_coeff > 1 && (bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd) || + tot_busy_queues || bfqd->rq_in_driver >= bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq); + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3939,10 +4028,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (reason == BFQQE_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -3952,30 +4037,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * If we get here, and there are no outstanding * requests, then the request pattern is isochronous * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. And we do it, unless bfqq is in - * interactive weight raising. We do not do it in the - * latter subcase, for the following reason. bfqq may - * be conveying the I/O needed to load a soft - * real-time application. Such an application will - * actually exhibit a soft real-time I/O pattern after - * it finally starts doing its job. But, if - * soft_rt_next_start is computed here for an - * interactive bfqq, and bfqq had received a lot of - * service before remaining with no outstanding - * request (likely to happen on a fast device), then - * soft_rt_next_start would be assigned such a high - * value that, for a very long time, bfqq would be - * prevented from being possibly considered as soft - * real time. + * bfq_bfqq_softrt_next_start()). Therefore we can + * compute soft_rt_next_start. * * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. */ - if (bfqq->dispatched == 0 && - bfqq->wr_coeff != bfqd->bfq_wr_coeff) + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4497,9 +4567,9 @@ check_queue: bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) bfqq = bfqq->bic->bfqq[0]; - else if (bfq_bfqq_has_waker(bfqq) && + else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->next_rq && + bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) @@ -4559,9 +4629,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) + bfq_wr_duration(bfqd))) { + /* + * Either in interactive weight + * raising, or in soft_rt weight + * raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ bfq_bfqq_end_wr(bfqq); - else { + } else { /* + * soft_rt finishing while still in + * interactive period, switch back to + * interactive weight raising + */ switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -4640,9 +4722,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -4892,7 +4971,6 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; - bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } @@ -5012,6 +5090,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", + bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } @@ -5049,6 +5129,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync) { + u64 now_ns = ktime_get_ns(); + RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5076,7 +5158,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = now_ns + 1; + + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -5194,11 +5278,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed; + /* + * We are really interested in how long it takes for the queue to + * become busy when there is no outstanding IO for this queue. So + * ignore cases when the bfq queue has already IO queued. + */ + if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) + return; + elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); @@ -5213,8 +5305,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) - bfq_bfqq_end_wr(bfqq); + BFQQ_TOTALLY_SEEKY(bfqq)) { + if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + /* + * In soft_rt weight raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ + bfq_bfqq_end_wr(bfqq); + } else { /* + * stopping soft_rt weight raising + * while still in interactive period, + * switch back to interactive weight + * raising + */ + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, @@ -5238,12 +5348,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to - * decide whether to mark as has_short_ttime + * bfqq. Otherwise check average think time to decide whether + * to mark as has_short_ttime. To this goal, compare average + * think time with half the I/O-plugging timeout. */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); @@ -5557,7 +5668,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bfq_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } } @@ -5925,7 +6035,6 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_finish_requeue_request_body(bfqq); - atomic_dec(&rq->mq_hctx->elevator_queued); spin_unlock_irqrestore(&bfqd->lock, flags); } else { @@ -6489,8 +6598,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 703895224562..b8e793c34ff1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -291,6 +291,11 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* when bfqq started to do I/O within the last observation window */ + u64 io_start_time; + /* how long bfqq has remained empty during the last observ. window */ + u64 tot_idle_time; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -371,6 +376,11 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; + /* pointer to the curr. tentative waker queue, see bfq_check_waker() */ + struct bfq_queue *tentative_waker_bfqq; + /* number of times the same tentative waker has been detected */ + unsigned int num_waker_detections; + /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -407,6 +417,9 @@ struct bfq_io_cq { */ bool saved_IO_bound; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Same purpose as the previous fields for the value of the * field keeping the queue's belonging to a large burst @@ -432,9 +445,15 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; + unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + + /* Save also injection state */ + u64 saved_last_serv_time_ns; + unsigned int saved_inject_limit; + unsigned long saved_decrease_time_jif; }; /** @@ -642,14 +661,6 @@ struct bfq_data { unsigned int bfq_timeout; /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - - /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput * issues. CAVEAT: this may even increase latencies, in case @@ -770,7 +781,6 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ - BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -790,7 +800,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 26776bdbdf36..070e34a7feb1 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -137,9 +137,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, sd->next_in_service = next_in_service; - if (!next_in_service) - return parent_sched_may_change; - return parent_sched_may_change; } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9ffd7e289554..dfa652122a2d 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,8 +14,6 @@ #include <linux/slab.h> #include "blk.h" -#define BIP_INLINE_VECS 4 - static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; @@ -30,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; + inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) @@ -72,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { - unsigned long idx = 0; - - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - &bs->bvec_integrity_pool); + bip->bip_max_vcnt = nr_vecs; + bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, + &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - bip->bip_max_vcnt = bvec_nr_vecs(idx); - bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; @@ -140,7 +135,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_disk->queue, + bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -162,7 +157,7 @@ EXPORT_SYMBOL(bio_integrity_add_page); static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -171,7 +166,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_disk->disk_name; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -208,8 +203,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct request_queue *q = bio->bi_disk->queue; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct request_queue *q = bio->bi_bdev->bd_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -329,7 +324,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced @@ -355,7 +350,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -381,7 +376,7 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -397,7 +392,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } @@ -470,6 +465,6 @@ void __init bio_integrity_init(void) bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } diff --git a/block/bio.c b/block/bio.c index 2f21d2958b60..a1c4d2900c7a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,27 +19,40 @@ #include <linux/highmem.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> +#include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +static struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; -#undef BV + +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... BIO_MAX_PAGES: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by @@ -58,178 +71,133 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - mutex_lock(&bio_slab_lock); + if (!bslab) + return NULL; - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + if (!bslab->slab) + goto fail_alloc_slab; - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } + bslab->slab_ref = 1; + bslab->slab_size = size; - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; + + kmem_cache_destroy(bslab->slab); - bslab = &bio_slabs[entry]; +fail_alloc_slab: + kfree(bslab); + return NULL; +} + +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + bs->back_pad; +} - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: + mutex_lock(&bio_slab_lock); + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - + bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} - -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; + BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_PAGES) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); +} - kmem_cache_free(bvs->slab, bv); - } +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +{ + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { - struct bio_vec *bvl; + struct biovec_slab *bvs = biovec_slab(*nr_vecs); - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: + if (WARN_ON_ONCE(!bvs)) return NULL; - } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + *nr_vecs = bvs->nr_vecs; - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + /* + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_PAGES entries. + */ + if (*nr_vecs < BIO_MAX_PAGES) { + struct bio_vec *bvl; - /* - * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; - } + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + return bvl; + *nr_vecs = BIO_MAX_PAGES; } - (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) @@ -255,7 +223,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -299,12 +267,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -405,122 +369,97 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; - if (nr_iovecs > inline_vecs) { - unsigned long idx = 0; - - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -529,6 +468,31 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; @@ -607,16 +571,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct block_device *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = bdev_nr_sectors(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -673,17 +628,18 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -730,7 +686,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -870,7 +826,7 @@ EXPORT_SYMBOL(bio_add_pc_page); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; bool same_page = false; if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) @@ -993,21 +949,18 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; - - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; - - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); + WARN_ON_ONCE(bio->bi_max_vecs); + + bio->bi_vcnt = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + bio_set_flag(bio, BIO_NO_PAGE_REF); + bio_set_flag(bio, BIO_CLONED); + + iov_iter_advance(iter, iter->count); return 0; } @@ -1070,7 +1023,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1121,41 +1074,40 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; + int ret = 0; - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + if (iov_iter_is_bvec(iter)) { + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return -EINVAL; + return bio_iov_bvec_set(bio, iter); + } do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } + else + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1178,7 +1130,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1455,8 +1408,8 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); /* * Need to have a real endio function for chained bios, otherwise @@ -1471,8 +1424,8 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1559,7 +1512,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1612,15 +1565,17 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1663,39 +1618,19 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static void __init biovec_init_slabs(void) +static int __init init_bio(void) { int i; - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; + bio_integrity_init(); - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + struct biovec_slab *bvs = bvec_slabs + i; - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4221a1539391..a317c03d40f6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -32,8 +32,6 @@ #include <linux/psi.h> #include "blk.h" -#define MAX_KEY_LEN 100 - /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire @@ -1765,12 +1763,15 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) if (unlikely(current->flags & PF_KTHREAD)) return; - if (!blk_get_queue(q)) - return; + if (current->throttle_queue != q) { + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + } - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); @@ -1808,7 +1809,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + blkg = blkg_lookup_create(css_to_blkcg(css), + bio->bi_bdev->bd_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1844,8 +1846,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_disk->queue->root_blkg; + blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-core.c b/block/blk-core.c index 7663a9b94b80..5e752840b41a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -476,7 +476,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; int ret; @@ -531,7 +531,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (q->id < 0) goto fail_q; - ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) goto fail_id; @@ -692,11 +692,9 @@ static inline bool should_fail_request(struct block_device *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct block_device *part) +static inline bool bio_check_ro(struct bio *bio) { - const int op = bio_op(bio); - - if (part->bd_read_only && op_is_write(op)) { + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +702,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->bd_partno); + bio_devname(bio, b), bio->bi_bdev->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,7 +712,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -725,8 +723,9 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ -static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +static inline int bio_check_eod(struct bio *bio) { + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && @@ -741,33 +740,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) /* * Remap block n of partition p to block n+start(p) of the disk. */ -static inline int blk_partition_remap(struct bio *bio) +static int blk_partition_remap(struct bio *bio) { - struct block_device *p; - int ret = -EIO; + struct block_device *p = bio->bi_bdev; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p)) - goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) - goto out; - if (unlikely(bio_check_ro(bio, p))) - goto out; - + return -EIO; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p))) - goto out; bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } - bio->bi_partno = 0; - ret = 0; -out: - rcu_read_unlock(); - return ret; + bio_set_flag(bio, BIO_REMAPPED); + return 0; } /* @@ -807,7 +793,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool submit_bio_checks(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev->bd_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -826,14 +813,12 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - - if (bio->bi_partno) { - if (unlikely(blk_partition_remap(bio))) - goto end_io; - } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio))) + goto end_io; + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) goto end_io; } @@ -926,7 +911,7 @@ end_io: static blk_qc_t __submit_bio(struct bio *bio) { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { @@ -968,7 +953,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct bio_list lower, same; if (unlikely(bio_queue_enter(bio) != 0)) @@ -989,7 +974,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) + if (q == bio->bi_bdev->bd_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -1014,7 +999,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; @@ -1057,7 +1042,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio) return BLK_QC_T_NONE; } - if (!bio->bi_disk->fops->submit_bio) + if (!bio->bi_bdev->bd_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } @@ -1069,7 +1054,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_disk and bi_partno fields. + * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -1089,7 +1074,8 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; + count = queue_logical_block_size( + bio->bi_bdev->bd_disk->queue) >> 9; else count = bio_sectors(bio); @@ -1313,7 +1299,11 @@ void blk_account_io_start(struct request *rq) if (!blk_do_io_stat(rq)) return; - rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else + rq->part = rq->rq_disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); @@ -1336,14 +1326,17 @@ static unsigned long __part_start_io_acct(struct block_device *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, - struct bio *bio) +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). + */ +unsigned long bio_start_io_acct(struct bio *bio) { - *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); - - return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); + return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio)); } -EXPORT_SYMBOL_GPL(part_start_io_acct); +EXPORT_SYMBOL_GPL(bio_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) @@ -1366,12 +1359,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct block_device *part, struct bio *bio, - unsigned long start_time) +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev) { - __part_end_io_acct(part, bio_op(bio), start_time); + __part_end_io_acct(orig_bdev, bio_op(bio), start_time); } -EXPORT_SYMBOL_GPL(part_end_io_acct); +EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c162b754efbd..e8327c50d7c9 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -164,10 +164,12 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) struct bio_vec bv; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 5da43f0973b4..09fcb18fa778 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, &bc_key->crypto_cfg)) return true; diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072..beae70a0e5e5 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -31,8 +31,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) } /** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -45,9 +44,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) * Note: * This function will invoke @done directly if the queue is dead. */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) +void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, + int at_head, rq_end_io_fn *done) { WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -67,7 +65,6 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); /** * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in * @bd_disk: matching gendisk * @rq: request to insert * @at_head: insert request at head or tail of queue @@ -76,14 +73,13 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) +void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); + blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; diff --git a/block/blk-flush.c b/block/blk-flush.c index 76c1624cb06c..7942ca6ed321 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,23 +432,18 @@ void blk_insert_flush(struct request *rq) /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +int blkdev_issue_flush(struct block_device *bdev) { - struct bio *bio; - int ret = 0; + struct bio bio; - bio = bio_alloc(gfp_mask, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; + bio_init(&bio, NULL, 0); + bio_set_dev(&bio, bdev); + bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-merge.c b/block/blk-merge.c index 808768f6b174..ffb4aa0ea68b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -298,14 +298,13 @@ split: * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is - * the responsibility of the caller to ensure that - * @bio->bi_disk->queue->bio_split is only released after processing of the - * split bio has finished. + * function may allocate a new bio from q->bio_split, it is the responsibility + * of the caller to ensure that q->bio_split is only released after processing + * of the split bio has finished. */ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_disk->queue; + struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -358,9 +357,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of - * the caller to ensure that @bio->bi_disk->queue->bio_split is only released - * after processing of the split bio has finished. + * a new bio from q->bio_split, it is the responsibility of the caller to ensure + * that q->bio_split is only released after processing of the split bio has + * finished. */ void blk_queue_split(struct bio **bio) { @@ -866,7 +865,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device */ - if (rq->rq_disk != bio->bi_disk) + if (rq->rq_disk != bio->bi_bdev->bd_disk) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq.c b/block/blk-mq.c index f285a9123a8b..f21d922ecfaf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1646,6 +1646,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/* + * Is the request queue handled by an IO scheduler that does not respect + * hardware queues when dispatching? + */ +static bool blk_mq_has_sqsched(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.dispatch_request && + !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) + return true; + return false; +} + +/* + * Return prefered queue to dispatch from (if any) for non-mq aware IO + * scheduler. + */ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + + /* + * If the IO scheduler does not respect hardware queues when + * dispatching, we just don't bother with multiple HW queues and + * dispatch from hctx for the current CPU since running multiple queues + * just causes lock contention inside the scheduler and pointless cache + * bouncing. + */ + hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, + raw_smp_processor_id()); + if (!blk_mq_hctx_stopped(hctx)) + return hctx; + return NULL; +} + /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. @@ -1653,14 +1689,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue); */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_run_hw_queue(hctx, async); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); @@ -1672,14 +1717,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx, *sq_hctx; int i; + sq_hctx = NULL; + if (blk_mq_has_sqsched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_delay_run_hw_queue(hctx, msecs); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); @@ -2128,7 +2182,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ blk_qc_t blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { @@ -2653,7 +2707,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_hctx; atomic_set(&hctx->nr_active, 0); - atomic_set(&hctx->elevator_queued, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; diff --git a/block/blk-settings.c b/block/blk-settings.c index 43990b1d148b..7dd8be314ac6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->zoned = BLK_ZONED_NONE; + lim->zone_write_granularity = 0; } EXPORT_SYMBOL(blk_set_default_limits); @@ -367,6 +368,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) EXPORT_SYMBOL(blk_queue_physical_block_size); /** + * blk_queue_zone_write_granularity - set zone write granularity for the queue + * @q: the request queue for the zoned device + * @size: the zone write granularity size, in bytes + * + * Description: + * This should be set to the lowest possible size allowing to write in + * sequential zones of a zoned block device. + */ +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size) +{ + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return; + + q->limits.zone_write_granularity = size; + + if (q->limits.zone_write_granularity < q->limits.logical_block_size) + q->limits.zone_write_granularity = q->limits.logical_block_size; +} +EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); + +/** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device * @offset: alignment offset in bytes @@ -631,6 +654,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + t->zone_write_granularity = max(t->zone_write_granularity, + b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); return ret; } @@ -847,6 +872,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); */ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { + struct request_queue *q = disk->queue; + switch (model) { case BLK_ZONED_HM: /* @@ -865,7 +892,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) + !xa_empty(&disk->part_tbl)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: @@ -875,7 +902,17 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) break; } - disk->queue->limits.zoned = model; + q->limits.zoned = model; + if (model != BLK_ZONED_NONE) { + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. + */ + blk_queue_zone_write_granularity(q, + queue_logical_block_size(q)); + } else { + blk_queue_clear_zone_settings(q); + } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b513f1683af0..ae39c7f3d83d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -219,6 +219,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_write_granularity_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_zone_write_granularity(q), page); +} + static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = q->limits.max_zone_append_sectors; @@ -585,6 +591,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); @@ -639,6 +646,7 @@ static struct attribute *queue_attrs[] = { &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, + &queue_zone_write_granularity_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d52cac9f3a7c..b1b22d863bdf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2178,7 +2178,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) bool blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0321ca83e73f..42aed0160f86 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -518,7 +518,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } -static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -545,7 +545,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; - } else if (wbt_should_throttle(rwb, bio)) { + } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7a68b6e4300c..833978c02e60 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -549,3 +549,20 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +void blk_queue_clear_zone_settings(struct request_queue *q) +{ + blk_mq_freeze_queue(q); + + blk_queue_free_zone_bitmaps(q); + blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); + q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; + q->nr_zones = 0; + q->max_open_zones = 0; + q->max_active_zones = 0; + q->limits.chunk_sectors = 0; + q->limits.zone_write_granularity = 0; + q->limits.max_zone_append_sectors = 0; + + blk_mq_unfreeze_queue(q); +} diff --git a/block/blk.h b/block/blk.h index 7550364c326c..3b53e44b967e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,11 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); +#define BIO_INLINE_VECS 4 +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask); +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { @@ -202,8 +207,6 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct block_device *__disk_get_part(struct gendisk *disk, int partno); - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, @@ -331,12 +334,12 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED void blk_queue_free_zone_bitmaps(struct request_queue *q); +void blk_queue_clear_zone_settings(struct request_queue *q); #else static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} +static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} #endif -struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); - int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); @@ -349,7 +352,6 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); -int disk_expand_part_tbl(struct gendisk *disk, int target); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/bounce.c b/block/bounce.c index d3f51acd6e3b..fc55314aa426 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -246,7 +246,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/bsg.c b/block/bsg.c index d7bae94b64d9..bd10922d5cbb 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -157,8 +157,10 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) return PTR_ERR(rq); ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); - if (ret) + if (ret) { + blk_put_request(rq); return ret; + } rq->timeout = msecs_to_jiffies(hdr.timeout); if (!rq->timeout) @@ -181,7 +183,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) bio = rq->bio; - blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); blk_rq_unmap_user(bio); diff --git a/block/genhd.c b/block/genhd.c index 9e741a4f351b..36ff45bbaaaf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -162,15 +162,6 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -struct block_device *__disk_get_part(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); - - if (unlikely(partno < 0 || partno >= ptbl->len)) - return NULL; - return rcu_dereference(ptbl->part[partno]); -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -185,26 +176,14 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags) { - struct disk_part_tbl *ptbl; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - piter->disk = disk; piter->part = NULL; - - if (flags & DISK_PITER_REVERSE) - piter->idx = ptbl->len - 1; - else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) + if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) piter->idx = 0; else piter->idx = 1; - piter->flags = flags; - - rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(disk_part_iter_init); /** * disk_part_iter_next - proceed iterator to the next partition and return it @@ -217,57 +196,30 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); */ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { - struct disk_part_tbl *ptbl; - int inc, end; + struct block_device *part; + unsigned long idx; /* put the last partition */ disk_part_iter_exit(piter); - /* get part_tbl */ rcu_read_lock(); - ptbl = rcu_dereference(piter->disk->part_tbl); - - /* determine iteration parameters */ - if (piter->flags & DISK_PITER_REVERSE) { - inc = -1; - if (piter->flags & (DISK_PITER_INCL_PART0 | - DISK_PITER_INCL_EMPTY_PART0)) - end = -1; - else - end = 0; - } else { - inc = 1; - end = ptbl->len; - } - - /* iterate to the next partition */ - for (; piter->idx != end; piter->idx += inc) { - struct block_device *part; - - part = rcu_dereference(ptbl->part[piter->idx]); - if (!part) - continue; - piter->part = bdgrab(part); - if (!piter->part) - continue; + xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) { if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) { - bdput(piter->part); - piter->part = NULL; + piter->idx == 0)) continue; - } - piter->idx += inc; + piter->part = bdgrab(part); + if (!piter->part) + continue; + piter->idx = idx + 1; break; } - rcu_read_unlock(); return piter->part; } -EXPORT_SYMBOL_GPL(disk_part_iter_next); /** * disk_part_iter_exit - finish up partition iteration @@ -284,91 +236,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter) bdput(piter->part); piter->part = NULL; } -EXPORT_SYMBOL_GPL(disk_part_iter_exit); - -static inline int sector_in_part(struct block_device *part, sector_t sector) -{ - return part->bd_start_sect <= sector && - sector < part->bd_start_sect + bdev_nr_sectors(part); -} - -/** - * disk_map_sector_rcu - map sector to partition - * @disk: gendisk of interest - * @sector: sector to map - * - * Find out which partition @sector maps to on @disk. This is - * primarily used for stats accounting. - * - * CONTEXT: - * RCU read locked. - * - * RETURNS: - * Found partition on success, part0 is returned if no partition matches - * or the matched partition is being deleted. - */ -struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) -{ - struct disk_part_tbl *ptbl; - struct block_device *part; - int i; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector)) - goto out_unlock; - - for (i = 1; i < ptbl->len; i++) { - part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { - rcu_assign_pointer(ptbl->last_lookup, part); - goto out_unlock; - } - } - - part = disk->part0; -out_unlock: - rcu_read_unlock(); - return part; -} - -/** - * disk_has_partitions - * @disk: gendisk of interest - * - * Walk through the partition table and check if valid partition exists. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * True if the gendisk has at least one valid non-zero size partition. - * Otherwise false. - */ -bool disk_has_partitions(struct gendisk *disk) -{ - struct disk_part_tbl *ptbl; - int i; - bool ret = false; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - /* Iterate partitions skipping the whole device at index 0 */ - for (i = 1; i < ptbl->len; i++) { - if (rcu_dereference(ptbl->part[i])) { - ret = true; - break; - } - } - - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL_GPL(disk_has_partitions); /* * Can be deleted altogether. Later. @@ -604,6 +471,18 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } +void disk_uevent(struct gendisk *disk, enum kobject_action action) +{ + struct disk_part_iter piter; + struct block_device *part; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(bdev_kobj(part), action); + disk_part_iter_exit(&piter); +} +EXPORT_SYMBOL_GPL(disk_uevent); + static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -621,8 +500,6 @@ static void register_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct block_device *part; int err; ddev->parent = parent; @@ -665,15 +542,9 @@ static void register_disk(struct device *parent, struct gendisk *disk, disk_scan_partitions(disk); - /* announce disk after possible partitions are created */ + /* announce the disk and partitions after all partitions are created */ dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_ADD); - disk_part_iter_exit(&piter); + disk_uevent(disk, KOBJ_ADD); if (disk->queue->backing_dev_info->dev) { err = sysfs_create_link(&ddev->kobj, @@ -829,8 +700,7 @@ void del_gendisk(struct gendisk *disk) down_write(&bdev_lookup_sem); /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); delete_partition(part); @@ -929,7 +799,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) struct block_device *bdev = NULL; rcu_read_lock(); - bdev = __disk_get_part(disk, partno); + bdev = xa_load(&disk->part_tbl, partno); if (bdev && !bdgrab(bdev)) bdev = NULL; rcu_read_unlock(); @@ -1320,83 +1190,6 @@ static const struct attribute_group *disk_attr_groups[] = { }; /** - * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way - * @disk: disk to replace part_tbl for - * @new_ptbl: new part_tbl to install - * - * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The - * original ptbl is freed using RCU callback. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - */ -static void disk_replace_part_tbl(struct gendisk *disk, - struct disk_part_tbl *new_ptbl) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(disk->part_tbl, new_ptbl); - - if (old_ptbl) { - rcu_assign_pointer(old_ptbl->last_lookup, NULL); - kfree_rcu(old_ptbl, rcu_head); - } -} - -/** - * disk_expand_part_tbl - expand disk->part_tbl - * @disk: disk to expand part_tbl for - * @partno: expand such that this partno can fit in - * - * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl - * uses RCU to allow unlocked dereferencing for stats and other stuff. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int disk_expand_part_tbl(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - struct disk_part_tbl *new_ptbl; - int len = old_ptbl ? old_ptbl->len : 0; - int i, target; - - /* - * check for int overflow, since we can get here from blkpg_ioctl() - * with a user passed 'partno'. - */ - target = partno + 1; - if (target < 0) - return -EINVAL; - - /* disk_max_parts() is zero during initialization, ignore if so */ - if (disk_max_parts(disk) && target > disk_max_parts(disk)) - return -EINVAL; - - if (target <= len) - return 0; - - new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, - disk->node_id); - if (!new_ptbl) - return -ENOMEM; - - new_ptbl->len = target; - - for (i = 0; i < len; i++) - rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); - - disk_replace_part_tbl(disk, new_ptbl); - return 0; -} - -/** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * @@ -1419,7 +1212,7 @@ static void disk_release(struct device *dev) blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); - disk_replace_part_tbl(disk, NULL); + xa_destroy(&disk->part_tbl); bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); @@ -1572,7 +1365,6 @@ dev_t blk_lookup_devt(const char *name, int partno) struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - struct disk_part_tbl *ptbl; if (minors > DISK_MAX_PARTS) { printk(KERN_ERR @@ -1590,11 +1382,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_free_disk; disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) - goto out_bdput; - - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0); + xa_init(&disk->part_tbl); + if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) + goto out_destroy_part_tbl; disk->minors = minors; rand_initialize_disk(disk); @@ -1603,7 +1393,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_bdput: +out_destroy_part_tbl: + xa_destroy(&disk->part_tbl); bdput(disk->part0); out_free_disk: kfree(disk); @@ -1638,31 +1429,32 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_disk_ro(struct gendisk *disk, int flag) +/** + * set_disk_ro - set a gendisk read-only + * @disk: gendisk to operate on + * @read_only: %true to set the disk read-only, %false set the disk read/write + * + * This function is used to indicate whether a given disk device should have its + * read-only flag set. set_disk_ro() is typically used by device drivers to + * indicate whether the underlying physical device is write-protected. + */ +void set_disk_ro(struct gendisk *disk, bool read_only) { - struct disk_part_iter piter; - struct block_device *part; - - if (disk->part0->bd_read_only != flag) { - set_disk_ro_uevent(disk, flag); - disk->part0->bd_read_only = flag; + if (read_only) { + if (test_and_set_bit(GD_READ_ONLY, &disk->state)) + return; + } else { + if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) + return; } - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - part->bd_read_only = flag; - disk_part_iter_exit(&piter); + set_disk_ro_uevent(disk, read_only); } - EXPORT_SYMBOL(set_disk_ro); int bdev_read_only(struct block_device *bdev) { - if (!bdev) - return 0; - return bdev->bd_read_only; + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } - EXPORT_SYMBOL(bdev_read_only); /* diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index dc89199bc8c6..c25c41d0d061 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -1029,6 +1029,7 @@ static struct elevator_type kyber_sched = { #endif .elevator_attrs = kyber_sched_attrs, .elevator_name = "kyber", + .elevator_features = ELEVATOR_F_MQ_AWARE, .elevator_owner = THIS_MODULE, }; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 800ac902809b..b57470e154c8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); - if (rq) - atomic_dec(&rq->mq_hctx->elevator_queued); return rq; } @@ -535,7 +533,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); - atomic_inc(&hctx->elevator_queued); } spin_unlock(&dd->lock); } @@ -582,9 +579,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - if (!atomic_read(&hctx->elevator_queued)) - return false; - return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); diff --git a/block/partitions/core.c b/block/partitions/core.c index 4601a845cd79..f3d9ff2cafb6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -197,7 +197,7 @@ static ssize_t part_start_show(struct device *dev, static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); + return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -289,13 +289,7 @@ struct device_type part_type = { */ void delete_partition(struct block_device *part) { - struct gendisk *disk = part->bd_disk; - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); - rcu_assign_pointer(ptbl->last_lookup, NULL); - + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -327,7 +321,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; - struct disk_part_tbl *ptbl; const char *dname; int err; @@ -343,18 +336,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); - disk->queue->limits.zoned = BLK_ZONED_NONE; + blk_queue_set_zoned(disk, BLK_ZONED_NONE); break; case BLK_ZONED_NONE: break; } - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - - if (ptbl->part[partno]) + if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); bdev = bdev_alloc(disk, partno); @@ -363,7 +351,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; @@ -408,8 +395,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); + if (err) + goto out_del; bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) @@ -615,7 +604,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state; - int ret = -EAGAIN, p, highest; + int ret = -EAGAIN, p; if (!disk_part_scan_enabled(disk)) return 0; @@ -663,15 +652,6 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* - * Detect the highest partition number and preallocate disk->part_tbl. - * This is an optimization and not strictly necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - disk_expand_part_tbl(disk, highest); - for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, bdev, state, p)) goto out_free_state; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index c9f009cc0446..6599bac0a78c 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, * (if he doesn't check that is his problem). * N.B. a non-zero SCSI status is _not_ necessarily an error. */ - blk_execute_rq(q, bd_disk, rq, at_head); + blk_execute_rq(bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; } - blk_execute_rq(q, disk, rq, 0); + blk_execute_rq(disk, rq, 0); err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { @@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; scsi_req(rq)->cmd_len = 6; - blk_execute_rq(q, bd_disk, rq, 0); + blk_execute_rq(bd_disk, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c43a6ab4b1f3..18bf99906662 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -284,15 +284,11 @@ out: static blk_qc_t brd_submit_bio(struct bio *bio) { - struct brd_device *brd = bio->bi_disk->private_data; + struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; + sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; - sector_t sector; struct bvec_iter iter; - sector = bio->bi_iter.bi_sector; - if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) - goto io_error; - bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; int err; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 7227fc7ab8ed..72cf7603d51f 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -138,7 +138,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_FUA | REQ_PREFLUSH; op_flags |= REQ_SYNC; - bio = bio_alloc_drbd(GFP_NOIO); + bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); bio_set_dev(bio, bdev->md_bdev); bio->bi_iter.bi_sector = sector; err = -EIO; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index df53dca5d02c..c1f816f896a8 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -976,7 +976,7 @@ static void drbd_bm_endio(struct bio *bio) static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { - struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, &drbd_md_io_bio_set); struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; struct page *page; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8f879e5c2f67..02db50d7e4c6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1422,8 +1422,6 @@ extern mempool_t drbd_md_io_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ extern struct bio_set drbd_md_io_bio_set; -/* to allocate from that set */ -extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); /* And a bio_set for cloning */ extern struct bio_set drbd_io_bio_set; @@ -1579,8 +1577,8 @@ static inline void drbd_submit_bio_noacct(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); - if (!bio->bi_disk) { - drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n"); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1c8c18b2a25f..788dd97e6026 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -138,19 +138,6 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -struct bio *bio_alloc_drbd(gfp_t gfp_mask) -{ - struct bio *bio; - - if (!bioset_initialized(&drbd_md_io_bio_set)) - return bio_alloc(gfp_mask, 1); - - bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set); - if (!bio) - return NULL; - return bio; -} - #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will give tons of false positives. When this is a real functions sparse works. diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 330f851cb8f0..9dbb660a7d7c 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -30,7 +30,10 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio return NULL; memset(req, 0, sizeof(*req)); - drbd_req_make_private_bio(req, bio_src); + req->private_bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; + req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) @@ -1595,7 +1598,7 @@ void do_submit(struct work_struct *ws) blk_qc_t drbd_submit_bio(struct bio *bio) { - struct drbd_device *device = bio->bi_disk->private_data; + struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; unsigned long start_jif; blk_queue_split(&bio); diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 55bb0f8721fa..511f39a08de4 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -256,18 +256,6 @@ enum drbd_req_state_bits { #define MR_WRITE 1 #define MR_READ 2 -static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) -{ - struct bio *bio; - bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set); - - req->private_bio = bio; - - bio->bi_private = req; - bio->bi_end_io = drbd_request_endio; - bio->bi_next = NULL; -} - /* Short lived temporary struct on the stack. * We could squirrel the error to be returned into * bio->bi_iter.bi_size, or similar. But that would be too ugly. */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 02044ab7f767..64563bfdf0da 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1523,8 +1523,11 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) drbd_al_begin_io(device, &req->i); - drbd_req_make_private_bio(req, req->master_bio); + req->private_bio = bio_clone_fast(req->master_bio, GFP_NOIO, + &drbd_io_bio_set); bio_set_dev(req->private_bio, device->ldev->backing_bdev); + req->private_bio->bi_private = req; + req->private_bio->bi_end_io = drbd_request_endio; submit_bio_noacct(req->private_bio); return 0; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 53ac59d19ae5..3fd99836bb1c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, rq->timeout = timeout; /* insert request and run queue */ - blk_execute_rq(rq->q, NULL, rq, true); + blk_execute_rq(NULL, rq, true); if (int_cmd->status) { dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5357c3a4a36f..d6c821d48090 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1420,7 +1420,7 @@ static blk_qc_t null_submit_bio(struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector; sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; + struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); struct nullb_cmd *cmd; diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index fce0a54df0e5..bfcab1c782b5 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -148,10 +148,6 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) sector += dev->zone_size_sects; } - q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - return 0; } @@ -160,6 +156,10 @@ int null_register_zoned_dev(struct nullb *nullb) struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; + blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (queue_is_mq(q)) { int ret = blk_revalidate_disk_zones(nullb->disk, NULL); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index a7af4f27b7c3..897acda20ac8 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk, req = blk_mq_rq_to_pdu(rq); req->func = func; - blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); + blk_execute_rq(disk->gd, rq, 0); blk_put_request(rq); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b8bb8ec7538d..fc4b0f1aa86d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * if (cgc->quiet) rq->rq_flags |= RQF_QUIET; - blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); + blk_execute_rq(pd->bdev->bd_disk, rq, 0); if (scsi_req(rq)->result) ret = -EIO; out: @@ -2374,7 +2374,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) blk_queue_split(&bio); - pd = bio->bi_disk->queue->queuedata; + pd = bio->bi_bdev->bd_disk->queue->queuedata; if (!pd) { pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; @@ -2418,7 +2418,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) split = bio; } - pkt_make_request_write(bio->bi_disk->queue, split); + pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); } while (split != bio); return BLK_QC_T_NONE; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index b71d28372ef3..1d738999fb69 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -581,7 +581,7 @@ out: static blk_qc_t ps3vram_submit_bio(struct bio *bio) { - struct ps3_system_bus_device *dev = bio->bi_disk->private_data; + struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); int busy; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 59cfe71d0b3a..bbb88eb009e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -692,29 +692,10 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) put_device(&rbd_dev->dev); } -static int rbd_set_read_only(struct block_device *bdev, bool ro) -{ - struct rbd_device *rbd_dev = bdev->bd_disk->private_data; - - /* - * Both images mapped read-only and snapshots can't be marked - * read-write. - */ - if (!ro) { - if (rbd_is_ro(rbd_dev)) - return -EROFS; - - rbd_assert(!rbd_is_snap(rbd_dev)); - } - - return 0; -} - static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, - .set_read_only = rbd_set_read_only, }; /* diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index edacefff6e35..9a28322a8cd8 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -122,7 +122,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, static blk_qc_t rsxx_submit_bio(struct bio *bio) { - struct rsxx_cardinfo *card = bio->bi_disk->private_data; + struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; struct rsxx_bio_meta *bio_meta; blk_status_t st = BLK_STS_IOERR; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 4478eb7efee0..2cdf2771f8e8 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); + blk_execute_rq_nowait(NULL, rq, true, NULL); return 0; @@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag); - blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL); + blk_execute_rq_nowait(NULL, rq, true, NULL); return 0; } diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 2b95d7b33b91..982732dbe82e 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -521,7 +521,7 @@ static int mm_check_plugged(struct cardinfo *card) static blk_qc_t mm_submit_bio(struct bio *bio) { - struct cardinfo *card = bio->bi_disk->private_data; + struct cardinfo *card = bio->bi_bdev->bd_disk->private_data; pr_debug("mm_make_request %llu %u\n", (unsigned long long)bio->bi_iter.bi_sector, diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 145606dc52db..b0285db7cf4f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -320,7 +320,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + blk_execute_rq(vblk->disk, req, false); err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_put_request(req); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e2933cb7a82a..d6243dbc53cc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1596,7 +1596,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) */ static blk_qc_t zram_submit_bio(struct bio *bio) { - struct zram *zram = bio->bi_disk->private_data; + struct zram *zram = bio->bi_bdev->bd_disk->private_data; if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 8f0e52a71493..90ad34c6ef8e 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - blk_execute_rq(q, cdi->disk, rq, 0); + blk_execute_rq(cdi->disk, rq, 0); if (scsi_req(rq)->result) { struct scsi_sense_hdr sshdr; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 013ad33fbbc8..a1ce9f5ac3aa 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, memcpy(scsi_req(rq)->cmd, pc->c, 12); if (drive->media == ide_tape) scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1; - blk_execute_rq(drive->queue, disk, rq, 0); + blk_execute_rq(disk, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 25d2d88e82ad..cffbcc27a34c 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } } - blk_execute_rq(drive->queue, info->disk, rq, 0); + blk_execute_rq(info->disk, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; if (buffer) diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 46f2df288c6a..011eab9c69b7 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; - blk_execute_rq(drive->queue, cd->disk, rq, 0); + blk_execute_rq(cd->disk, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); /* diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index f2f93ed40356..ca1d4b3d3878 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, *(int *)&scsi_req(rq)->cmd[1] = arg; ide_req(rq)->special = setting->set; - blk_execute_rq(q, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); ret = scsi_req(rq)->result; blk_put_request(rq); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 34b9441084f8..8413731c6259 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg) drive->mult_req = arg; drive->special_flags |= IDE_SFLAG_SET_MULTMODE; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); blk_put_request(rq); return (drive->mult_count == arg) ? 0 : -EIO; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 58994da10c06..43fbc37d85c3 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); ide_req(rq)->type = ATA_PRIV_TASKFILE; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive) ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; - blk_execute_rq(drive->queue, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); ret = scsi_req(rq)->result; blk_put_request(rq); return ret; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 8af7af6001eb..a80a0f28f7b9 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; ide_req(rq)->special = &timeout; - blk_execute_rq(q, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); if (rc) diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 82ab308f1aaf..d680b3e3295f 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) mesg.event = PM_EVENT_FREEZE; rqpm.pm_state = mesg.event; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); @@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq) blk_mq_end_request(rq, BLK_STS_OK); return -ENXIO; } - blk_execute_rq(q, NULL, rq, true); + blk_execute_rq(NULL, rq, true); return scsi_req(rq)->result ? -EIO : 0; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 88b96437b22e..fa05e7e7d609 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) goto out_put; } - blk_execute_rq(drive->queue, tape->disk, rq, 0); + blk_execute_rq(tape->disk, rq, 0); /* calculate the number of transferred bytes and update buffer state */ size -= scsi_req(rq)->resid_len; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index d016cbe68cba..6665fc4724b9 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, ide_req(rq)->special = cmd; cmd->rq = rq; - blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b6246f73895c..5924f09c217b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -49,7 +49,7 @@ struct bio_set pblk_bio_set; static blk_qc_t pblk_submit_bio(struct bio *bio) { - struct pblk *pblk = bio->bi_disk->queue->queuedata; + struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; if (bio_op(bio) == REQ_OP_DISCARD) { pblk_discard(pblk, bio); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b00fd08d696b..63e809f38e3f 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; - check->bi_disk = bio->bi_disk; + bio_set_dev(check, bio->bi_bdev); check->bi_opf = REQ_OP_READ; check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 85b1f2a9b72d..29c231758293 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct block_device *part; + struct block_device *orig_bdev; unsigned long start_time; struct btree_op op; @@ -670,8 +670,8 @@ static void bio_complete(struct search *s) { if (s->orig_bio) { /* Count on bcache device */ - part_end_io_acct(s->part, s->orig_bio, s->start_time); - + bio_end_io_acct_remapped(s->orig_bio, s->start_time, + s->orig_bdev); trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -714,7 +714,8 @@ static void search_free(struct closure *cl) } static inline struct search *search_alloc(struct bio *bio, - struct bcache_device *d) + struct bcache_device *d, struct block_device *orig_bdev, + unsigned long start_time) { struct search *s; @@ -732,7 +733,8 @@ static inline struct search *search_alloc(struct bio *bio, s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; /* Count on the bcache device */ - s->start_time = part_start_io_acct(d->disk, &s->part, bio); + s->orig_bdev = orig_bdev; + s->start_time = start_time; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -894,7 +896,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - get_capacity(bio->bi_disk) - bio_end_sector(bio)); + get_capacity(bio->bi_bdev->bd_disk) - + bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -1073,7 +1076,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct block_device *part; + struct block_device *orig_bdev; }; static void detached_dev_end_io(struct bio *bio) @@ -1085,7 +1088,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_private = ddip->bi_private; /* Count on the bcache device */ - part_end_io_acct(ddip->part, bio, ddip->start_time); + bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1098,7 +1101,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io(bio); } -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, + struct block_device *orig_bdev, unsigned long start_time) { struct detached_dev_io_private *ddip; struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1111,7 +1115,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ddip->d = d; /* Count on the bcache device */ - ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio); + ddip->orig_bdev = orig_bdev; + ddip->start_time = start_time; ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; @@ -1167,8 +1172,10 @@ static void quit_max_writeback_rate(struct cache_set *c, blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_disk->private_data; + struct block_device *orig_bdev = bio->bi_bdev; + struct bcache_device *d = orig_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); + unsigned long start_time; int rw = bio_data_dir(bio); if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || @@ -1193,11 +1200,13 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } + start_time = bio_start_io_acct(bio); + bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { - s = search_alloc(bio, d); + s = search_alloc(bio, d, orig_bdev, start_time); trace_bcache_request_start(s->d, bio); if (!bio->bi_iter.bi_size) { @@ -1218,7 +1227,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) } } else /* I/O request sent to backing device */ - detached_dev_do_request(d, bio); + detached_dev_do_request(d, bio, orig_bdev, start_time); return BLK_QC_T_NONE; } @@ -1274,7 +1283,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; @@ -1282,7 +1291,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - s = search_alloc(bio, d); + s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio)); cl = &s->cl; bio = &s->bio.bio; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2047a9cccdb5..193fe7652329 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1939,7 +1939,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) goto err; if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + BIOSET_NEED_RESCUER)) goto err; c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb); diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 2ea0360108e1..a3b71350eec8 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,8 +18,7 @@ */ struct dm_bio_details { - struct gendisk *bi_disk; - u8 bi_partno; + struct block_device *bi_bdev; int __bi_remaining; unsigned long bi_flags; struct bvec_iter bi_iter; @@ -31,8 +30,7 @@ struct dm_bio_details { static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_disk = bio->bi_disk; - bd->bi_partno = bio->bi_partno; + bd->bi_bdev = bio->bi_bdev; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; bd->__bi_remaining = atomic_read(&bio->__bi_remaining); @@ -44,8 +42,7 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_disk = bd->bi_disk; - bio->bi_partno = bd->bi_partno; + bio->bi_bdev = bd->bi_bdev; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; atomic_set(&bio->__bi_remaining, bd->__bi_remaining); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index af6d4f898e4c..89a73204dbf4 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -449,7 +449,7 @@ static int __check_incompat_features(struct cache_disk_superblock *disk_super, /* * Check for read-only metadata to skip the following RDWR checks. */ - if (get_disk_ro(cmd->bdev->bd_disk)) + if (bdev_read_only(cmd->bdev)) return 0; features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index bdb255edc200..a90bdf9b2ca6 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -85,12 +85,6 @@ struct clone { struct dm_clone_metadata *cmd; - /* - * bio used to flush the destination device, before committing the - * metadata. - */ - struct bio flush_bio; - /* Region hydration hash table */ struct hash_table_bucket *ht; @@ -1155,11 +1149,7 @@ static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) goto out; } - bio_reset(&clone->flush_bio); - bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev); - clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - r = submit_bio_wait(&clone->flush_bio); + r = blkdev_issue_flush(clone->dest_dev->bdev); if (unlikely(r)) { __metadata_operation_failed(clone, "flush destination device", r); goto out; @@ -1886,7 +1876,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) bio_list_init(&clone->deferred_flush_completions); clone->hydration_offset = 0; atomic_set(&clone->hydrations_in_flight, 0); - bio_init(&clone->flush_bio, NULL, 0); clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!clone->wq) { @@ -1958,7 +1947,6 @@ static void clone_dtr(struct dm_target *ti) struct clone *clone = ti->private; mutex_destroy(&clone->commit_lock); - bio_uninit(&clone->flush_bio); for (i = 0; i < clone->nr_ctr_args; i++) kfree(clone->ctr_args[i]); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index fa09bc4e4c54..b0a82f29a2e4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_disk == NULL, details were not saved */ + /* if details->bi_bdev == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -1190,7 +1190,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1257,7 +1257,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_disk) { + if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1282,7 +1282,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1292,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 6ebb2127f3e2..e75b20480e46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -636,7 +636,7 @@ static int __check_incompat_features(struct thin_disk_superblock *disk_super, /* * Check for read-only metadata to skip the following RDWR checks. */ - if (get_disk_ro(pmd->bdev->bd_disk)) + if (bdev_read_only(pmd->bdev)) return 0; features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index b298fefb022e..039d17b28938 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); return ret; } @@ -862,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); return ret; } @@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev); goto err; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bac564f3faa..479ec5bea09e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -977,16 +977,17 @@ static void clone_endio(struct bio *bio) struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; struct bio *orig_bio = io->orig_bio; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && - !bio->bi_disk->queue->limits.max_discard_sectors) + !q->limits.max_discard_sectors) disable_discard(md); else if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !q->limits.max_write_same_sectors) disable_write_same(md); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !q->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -996,7 +997,7 @@ static void clone_endio(struct bio *bio) */ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { sector_t written_sector = bio->bi_iter.bi_sector; - struct request_queue *q = orig_bio->bi_disk->queue; + struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue; u64 mask = (u64)blk_queue_zone_sectors(q) - 1; orig_bio->bi_iter.bi_sector += written_sector & mask; @@ -1422,8 +1423,7 @@ static int __send_empty_flush(struct clone_info *ci) */ bio_init(&flush_bio, NULL, 0); flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - flush_bio.bi_disk = ci->io->md->disk; - bio_associate_blkg(&flush_bio); + bio_set_dev(&flush_bio, ci->io->md->disk->part0); ci->bio = &flush_bio; ci->sector_count = 0; @@ -1626,7 +1626,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, static blk_qc_t dm_submit_bio(struct bio *bio) { - struct mapped_device *md = bio->bi_disk->private_data; + struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 68cac7d19278..63ed8329a98d 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -252,7 +252,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) { + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { diff --git a/drivers/md/md.c b/drivers/md/md.c index 04384452a7ab..21da0c48f6c2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -340,24 +340,6 @@ static int start_readonly; */ static bool create_on_open = true; -struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev) -{ - if (!mddev || !bioset_initialized(&mddev->bio_set)) - return bio_alloc(gfp_mask, nr_iovecs); - - return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_alloc_mddev); - -static struct bio *md_bio_alloc_sync(struct mddev *mddev) -{ - if (!mddev || !bioset_initialized(&mddev->sync_set)) - return bio_alloc(GFP_NOIO, 1); - - return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); -} - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -463,8 +445,8 @@ struct md_io { struct mddev *mddev; bio_end_io_t *orig_bi_end_io; void *orig_bi_private; + struct block_device *orig_bi_bdev; unsigned long start_time; - struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -472,7 +454,7 @@ static void md_end_io(struct bio *bio) struct md_io *md_io = bio->bi_private; struct mddev *mddev = md_io->mddev; - part_end_io_acct(md_io->part, bio, md_io->start_time); + bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev); bio->bi_end_io = md_io->orig_bi_end_io; bio->bi_private = md_io->orig_bi_private; @@ -486,7 +468,7 @@ static void md_end_io(struct bio *bio) static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - struct mddev *mddev = bio->bi_disk->private_data; + struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); @@ -514,12 +496,12 @@ static blk_qc_t md_submit_bio(struct bio *bio) md_io->mddev = mddev; md_io->orig_bi_end_io = bio->bi_end_io; md_io->orig_bi_private = bio->bi_private; + md_io->orig_bi_bdev = bio->bi_bdev; bio->bi_end_io = md_end_io; bio->bi_private = md_io; - md_io->start_time = part_start_io_acct(mddev->gendisk, - &md_io->part, bio); + md_io->start_time = bio_start_io_acct(bio); } /* bio could be mergeable after passing to underlayer */ @@ -613,7 +595,7 @@ static void submit_flushes(struct work_struct *ws) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bi = bio_alloc_bioset(GFP_NOIO, 0, &mddev->bio_set); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bio_set_dev(bi, rdev->bdev); @@ -999,7 +981,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = md_bio_alloc_sync(mddev); + bio = bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); atomic_inc(&rdev->nr_pending); @@ -1031,29 +1013,29 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = md_bio_alloc_sync(rdev->mddev); - int ret; + struct bio bio; + struct bio_vec bvec; + + bio_init(&bio, &bvec, 1); if (metadata_op && rdev->meta_bdev) - bio_set_dev(bio, rdev->meta_bdev); + bio_set_dev(&bio, rdev->meta_bdev); else - bio_set_dev(bio, rdev->bdev); - bio_set_op_attrs(bio, op, op_flags); + bio_set_dev(&bio, rdev->bdev); + bio.bi_opf = op | op_flags; if (metadata_op) - bio->bi_iter.bi_sector = sector + rdev->sb_start; + bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) - bio->bi_iter.bi_sector = sector + rdev->new_data_offset; + bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else - bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio_add_page(bio, page, size, 0); + bio.bi_iter.bi_sector = sector + rdev->data_offset; + bio_add_page(&bio, page, size, 0); - submit_bio_wait(bio); + submit_bio_wait(&bio); - ret = !bio->bi_status; - bio_put(bio); - return ret; + return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); @@ -2417,6 +2399,12 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_add_rdev); +static bool rdev_read_only(struct md_rdev *rdev) +{ + return bdev_read_only(rdev->bdev) || + (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); +} + static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; @@ -2426,8 +2414,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && - mddev->pers) + if (rdev_read_only(rdev) && mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ @@ -5861,9 +5848,7 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); - if (mddev->ro != 1 && - (bdev_read_only(rdev->bdev) || - bdev_read_only(rdev->meta_bdev))) { + if (mddev->ro != 1 && rdev_read_only(rdev)) { mddev->ro = 1; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); @@ -6158,7 +6143,7 @@ static int restart_array(struct mddev *mddev) if (test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) has_journal = true; - if (bdev_read_only(rdev->bdev)) + if (rdev_read_only(rdev)) has_readonly = true; } rcu_read_unlock(); diff --git a/drivers/md/md.h b/drivers/md/md.h index 34070ab30a8a..bcbba1b5ec4a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -556,7 +556,7 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bio->bi_disk->sync_io); + md_sync_acct(bio->bi_bdev, nr_sectors); } struct md_personality @@ -742,8 +742,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, - struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); @@ -793,14 +791,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c0347997f6ff..d2378765dc15 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,13 +794,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_disk; + struct md_rdev *rdev = (void *)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1104,7 +1104,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); + behind_bio = bio_alloc_bioset(GFP_NOIO, vcnt, &r1_bio->mddev->bio_set); if (!behind_bio) return; @@ -1520,7 +1520,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)conf->mirrors[i].rdev; + mbio->bi_bdev = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c5d88ef6a45c..a9ae7d113492 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -882,13 +882,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1075,13 +1075,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1253,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)rdev; + mbio->bi_bdev = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -3003,7 +3003,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_disk set, + * have bi_end_io, bi_sector, bi_bdev set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. @@ -4531,7 +4531,7 @@ read_more: return sectors_done; } - read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); + read_bio = bio_alloc_bioset(GFP_KERNEL, RESYNC_PAGES, &mddev->bio_set); bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr @@ -4539,10 +4539,6 @@ read_more: read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); - read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - read_bio->bi_status = 0; - read_bio->bi_vcnt = 0; - read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d0f540296fe9..e8c118e05dfd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, } /* flush the disk cache after recovery if necessary */ - ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); + ret = blkdev_issue_flush(rdev->bdev); out: __free_page(page); return ret; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3a90cc0e43ca..a348b2adf2a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5310,7 +5310,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - WARN_ON_ONCE(bio->bi_partno); + WARN_ON_ONCE(bio->bi_bdev->bd_partno); chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= @@ -5393,90 +5393,72 @@ static void raid5_align_endio(struct bio *bi) static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) { struct r5conf *conf = mddev->private; - int dd_idx; - struct bio* align_bi; + struct bio *align_bio; struct md_rdev *rdev; - sector_t end_sector; + sector_t sector, end_sector, first_bad; + int bad_sectors, dd_idx; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); return 0; } - /* - * use bio_clone_fast to make a copy of the bio - */ - align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); - if (!align_bi) - return 0; - /* - * set bi_end_io to a new function, and set bi_private to the - * original bio. - */ - align_bi->bi_end_io = raid5_align_endio; - align_bi->bi_private = raid_bio; - /* - * compute position - */ - align_bi->bi_iter.bi_sector = - raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, - 0, &dd_idx, NULL); - end_sector = bio_end_sector(align_bi); + sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, + &dd_idx, NULL); + end_sector = bio_end_sector(raid_bio); + rcu_read_lock(); + if (r5c_big_stripe_cached(conf, sector)) + goto out_rcu_unlock; + rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && - (test_bit(Faulty, &rdev->flags) || + if (!rdev) + goto out_rcu_unlock; + if (test_bit(Faulty, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) || - rdev->recovery_offset >= end_sector))) - rdev = NULL; + rdev->recovery_offset >= end_sector)) + goto out_rcu_unlock; } - if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { - rcu_read_unlock(); - bio_put(align_bi); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); + bio_set_dev(align_bio, rdev->bdev); + align_bio->bi_end_io = raid5_align_endio; + align_bio->bi_private = raid_bio; + align_bio->bi_iter.bi_sector = sector; + + raid_bio->bi_next = (void *)rdev; + + if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, + &bad_sectors)) { + bio_put(align_bio); + rdev_dec_pending(rdev, mddev); return 0; } - if (rdev) { - sector_t first_bad; - int bad_sectors; - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - raid_bio->bi_next = (void*)rdev; - bio_set_dev(align_bi, rdev->bdev); - - if (is_badblock(rdev, align_bi->bi_iter.bi_sector, - bio_sectors(align_bi), - &first_bad, &bad_sectors)) { - bio_put(align_bi); - rdev_dec_pending(rdev, mddev); - return 0; - } + /* No reshape active, so we can trust rdev->data_offset */ + align_bio->bi_iter.bi_sector += rdev->data_offset; - /* No reshape active, so we can trust rdev->data_offset */ - align_bi->bi_iter.bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, - conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + if (mddev->gendisk) + trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); + submit_bio_noacct(align_bio); + return 1; - if (mddev->gendisk) - trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); - submit_bio_noacct(align_bi); - return 1; - } else { - rcu_read_unlock(); - bio_put(align_bi); - return 0; - } +out_rcu_unlock: + rcu_read_unlock(); + return 0; } static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 42e27a298218..a1d6b68320ae 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev, goto out_put; } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); @@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idatas; req_to_mmc_queue_req(req)->ioc_count = 1; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata); blk_put_request(req); @@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md, rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL; req_to_mmc_queue_req(req)->drv_op_data = idata; req_to_mmc_queue_req(req)->ioc_count = num_of_cmds; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ioc_err = req_to_mmc_queue_req(req)->drv_op_result; /* copy to user if data and response */ @@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) if (IS_ERR(req)) return PTR_ERR(req); req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); ret = req_to_mmc_queue_req(req)->drv_op_result; if (ret >= 0) { *val = ret; @@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp) } req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD; req_to_mmc_queue_req(req)->drv_op_data = &ext_csd; - blk_execute_rq(mq->queue, NULL, req, 0); + blk_execute_rq(NULL, req, 0); err = req_to_mmc_queue_req(req)->drv_op_result; blk_put_request(req); if (err) { diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 22e5617b2cea..e03a1f38d750 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -165,7 +165,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, static blk_qc_t nd_blk_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip; - struct nd_namespace_blk *nsblk = bio->bi_disk->private_data; + struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -177,7 +177,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio) bip = bio_integrity(bio); rw = bio_data_dir(bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 12ff6f8784ac..41aa1f01fc07 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1442,7 +1442,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, static blk_qc_t btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct btt *btt = bio->bi_disk->private_data; + struct btt *btt = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -1452,7 +1452,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f33bdae626ba..281fedb4dc4d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -196,13 +196,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio) unsigned long start; struct bio_vec bvec; struct bvec_iter iter; - struct pmem_device *pmem = bio->bi_disk->private_data; + struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data; struct nd_region *nd_region = to_region(pmem); if (bio->bi_opf & REQ_PREFLUSH) ret = nvdimm_flush(nd_region, bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f13eb4ded95f..89cacff897a8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q, rq->cmd_flags |= REQ_HIPRI; rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); + blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq); while (!completion_done(&wait)) { blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); @@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (poll) nvme_execute_rq_polled(req->q, NULL, req, at_head); else - blk_execute_rq(req->q, NULL, req, at_head); + blk_execute_rq(NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq) u32 effects; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(rq->q, disk, rq, 0); + blk_execute_rq(disk, rq, 0); nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); @@ -1113,7 +1113,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; + struct block_device *bdev = ns ? ns->disk->part0 : NULL; struct request *req; struct bio *bio = NULL; void *meta = NULL; @@ -1133,8 +1133,9 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - bio->bi_disk = disk; - if (disk && meta_buffer && meta_len) { + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); if (IS_ERR(meta)) { @@ -1202,7 +1203,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; - blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); return 0; } @@ -2125,9 +2126,8 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if ((id->nsattr & NVME_NS_ATTR_RO) || - test_bit(NVME_NS_FORCE_RO, &ns->flags)) - set_disk_ro(disk, true); + set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)); } static inline bool nvme_first_scan(struct gendisk *disk) @@ -2176,17 +2176,18 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); + ret = nvme_configure_metadata(ns, id); + if (ret) + goto out_unfreeze; + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); + if (ns->head->ids.csi == NVME_CSI_ZNS) { ret = nvme_update_zone_info(ns, lbaf); if (ret) goto out_unfreeze; } - ret = nvme_configure_metadata(ns, id); - if (ret) - goto out_unfreeze; - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->disk, ns, id); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 470cef3abec3..b705988629f2 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, rq->end_io_data = rqd; - blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); + blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); return 0; @@ -757,7 +757,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write((struct nvme_command *)vcmd); struct nvm_dev *dev = ns->ndev; - struct gendisk *disk = ns->disk; struct request *rq; struct bio *bio = NULL; __le64 *ppa_list = NULL; @@ -817,10 +816,10 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - bio->bi_disk = disk; + bio_set_dev(bio, ns->disk->part0); } - blk_execute_rq(q, NULL, rq, 0); + blk_execute_rq(NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 282b7a4ea9a9..1427c9555cef 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -296,7 +296,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { - struct nvme_ns_head *head = bio->bi_disk->private_data; + struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; blk_qc_t ret = BLK_QC_T_NONE; @@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (likely(ns)) { - bio->bi_disk = ns->disk; + bio_set_dev(bio, ns->disk->part0); bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); @@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work) * Reset disk to the mpath node and resubmit to select a new * path. */ - bio->bi_disk = head->disk; + bio_set_dev(bio, head->disk->part0); submit_bio_noacct(bio); } } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6bad4d4dcdf0..808ab5186928 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } abort_req->end_io_data = NULL; - blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); /* * The aborted req will be completed on receiving the abort req. @@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(q, NULL, req, false, + blk_execute_rq_nowait(NULL, req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b7ce4f221d99..f5ef3edeb2fd 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1468,7 +1468,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, if (unlikely(nr)) goto mr_put; - nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c, + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c, req->mr->sig_attrs, ns->pi_type); nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 1dfe9a3500e3..c7e3ec561ba0 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -9,13 +9,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns) { - struct request_queue *q = ns->queue; - int ret; - - ret = blk_revalidate_disk_zones(ns->disk, NULL); - if (!ret) - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); - return ret; + return blk_revalidate_disk_zones(ns->disk, NULL); } static int nvme_set_max_append(struct nvme_ctrl *ctrl) @@ -109,10 +103,11 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) goto free_data; } - q->limits.zoned = BLK_ZONED_HM; + blk_queue_set_zoned(ns->disk, BLK_ZONED_HM); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); free_data: kfree(id); return status; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 125dde3f410e..bf6e0ac9ad28 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { - if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) + if (blkdev_issue_flush(req->ns->bdev)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b9776fc8f08f..cbc88acdd233 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) schedule_work(&req->p.work); } else { rq->end_io_data = req; - blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, + blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0, nvmet_passthru_req_done); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index c7eb9a10c680..28c04a4efa66 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -428,23 +428,15 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device) static int dasd_state_ready_to_online(struct dasd_device * device) { - struct gendisk *disk; - struct disk_part_iter piter; - struct block_device *part; - device->state = DASD_STATE_ONLINE; if (device->block) { dasd_schedule_block_bh(device->block); if ((device->features & DASD_FEATURE_USERAW)) { - disk = device->block->gdp; - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, + KOBJ_CHANGE); return 0; } - disk = device->block->bdev->bd_disk; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); - disk_part_iter_exit(&piter); + disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); } return 0; } @@ -455,9 +447,6 @@ dasd_state_ready_to_online(struct dasd_device * device) static int dasd_state_online_to_ready(struct dasd_device *device) { int rc; - struct gendisk *disk; - struct disk_part_iter piter; - struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -466,13 +455,8 @@ static int dasd_state_online_to_ready(struct dasd_device *device) } device->state = DASD_STATE_READY; - if (device->block && !(device->features & DASD_FEATURE_USERAW)) { - disk = device->block->bdev->bd_disk; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); - disk_part_iter_exit(&piter); - } + if (device->block && !(device->features & DASD_FEATURE_USERAW)) + disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE); return 0; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 299e77ec2c41..da33cb4cba28 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -879,17 +879,13 @@ dcssblk_submit_bio(struct bio *bio) blk_queue_split(&bio); bytes_done = 0; - dev_info = bio->bi_disk->private_data; + dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) goto fail; if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) { - /* Request beyond end of DCSS segment. */ - goto fail; - } /* verify data transfer direction */ if (dev_info->is_shared) { switch (dev_info->segment_type) { diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index c2536f7767b3..d1ed39162943 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -184,7 +184,7 @@ static unsigned long xpram_highest_page_index(void) */ static blk_qc_t xpram_submit_bio(struct bio *bio) { - xpram_device_t *xdev = bio->bi_disk->private_data; + xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; unsigned int index; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f11f51e2465f..c00f06e9ecb0 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->timeout = 10 * HZ; rq->retries = 5; - blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done); + blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done); } /** diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b3f14f05340a..4d2280658559 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, /* * head injection *required* here otherwise quiesce won't work */ - blk_execute_rq(req->q, NULL, req, 1); + blk_execute_rq(NULL, req, 1); /* * Some devices (USB mass-storage in particular) may transfer diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index cf07b7f93579..03adb39293c2 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -665,12 +665,28 @@ static int sd_zbc_init_disk(struct scsi_disk *sdkp) return 0; } -void sd_zbc_release_disk(struct scsi_disk *sdkp) +static void sd_zbc_clear_zone_info(struct scsi_disk *sdkp) { + /* Serialize against revalidate zones */ + mutex_lock(&sdkp->rev_mutex); + kvfree(sdkp->zones_wp_offset); sdkp->zones_wp_offset = NULL; kfree(sdkp->zone_wp_update_buf); sdkp->zone_wp_update_buf = NULL; + + sdkp->nr_zones = 0; + sdkp->rev_nr_zones = 0; + sdkp->zone_blocks = 0; + sdkp->rev_zone_blocks = 0; + + mutex_unlock(&sdkp->rev_mutex); +} + +void sd_zbc_release_disk(struct scsi_disk *sdkp) +{ + if (sd_is_zoned(sdkp)) + sd_zbc_clear_zone_info(sdkp); } static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) @@ -769,6 +785,21 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) */ return 0; + /* READ16/WRITE16 is mandatory for ZBC disks */ + sdkp->device->use_16_for_rw = 1; + sdkp->device->use_10_for_rw = 0; + + if (!blk_queue_is_zoned(q)) { + /* + * This can happen for a host aware disk with partitions. + * The block device zone information was already cleared + * by blk_queue_set_zoned(). Only clear the scsi disk zone + * information and exit early. + */ + sd_zbc_clear_zone_info(sdkp); + return 0; + } + /* Check zoned block device characteristics (unconstrained reads) */ ret = sd_zbc_check_zoned_characteristics(sdkp, buf); if (ret) @@ -789,9 +820,13 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); - /* READ16/WRITE16 is mandatory for ZBC disks */ - sdkp->device->use_16_for_rw = 1; - sdkp->device->use_10_for_rw = 0; + /* + * Per ZBC and ZAC specifications, writes in sequential write required + * zones of host-managed devices must be aligned to the device physical + * block size. + */ + if (blk_queue_zoned_model(q) == BLK_ZONED_HM) + blk_queue_zone_write_granularity(q, sdkp->physical_block_size); sdkp->rev_nr_zones = nr_zones; sdkp->rev_zone_blocks = zone_blocks; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index bfa8d77322d7..4383d93110f8 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -829,8 +829,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk, - srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); return 0; } diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 43f7624508a9..841ad2fc369a 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, rq->retries = retries; req->end_io_data = SRpnt; - blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end); + blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end); return 0; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index b0cb5b95e892..cce455929778 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -241,6 +241,7 @@ struct target_core_file_cmd { unsigned long len; struct se_cmd *cmd; struct kiocb iocb; + struct bio_vec bvecs[]; }; static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -268,29 +269,22 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct target_core_file_cmd *aio_cmd; struct iov_iter iter = {}; struct scatterlist *sg; - struct bio_vec *bvec; ssize_t len = 0; int ret = 0, i; - aio_cmd = kmalloc(sizeof(struct target_core_file_cmd), GFP_KERNEL); + aio_cmd = kmalloc(struct_size(aio_cmd, bvecs, sgl_nents), GFP_KERNEL); if (!aio_cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvec) { - kfree(aio_cmd); - return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - } - for_each_sg(sgl, sg, sgl_nents, i) { - bvec[i].bv_page = sg_page(sg); - bvec[i].bv_len = sg->length; - bvec[i].bv_offset = sg->offset; + aio_cmd->bvecs[i].bv_page = sg_page(sg); + aio_cmd->bvecs[i].bv_len = sg->length; + aio_cmd->bvecs[i].bv_offset = sg->offset; len += sg->length; } - iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); + iov_iter_bvec(&iter, is_write, aio_cmd->bvecs, sgl_nents, len); aio_cmd->cmd = cmd; aio_cmd->len = len; @@ -307,8 +301,6 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, else ret = call_read_iter(file, &aio_cmd->iocb, &iter); - kfree(bvec); - if (ret != -EIOCBQUEUED) cmd_rw_aio_complete(&aio_cmd->iocb, ret, 0); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7994f27e4527..33770e5808ce 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1000,8 +1000,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_OTHER; scsi_req(req)->retries = PS_RETRY; - blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req, - (cmd->sam_task_attr == TCM_HEAD_TAG), + blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), pscsi_req_done); return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 235b5042672e..ec26179c8062 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -126,7 +126,6 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, bd_abort_claiming(bdev, truncate_bdev_range); return 0; } -EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { @@ -424,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; @@ -489,9 +488,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); - if (!nr_pages) + if (!iov_iter_count(iter)) return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); @@ -688,7 +688,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - error = blkdev_issue_flush(bdev, GFP_KERNEL); + error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; @@ -1808,13 +1808,11 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return error; /* - * Invalidate again; if someone wandered in and dirtied a page, - * the caller will be given -EBUSY. The third argument is - * inclusive, so the rounding here is safe. + * Invalidate the page cache again; if someone wandered in and dirtied + * a page, we just discard it - userspace has no way of knowing whether + * the write happened before or after discard completing... */ - return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); + return truncate_bdev_range(bdev, file->f_mode, start, end); } const struct file_operations def_blk_fops = { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 6ff44e53814c..113cb85c1fd4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk); + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_disk); + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5394641541f7..8ec34ecb6d68 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1104,8 +1104,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && !last->bi_status && - last->bi_disk == stripe->dev->bdev->bd_disk && - last->bi_partno == stripe->dev->bdev->bd_partno) { + last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1356,9 +1355,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && - stripe->dev->bdev && - bio->bi_disk == stripe->dev->bdev->bd_disk && - bio->bi_partno == stripe->dev->bdev->bd_partno) { + stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 310fce00fcda..582df11d298a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1728,7 +1728,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_disk); + WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8fab44394f5..bc3b33efddc5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) * Preallocate a bio that's always going to be used for flushing device * barriers and matches the device lifespan */ - dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); if (!dev->flush_bio) { kfree(dev); return ERR_PTR(-ENOMEM); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9a5cf153da89..d0eb0c8d6269 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1275,8 +1275,8 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, return; ordered->physical = physical; - ordered->disk = bio->bi_disk; - ordered->partno = bio->bi_partno; + ordered->disk = bio->bi_bdev->bd_disk; + ordered->partno = bio->bi_bdev->bd_partno; btrfs_put_ordered_extent(ordered); } diff --git a/fs/direct-io.c b/fs/direct-io.c index d53fa92a1ab6..aa1083ecd623 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -426,6 +426,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; @@ -434,7 +436,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_disk = bio->bi_disk; + dio->bio_disk = bio->bi_bdev->bd_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index a92478eabfa4..183ffdf4d43c 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } const struct file_operations exfat_file_operations = { diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0a14a7c87bf8..6e8208acfc62 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { @@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) out: iput(inode); if (!ret) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(sb->s_bdev); return 0; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 113bfb023a4a..027a7d7037a0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + err = blkdev_issue_flush(inode->i_sb->s_bdev); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b215c564bc31..20f2fcb799f5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1583,7 +1583,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS); + blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a6f9875aa34..fb5985102c1d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5709,7 +5709,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1ee966a63df9..b9721c8f116c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -49,27 +49,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) -{ - if (noio) { - /* No failure on bio allocation */ - return __f2fs_bio_alloc(GFP_NOIO, npages); - } - - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -398,22 +377,12 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -/* - * Return true, if pre_bio's bdev is same as its target device. - */ -static bool __same_bdev(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) -{ - struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); - return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; -} - static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); + bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -711,7 +680,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, return false; if (last_blkaddr + 1 != cur_blkaddr) return false; - return __same_bdev(sbi, cur_blkaddr, bio); + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, @@ -999,8 +968,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), - for_write); + bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, + min_t(int, nr_pages, BIO_MAX_PAGES), + &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2860003a09ed..506c801880f3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,7 +43,6 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -3482,7 +3481,6 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 440634dfaa56..993004f06a77 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -563,17 +563,7 @@ do_sync: static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 30d5abef4361..4acfa7d36731 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -46,7 +46,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", diff --git a/fs/fat/file.c b/fs/fat/file.c index f9ee27cf4d7c..5fee74f1ad61 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e3da9e96b835..ca464328b79c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 807119ae5adf..b9e3db3f855f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ out: mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(sb->s_bdev); return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e4c258a2cefc..e2c4991833b8 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -279,11 +279,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, orig_count = iov_iter_count(dio->submit.iter); iov_iter_truncate(dio->submit.iter, length); - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - if (nr_pages <= 0) { - ret = nr_pages; + if (!iov_iter_count(dio->submit.iter)) goto out; - } if (need_zeroout) { /* zero out from the start of the block to the write offset */ @@ -299,6 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); do { size_t n; if (dio->error) { @@ -339,7 +337,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->size += n; copied += n; - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, + BIO_MAX_PAGES); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 472932b9e6bc..63b526d44886 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b121d7d434c6..3cc4ab2ba7f4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -825,7 +825,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_fs_dev); /* Done it all: now write the commit record asynchronously. */ if (jbd2_has_feature_async_commit(journal)) { @@ -932,7 +932,7 @@ start_journal_io: stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_NOFS); + blkdev_issue_flush(journal->j_dev); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index dc0694fcfcd1..69f18fe20923 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); + err2 = blkdev_issue_flush(journal->j_fs_dev); if (!err) err = err2; } diff --git a/fs/libfs.c b/fs/libfs.c index 79721571e014..abf7674fb437 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 3be6836074ae..1a96ce28efb0 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -123,11 +123,6 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); - if (!bio && (current->flags & PF_MEMALLOC)) { - while (!bio && (npg /= 2)) - bio = bio_alloc(GFP_NOIO, npg); - } - if (bio) { bio->bi_iter.bi_sector = disk_sector; bio_set_dev(bio, bdev); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index a07c39c94bbd..1058659a8d31 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -254,7 +254,7 @@ again: req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - blk_execute_rq(rq->q, NULL, rq, 1); + blk_execute_rq(NULL, rq, 1); if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", req->result); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1a8729eded8b..1e75417bfe6e 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -386,10 +386,6 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, struct bio *bio; bio = bio_alloc(GFP_NOIO, nr_vecs); - if (bio == NULL) { - while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOIO, nr_vecs); - } if (likely(bio)) { bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b55cdeb4d169..987c8ab02aee 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); + err = blkdev_issue_flush(nilfs->ns_bdev); if (err != -EIO) err = 0; return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 85979e2214b3..df6d709d2ae3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev); if (!err) err = ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 0b641ae694f1..1db0254bc38b 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/splice.c b/fs/splice.c index b06846f1e6ee..5dbce4dcc1a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -662,12 +662,14 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* build the vector */ left = sd.total_len; - for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { + for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; - if (this_len > left) - this_len = left; + /* zero-length bvecs are not supported, skip them */ + if (!this_len) + continue; + this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { @@ -680,6 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, array[n].bv_len = this_len; array[n].bv_offset = buf->offset; left -= this_len; + n++; } iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); diff --git a/fs/super.c b/fs/super.c index 2c6cdea2ab2d..5a1f384ffc74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -865,7 +865,8 @@ int reconfigure_super(struct fs_context *fc) if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && + bdev_read_only(sb->s_bdev)) return -EACCES; #endif diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 21b1d034aca3..586d42342a79 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -343,7 +343,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); + blkdev_issue_flush(buftarg->bt_bdev); } STATIC void diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 0e7ab0bc00ae..a29653c0196b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev); if (ret) zonefs_io_error(inode, true); @@ -678,7 +678,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); + bio = bio_alloc(GFP_NOFS, nr_pages); if (!bio) return -ENOMEM; @@ -1581,12 +1581,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; /* - * The block size is set to the device physical sector size to ensure - * that write operations on 512e devices (512B logical block and 4KB - * physical block) are always aligned to the device physical blocks, - * as mandated by the ZBC/ZAC specifications. + * The block size is set to the device zone write granularity to ensure + * that write operations are always aligned according to the device + * interface constraints. */ - sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev)); + sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev)); sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); sbi->s_uid = GLOBAL_ROOT_UID; sbi->s_gid = GLOBAL_ROOT_GID; diff --git a/include/linux/bio.h b/include/linux/bio.h index de62911473bb..5b468f2242ff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,6 +10,7 @@ #include <linux/ioprio.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> +#include <linux/uio.h> #define BIO_DEBUG @@ -328,7 +329,6 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; - unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ @@ -406,7 +406,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); +struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, + struct bio_set *bs); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -414,16 +416,11 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); @@ -441,6 +438,18 @@ static inline void bio_wouldblock_error(struct bio *bio) bio_endio(bio); } +/* + * Calculate number of bvec segments that should be allocated to fit data + * pointed by @iter. If @iter is backed by bvec it's going to be reused + * instead of allocating a new one. + */ +static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) +{ + if (iov_iter_is_bvec(iter)) + return 0; + return iov_iter_npages(iter, max_segs); +} + struct request_queue; extern int submit_bio_wait(struct bio *bio); @@ -480,29 +489,26 @@ static inline void zero_fill_bio(struct bio *bio) zero_fill_bio_iter(bio, bio->bi_iter); } -extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - if ((bio)->bi_disk != (bdev)->bd_disk) \ - bio_clear_flag(bio, BIO_THROTTLED);\ - (bio)->bi_disk = (bdev)->bd_disk; \ - (bio)->bi_partno = (bdev)->bd_partno; \ - bio_associate_blkg(bio); \ +#define bio_set_dev(bio, bdev) \ +do { \ + bio_clear_flag(bio, BIO_REMAPPED); \ + if ((bio)->bi_bdev != (bdev)) \ + bio_clear_flag(bio, BIO_THROTTLED); \ + (bio)->bi_bdev = (bdev); \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ - (dst)->bi_disk = (src)->bi_disk; \ - (dst)->bi_partno = (src)->bi_partno; \ + bio_clear_flag(dst, BIO_REMAPPED); \ + (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ - disk_devt((bio)->bi_disk) + disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); @@ -705,6 +711,7 @@ struct bio_set { mempool_t bvec_integrity_pool; #endif + unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details @@ -715,12 +722,6 @@ struct bio_set { struct workqueue_struct *rescue_workqueue; }; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d705b174d346..aabbf6830ffc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,10 +140,6 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; - /** - * @elevator_queued: Number of queued requests on hctx. - */ - atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; @@ -602,8 +598,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; + if (bio->bi_bdev) + rq->rq_disk = bio->bi_bdev->bd_disk; } blk_qc_t blk_mq_submit_bio(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 866f74261b3b..db026b6ec15a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -222,16 +222,15 @@ static inline void bio_issue_init(struct bio_issue *issue, */ struct bio { struct bio *bi_next; /* request queue link */ - struct gendisk *bi_disk; + struct block_device *bi_bdev; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, etc and bvec pool number */ + unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; - u8 bi_partno; atomic_t __bi_remaining; struct bvec_iter bi_iter; @@ -304,36 +303,10 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_REMAPPED, BIO_FLAG_LAST }; -/* See BVEC_POOL_OFFSET below before adding new flags */ - -/* - * We support 6 different bvec pools, the last one is magic in that it - * is backed by a mempool. - */ -#define BVEC_POOL_NR 6 -#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) - -/* - * Top 3 bits of bio flags indicate the pool the bvecs came from. We add - * 1 to the actual index so that 0 indicates that there are no bvecs to be - * freed. - */ -#define BVEC_POOL_BITS (3) -#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) -#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) -# error "BVEC_POOL_BITS is too small" -#endif - -/* - * Flags starting here get preserved by bio_reset() - this includes - * only BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS BVEC_POOL_OFFSET - typedef __u32 __bitwise blk_mq_req_flags_t; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94ee3089e01..9149f4a5adb3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -337,6 +337,7 @@ struct queue_limits { unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int zone_write_granularity; unsigned short max_segments; unsigned short max_integrity_segments; @@ -948,9 +949,8 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); -extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, +extern void blk_execute_rq(struct gendisk *, struct request *, int); +extern void blk_execute_rq_nowait(struct gendisk *, struct request *, int, rq_end_io_fn *); /* Helper to convert REQ_OP_XXX to its string format XXX */ @@ -1161,6 +1161,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); void blk_queue_update_readahead(struct request_queue *q); @@ -1289,7 +1291,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) !list_empty(&plug->cb_list)); } -int blkdev_issue_flush(struct block_device *, gfp_t); +int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { @@ -1317,7 +1319,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } @@ -1474,6 +1476,18 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } +static inline unsigned int +queue_zone_write_granularity(const struct request_queue *q) +{ + return q->limits.zone_write_granularity; +} + +static inline unsigned int +bdev_zone_write_granularity(struct block_device *bdev) +{ + return queue_zone_write_granularity(bdev_get_queue(bdev)); +} + static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) @@ -1954,21 +1968,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, - struct block_device **part, struct bio *bio); -void part_end_io_acct(struct block_device *part, struct bio *bio, - unsigned long start_time); - -/** - * bio_start_io_acct - start I/O accounting for bio based drivers - * @bio: bio to start account for - * - * Returns the start time that should be passed back to bio_end_io_acct(). - */ -static inline unsigned long bio_start_io_acct(struct bio *bio) -{ - return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); -} +unsigned long bio_start_io_acct(struct bio *bio); +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers @@ -1977,7 +1979,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); + return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int bdev_read_only(struct block_device *bdev); @@ -2012,21 +2014,16 @@ void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } -static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, - loff_t lstart, loff_t lend) -{ - return 0; -} static inline int sync_blockdev(struct block_device *bdev) { return 0; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bacc40a0bdf3..1fe8e105b83b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); /* Supports zoned block devices sequential write constraint */ #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 809aaa32d53c..f364619092cc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -32,6 +32,7 @@ extern struct class block_class; #include <linux/string.h> #include <linux/fs.h> #include <linux/workqueue.h> +#include <linux/xarray.h> #define PARTITION_META_INFO_VOLNAMELTH 64 /* @@ -116,13 +117,6 @@ enum { DISK_EVENT_FLAG_UEVENT = 1 << 1, }; -struct disk_part_tbl { - struct rcu_head rcu_head; - int len; - struct block_device __rcu *last_lookup; - struct block_device __rcu *part[]; -}; - struct disk_events; struct badblocks; @@ -148,12 +142,7 @@ struct gendisk { unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ - /* Array of pointers to partitions indexed by partno. - * Protected with matching bdev lock but stat and other - * non-critical accesses use RCU. Always access through - * helpers. - */ - struct disk_part_tbl __rcu *part_tbl; + struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; @@ -163,6 +152,7 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 struct kobject *slave_dir; struct timer_rand_state *random; @@ -212,10 +202,11 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +void disk_uevent(struct gendisk *disk, enum kobject_action action); + /* * Smarter partition iterator without context limits. */ -#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ @@ -223,7 +214,7 @@ static inline dev_t disk_devt(struct gendisk *disk) struct disk_part_iter { struct gendisk *disk; struct block_device *part; - int idx; + unsigned long idx; unsigned int flags; }; @@ -231,7 +222,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -extern bool disk_has_partitions(struct gendisk *disk); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, @@ -249,11 +239,12 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) extern void del_gendisk(struct gendisk *gp); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_disk_ro(struct gendisk *disk, int flag); +void set_disk_ro(struct gendisk *disk, bool read_only); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0->bd_read_only; + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); } extern void disk_block_events(struct gendisk *disk); diff --git a/include/linux/swap.h b/include/linux/swap.h index 596bc2f4d9b0..3f1f7ae0fbe9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -468,7 +468,6 @@ extern int free_swap_and_cache(swp_entry_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); -extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb0fe4c66b84..9e9ee4945043 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) @@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); @@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f0b2ccb1bb01..d8ca336ff9cd 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -72,8 +72,6 @@ __start.bi_bvec_done = skip; \ __start.bi_idx = 0; \ for_each_bvec(__v, i->bvec, __bi, __start) { \ - if (!__v.bv_len) \ - continue; \ (void)(STEP); \ } \ } @@ -1069,6 +1067,21 @@ static void pipe_advance(struct iov_iter *i, size_t size) pipe_truncate(i); } +static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) +{ + struct bvec_iter bi; + + bi.bi_size = i->count; + bi.bi_bvec_done = i->iov_offset; + bi.bi_idx = 0; + bvec_iter_advance(i->bvec, &bi, size); + + i->bvec += bi.bi_idx; + i->nr_segs -= bi.bi_idx; + i->count = bi.bi_size; + i->iov_offset = bi.bi_bvec_done; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(iov_iter_is_pipe(i))) { @@ -1079,6 +1092,10 @@ void iov_iter_advance(struct iov_iter *i, size_t size) i->count -= size; return; } + if (iov_iter_is_bvec(i)) { + iov_iter_bvec_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); diff --git a/mm/page_io.c b/mm/page_io.c index 9bca17ecc4df..92f7941c6d01 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,25 +26,6 @@ #include <linux/uio.h> #include <linux/sched/task.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, - struct page *page, bio_end_io_t end_io) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, 1); - if (bio) { - struct block_device *bdev; - - bio->bi_iter.bi_sector = map_swap_page(page, &bdev); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_end_io = end_io; - - bio_add_page(bio, page, thp_size(page), 0); - } - return bio; -} - void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = get_swap_bio(GFP_NOIO, page, end_write_func); - if (bio == NULL) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } + bio = bio_alloc(GFP_NOIO, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio->bi_end_io = end_write_func; + bio_add_page(bio, page, thp_size(page), 0); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); @@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); - if (bio == NULL) { - unlock_page(page); - ret = -ENOMEM; - goto out; - } - disk = bio->bi_disk; + bio = bio_alloc(GFP_KERNEL, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_opf = REQ_OP_READ; + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); if (synchronous) { bio->bi_opf |= REQ_HIPRI; get_task_struct(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fffc5af29d1..21a98cb8d646 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,7 +47,6 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1850,12 +1849,13 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct block_device *bdev; struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; - return map_swap_entry(swp_entry(type, offset), &bdev); + se = offset_to_swap_extent(si, offset); + return se->start_block + (offset - se->start_page); } /* @@ -2282,36 +2282,6 @@ static void drain_mmlist(void) } /* - * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset for the specified swap entry. - * Note that the type of this function is sector_t, but it returns page offset - * into the bdev, not sector offset. - */ -static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) -{ - struct swap_info_struct *sis; - struct swap_extent *se; - pgoff_t offset; - - sis = swp_swap_info(entry); - *bdev = sis->bdev; - - offset = swp_offset(entry); - se = offset_to_swap_extent(sis, offset); - return se->start_block + (offset - se->start_page); -} - -/* - * Returns the page offset into bdev for the specified page's swap entry. - */ -sector_t map_swap_page(struct page *page, struct block_device **bdev) -{ - swp_entry_t entry; - entry.val = page_private(page); - return map_swap_entry(entry, bdev); -} - -/* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) |