diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/badblocks.c | 2 | ||||
-rw-r--r-- | block/bdev.c | 23 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 111 | ||||
-rw-r--r-- | block/bfq-iosched.c | 95 | ||||
-rw-r--r-- | block/bfq-iosched.h | 11 | ||||
-rw-r--r-- | block/bio.c | 146 | ||||
-rw-r--r-- | block/blk-cgroup-fc-appid.c | 57 | ||||
-rw-r--r-- | block/blk-cgroup.c | 168 | ||||
-rw-r--r-- | block/blk-cgroup.h | 140 | ||||
-rw-r--r-- | block/blk-core.c | 81 | ||||
-rw-r--r-- | block/blk-crypto-fallback.c | 15 | ||||
-rw-r--r-- | block/blk-iocost.c | 76 | ||||
-rw-r--r-- | block/blk-iolatency.c | 8 | ||||
-rw-r--r-- | block/blk-lib.c | 124 | ||||
-rw-r--r-- | block/blk-map.c | 47 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 2 | ||||
-rw-r--r-- | block/blk-mq.c | 111 | ||||
-rw-r--r-- | block/blk-settings.c | 74 | ||||
-rw-r--r-- | block/blk-throttle.c | 5 | ||||
-rw-r--r-- | block/blk.h | 21 | ||||
-rw-r--r-- | block/bounce.c | 1 | ||||
-rw-r--r-- | block/fops.c | 35 | ||||
-rw-r--r-- | block/genhd.c | 4 | ||||
-rw-r--r-- | block/ioctl.c | 48 | ||||
-rw-r--r-- | block/mq-deadline.c | 1 | ||||
-rw-r--r-- | block/partitions/acorn.c | 4 | ||||
-rw-r--r-- | block/partitions/atari.c | 1 | ||||
-rw-r--r-- | block/partitions/core.c | 14 | ||||
-rw-r--r-- | block/partitions/ldm.c | 15 |
30 files changed, 863 insertions, 578 deletions
diff --git a/block/Makefile b/block/Makefile index 3950ecbc5c26..4e01bb71ad6e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o +obj-$(CONFIG_BLK_CGROUP_FC_APPID) += blk-cgroup-fc-appid.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o diff --git a/block/badblocks.c b/block/badblocks.c index d39056630d9c..3afb550c0f7b 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -65,7 +65,6 @@ int badblocks_check(struct badblocks *bb, sector_t s, int sectors, s >>= bb->shift; target += (1<<bb->shift) - 1; target >>= bb->shift; - sectors = target - s; } /* 'target' is now the first block after the bad range */ @@ -345,7 +344,6 @@ int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) s += (1<<bb->shift) - 1; s >>= bb->shift; target >>= bb->shift; - sectors = target - s; } write_seqlock_irq(&bb->lock); diff --git a/block/bdev.c b/block/bdev.c index 13de871fa816..5fe06c1f2def 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -200,6 +200,13 @@ int sync_blockdev(struct block_device *bdev) } EXPORT_SYMBOL(sync_blockdev); +int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) +{ + return filemap_write_and_wait_range(bdev->bd_inode->i_mapping, + lstart, lend); +} +EXPORT_SYMBOL(sync_blockdev_range); + /* * Write out and wait upon all dirty data associated with this * device. 
Filesystem data as well as the underlying block @@ -673,17 +680,17 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) + if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); - bdev->bd_openers++; + atomic_inc(&bdev->bd_openers); return 0; } static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) { - if (!--bdev->bd_openers) + if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk, mode); @@ -694,7 +701,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) struct gendisk *disk = part->bd_disk; int ret; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto done; ret = blkdev_get_whole(bdev_whole(part), mode); @@ -708,7 +715,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); done: - part->bd_openers++; + atomic_inc(&part->bd_openers); return 0; out_blkdev_put: @@ -720,7 +727,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) { struct block_device *whole = bdev_whole(part); - if (--part->bd_openers) + if (!atomic_dec_and_test(&part->bd_openers)) return; blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; @@ -899,7 +906,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. 
*/ - if (bdev->bd_openers == 1) + if (atomic_read(&bdev->bd_openers) == 1) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); @@ -1044,7 +1051,7 @@ void sync_bdevs(bool wait) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (!bdev->bd_openers) { + if (!atomic_read(&bdev->bd_openers)) { ; /* skip */ } else if (wait) { /* diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 420eda2589c0..09574af83566 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -557,6 +557,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -585,28 +586,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg, entity->sched_data = &parent->sched_data; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct bfq_group *bfqg, *parent; + struct bfq_group *parent; struct bfq_entity *entity; - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) - return NULL; - /* * Update chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet @@ -623,8 +607,24 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, bfq_group_set_parent(curr_bfqg, parent); } } +} - return bfqg; +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct bfq_group *bfqg; + + while (blkg) { + bfqg = blkg_to_bfqg(blkg); + if (bfqg->online) { + bio_associate_blkg_from_css(bio, &blkg->blkcg->css); + return bfqg; + } + blkg = blkg->parent; + } + bio_associate_blkg_from_css(bio, + 
&bfqg_to_blkg(bfqd->root_group)->blkcg->css); + return bfqd->root_group; } /** @@ -714,25 +714,15 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. */ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) +static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; struct bfq_entity *entity; - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; - if (async_bfqq) { entity = &async_bfqq->entity; @@ -743,9 +733,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, } if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } else { + struct bfq_queue *bfqq; + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != + &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is + * not valid anymore. 
We cannot easily just + * cancel the merge (by clearing new_bfqq) as + * there may be other processes using this + * queue and holding refs to all queues below + * sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from + * bfqq now so that we cannot merge bio to a + * request from the old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bfq_release_process_ref(bfqd, sync_bfqq); + bic_set_bfqq(bic, NULL, 1); + } + } } return bfqg; @@ -754,20 +774,24 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg = NULL; + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; - rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. + */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. 
We cache this * path, and update it here, for the following @@ -820,8 +844,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); } /** @@ -949,6 +971,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) put_async_queues: bfq_put_async_queues(bfqd, bfqg); + bfqg->online = false; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -1438,7 +1461,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1f62dbdc521f..0d46cb728bbf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -374,7 +374,7 @@ static const unsigned long bfq_activation_stable_merging = 600; */ static const unsigned long bfq_late_stable_merging = 600; -#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) +#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) @@ -456,6 +456,8 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) */ void bfq_schedule_dispatch(struct bfq_data *bfqd) { + lockdep_assert_held(&bfqd->lock); + if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); @@ -2133,9 +2135,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - bfqq->dispatched > 0 || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || - bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) return; /* @@ 
-2208,9 +2208,13 @@ static void bfq_add_request(struct request *rq) bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued + 1); - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) { bfq_check_waker(bfqd, bfqq, now_ns); /* @@ -2400,7 +2404,11 @@ static void bfq_remove_request(struct request_queue *q, if (rq->queuelist.prev != &rq->queuelist) list_del_init(&rq->queuelist); bfqq->queued[sync]--; - bfqd->queued--; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued - 1); elv_rb_del(&bfqq->sort_list, rq); elv_rqhash_del(q, rq); @@ -2463,10 +2471,17 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, spin_lock_irq(&bfqd->lock); - if (bic) + if (bic) { + /* + * Make sure cgroup info is uptodate for current process before + * considering the merge. 
+ */ + bfq_bic_update_cgroup(bic, bio); + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - else + } else { bfqd->bio_bfqq = NULL; + } bfqd->bio_bic = bic; ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); @@ -2496,8 +2511,6 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } -static struct bfq_queue *bfq_init_rq(struct request *rq); - static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -2506,7 +2519,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(&req->rb_node), struct request, rb_node))) { - struct bfq_queue *bfqq = bfq_init_rq(req); + struct bfq_queue *bfqq = RQ_BFQQ(req); struct bfq_data *bfqd; struct request *prev, *next_rq; @@ -2558,8 +2571,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = bfq_init_rq(rq), - *next_bfqq = bfq_init_rq(next); + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *next_bfqq = RQ_BFQQ(next); if (!bfqq) goto remove; @@ -2764,6 +2777,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) if (process_refs == 0 || new_process_refs == 0) return NULL; + /* + * Make sure merged queues belong to the same parent. Parents could + * have changed since the time we decided the two queues are suitable + * for merging. 
+ */ + if (new_bfqq->entity.parent != bfqq->entity.parent) + return NULL; + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", new_bfqq->pid); @@ -2901,9 +2922,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); - bic->stably_merged = true; - if (new_bfqq && new_bfqq->bic) - new_bfqq->bic->stably_merged = true; + if (new_bfqq) { + bic->stably_merged = true; + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = + true; + } return new_bfqq; } else return NULL; @@ -5045,11 +5069,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; /* - * Avoiding lock: a race on bfqd->busy_queues should cause at + * Avoiding lock: a race on bfqd->queued should cause at * most a call to dispatch for nothing */ return !list_empty_careful(&bfqd->dispatch) || - bfq_tot_busy_queues(bfqd) > 0; + READ_ONCE(bfqd->queued); } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) @@ -5310,7 +5334,7 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_cooperator(struct bfq_queue *bfqq) +void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -5716,14 +5740,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq; struct bfq_group *bfqg; - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - + bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); @@ -5769,8 +5786,6 @@ out: if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); - - rcu_read_unlock(); return bfqq; } @@ -6117,6 +6132,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q, unsigned int cmd_flags) {} #endif /* 
CONFIG_BFQ_CGROUP_DEBUG */ +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -6132,18 +6149,15 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); + bfqq = bfq_init_rq(rq); if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); blk_mq_free_requests(&free); return; } - spin_unlock_irq(&bfqd->lock); - trace_block_rq_insert(rq); - spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); if (!bfqq || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); @@ -6360,12 +6374,6 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -{ - bfqq_request_freed(bfqq); - bfq_put_queue(bfqq); -} - /* * The processes associated with bfqq may happen to generate their * cumulative I/O at a lower rate than the rate at which the device @@ -6562,7 +6570,9 @@ static void bfq_finish_requeue_request(struct request *rq) bfq_completed_request(bfqq, bfqd); } - bfq_finish_requeue_request_body(bfqq); + bfqq_request_freed(bfqq); + bfq_put_queue(bfqq); + RQ_BIC(rq)->requests--; spin_unlock_irqrestore(&bfqd->lock, flags); /* @@ -6796,6 +6806,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq_request_allocated(bfqq); bfqq->ref++; + bic->requests++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); @@ -6892,8 +6903,8 @@ bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - spin_unlock_irqrestore(&bfqd->lock, flags); bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(&bfqd->lock, flags); } /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 3b83e3d1c2e5..ca8177d7bf7c 100644 --- 
a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -468,6 +468,7 @@ struct bfq_io_cq { struct bfq_queue *stable_merge_bfqq; bool stably_merged; /* non splittable if true */ + unsigned int requests; /* Number of requests this process has in flight */ }; /** @@ -928,6 +929,8 @@ struct bfq_group { /* reference counter (see comments in bfq_bic_update_cgroup) */ int ref; + /* Is bfq_group still online? */ + bool online; struct bfq_entity entity; struct bfq_sched_data sched_data; @@ -979,6 +982,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); @@ -1006,8 +1010,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); @@ -1100,13 +1103,13 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); break; \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ + &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ } while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio.c b/block/bio.c index 4259125e16ab..a3893d80dccc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -224,24 +224,13 @@ EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; - void *p; - - bio_uninit(bio); + void *p = bio; - if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + WARN_ON_ONCE(!bs); - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, &bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } + bio_uninit(bio); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + mempool_free(p - bs->front_pad, &bs->bio_pool); } /* @@ -419,6 +408,28 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + cache = per_cpu_ptr(bs->cache, get_cpu()); + if (!cache->free_list) { + put_cpu(); + return NULL; + } + bio = cache->free_list; + cache->free_list = bio->bi_next; + cache->nr--; + put_cpu(); + + bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + bio->bi_pool = bs; + return bio; +} + /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) @@ -451,6 +462,9 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * + * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process + * context, not hard/soft IRQ. 
+ * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, @@ -465,6 +479,21 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + if (opf & REQ_ALLOC_CACHE) { + if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { + bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, + gfp_mask, bs); + if (bio) + return bio; + /* + * No cached bio available, bio returned below marked with + * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. + */ + } else { + opf &= ~REQ_ALLOC_CACHE; + } + } + /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be @@ -528,28 +557,28 @@ err_free: EXPORT_SYMBOL(bio_alloc_bioset); /** - * bio_kmalloc - kmalloc a bio for I/O + * bio_kmalloc - kmalloc a bio + * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * - * Use kmalloc to allocate and initialize a bio. + * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized + * using bio_init() before use. To free a bio returned from this function use + * kfree() after calling bio_uninit(). A bio returned from this function can + * be reused by calling bio_uninit() before calling bio_init() again. + * + * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this + * function are not backed by a mempool and can fail. Do not use this function + * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. 
*/ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - if (unlikely(!bio)) + if (nr_vecs > UIO_MAXIOV) return NULL; - bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs, - 0); - bio->bi_pool = NULL; - return bio; + return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); @@ -711,7 +740,7 @@ void bio_put(struct bio *bio) return; } - if (bio_flagged(bio, BIO_PERCPU_CACHE)) { + if (bio->bi_opf & REQ_ALLOC_CACHE) { struct bio_alloc_cache *cache; bio_uninit(bio); @@ -732,14 +761,15 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); - if (bio->bi_bdev == bio_src->bi_bdev && - bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter = bio_src->bi_iter; - bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); + if (bio->bi_bdev) { + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); + bio_clone_blkg_association(bio, bio_src); + } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; @@ -1727,55 +1757,13 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) flags |= BIOSET_NEED_BVECS; if (src->rescue_workqueue) flags |= BIOSET_NEED_RESCUER; + if (src->cache) + flags |= BIOSET_PERCPU_CACHE; return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); } EXPORT_SYMBOL(bioset_init_from_src); -/** - * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb - * @kiocb: kiocb describing the IO - * @bdev: block device to allocate the bio for (can be %NULL) - * @nr_vecs: number of iovecs to pre-allocate - * 
@opf: operation and flags for bio - * @bs: bio_set to allocate from - * - * Description: - * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only - * used to check if we should dip into the per-cpu bio_set allocation - * cache. The allocation uses GFP_KERNEL internally. On return, the - * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio - * MUST be done from process context, not hard/soft IRQ. - * - */ -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, struct bio_set *bs) -{ - struct bio_alloc_cache *cache; - struct bio *bio; - - if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) - return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - - cache = per_cpu_ptr(bs->cache, get_cpu()); - if (cache->free_list) { - bio = cache->free_list; - cache->free_list = bio->bi_next; - cache->nr--; - put_cpu(); - bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, - nr_vecs, opf); - bio->bi_pool = bs; - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; - } - put_cpu(); - bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs); - bio_set_flag(bio, BIO_PERCPU_CACHE); - return bio; -} -EXPORT_SYMBOL_GPL(bio_alloc_kiocb); - static int __init init_bio(void) { int i; diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c new file mode 100644 index 000000000000..760a2e1878dd --- /dev/null +++ b/block/blk-cgroup-fc-appid.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "blk-cgroup.h" + +/** + * blkcg_set_fc_appid - set the fc_app_id field associated to blkcg + * @app_id: application identifier + * @cgrp_id: cgroup id + * @app_id_len: size of application identifier + */ +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + int ret = 0; + + if (app_id_len > FC_APPID_LEN) + return -EINVAL; + + cgrp = 
cgroup_get_from_id(cgrp_id); + if (!cgrp) + return -ENOENT; + css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); + if (!css) { + ret = -ENOENT; + goto out_cgrp_put; + } + blkcg = css_to_blkcg(css); + /* + * There is a slight race condition on setting the appid. + * Worst case an I/O may not find the right id. + * This is no different from the I/O we let pass while obtaining + * the vmid from the fabric. + * Adding the overhead of a lock is not necessary. + */ + strlcpy(blkcg->fc_app_id, app_id, app_id_len); + css_put(css); +out_cgrp_put: + cgroup_put(cgrp); + return ret; +} +EXPORT_SYMBOL_GPL(blkcg_set_fc_appid); + +/** + * blkcg_get_fc_appid - get the fc app identifier associated with a bio + * @bio: target bio + * + * On success return the fc_app_id, on failure return NULL + */ +char *blkcg_get_fc_appid(struct bio *bio) +{ + if (!bio->bi_blkg || bio->bi_blkg->blkcg->fc_app_id[0] == '\0') + return NULL; + return bio->bi_blkg->blkcg->fc_app_id; +} +EXPORT_SYMBOL_GPL(blkcg_get_fc_appid); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8dfe62786cd5..40161a3f68d0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,23 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. 
+ */ +static struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -156,6 +173,33 @@ static void blkg_async_bio_workfn(struct work_struct *work) } /** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio + * + * This returns the CSS for the blkcg associated with a bio, or %NULL if not + * associated. Callers are expected to either handle %NULL or know association + * has been done prior to calling this. + */ +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + if (!bio || !bio->bi_blkg) + return NULL; + return &bio->bi_blkg->blkcg->css; +} +EXPORT_SYMBOL_GPL(bio_blkcg_css); + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + +/** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @q: request_queue the new blkg is associated with @@ -254,7 +298,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *blkg; int i, ret; - WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ @@ -905,7 +948,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; - bool has_stats = false; const char *dname; unsigned seq; int i; @@ -931,14 +973,12 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { - has_stats = true; seq_printf(s, "rbytes=%llu 
wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); @@ -950,12 +990,10 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - if (pol->pd_stat_fn(blkg->pd[i], s)) - has_stats = true; + pol->pd_stat_fn(blkg->pd[i], s); } - if (has_stats) - seq_printf(s, "\n"); + seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) @@ -994,6 +1032,13 @@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) +{ + return &css_to_blkcg(css)->cgwb_list; +} +#endif + /* * blkcg destruction is a three-stage process. * @@ -1016,25 +1061,6 @@ static struct cftype blkcg_legacy_files[] = { */ /** - * blkcg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away. Here the cgwbs are - * offlined first and only once writeback associated with the blkcg has - * finished do we start step 2 (see above). - */ -static void blkcg_css_offline(struct cgroup_subsys_state *css) -{ - struct blkcg *blkcg = css_to_blkcg(css); - - /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(blkcg); - - /* put the base online pin allowing step 2 to be triggered */ - blkcg_unpin_online(blkcg); -} - -/** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * @@ -1045,7 +1071,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) * * This is the blkcg counterpart of ioc_release_fn(). 
*/ -void blkcg_destroy_blkgs(struct blkcg *blkcg) +static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); @@ -1075,6 +1101,57 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) spin_unlock_irq(&blkcg->lock); } +/** + * blkcg_pin_online - pin online state + * @blkcg_css: blkcg of interest + * + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. + */ +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) +{ + refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); +} + +/** + * blkcg_unpin_online - unpin online state + * @blkcg_css: blkcg of interest + * + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + */ +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) +{ + struct blkcg *blkcg = css_to_blkcg(blkcg_css); + + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; + blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); +} + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). 
+ */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(css); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(css); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); @@ -1163,8 +1240,7 @@ unlock: static int blkcg_css_online(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs @@ -1172,7 +1248,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * parent so that offline always happens towards the root. */ if (parent) - blkcg_pin_online(parent); + blkcg_pin_online(&parent->css); return 0; } @@ -1201,14 +1277,13 @@ int blkcg_init_queue(struct request_queue *q) preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ - rcu_read_lock(); + /* spin_lock_irq can serve as RCU read-side critical section. 
*/ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); @@ -1234,7 +1309,6 @@ err_destroy_all: return ret; err_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); @@ -1726,7 +1800,6 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) void blkcg_maybe_throttle_current(void) { struct request_queue *q = current->throttle_queue; - struct cgroup_subsys_state *css; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; @@ -1738,12 +1811,7 @@ void blkcg_maybe_throttle_current(void) current->use_memdelay = false; rcu_read_lock(); - css = kthread_blkcg(); - if (css) - blkcg = css_to_blkcg(css); - else - blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); - + blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, q); @@ -1889,7 +1957,7 @@ void bio_associate_blkg(struct bio *bio) rcu_read_lock(); if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; + css = bio_blkcg_css(bio); else css = blkcg_css(); @@ -1950,6 +2018,22 @@ void blk_cgroup_bio_start(struct bio *bio) put_cpu(); } +bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + for (css = blkcg_css(); css; css = css->parent) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 47e1e38390c9..d4de0a35e066 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,13 +15,101 @@ */ #include <linux/blk-cgroup.h> +#include <linux/cgroup.h> +#include <linux/kthread.h> #include <linux/blk-mq.h> +struct blkcg_gq; +struct 
blkg_policy_data; + + /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* reference count */ + struct percpu_ref refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + spinlock_t async_bio_lock; + struct bio_list async_bios; + union { + struct work_struct async_bio_work; + struct work_struct free_work; + }; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; + + struct rcu_head rcu_head; +}; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + refcount_t online_pin; + + struct radix_tree_root blkg_tree; + struct blkcg_gq __rcu *blkg_hint; + struct hlist_head blkg_list; + + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; + + struct list_head all_blkcgs_node; +#ifdef CONFIG_BLK_CGROUP_FC_APPID + char fc_app_id[FC_APPID_LEN]; +#endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif +}; + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct blkcg, css) : NULL; +} + /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -63,7 +151,7 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, +typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { @@ -123,52 +211,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); /** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} - -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. 
- */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - -/** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any + * blkg. The idea is we do bio_blkcg_css() to look up the actual context for + * the bio and attach the appropriate blkg to the bio. Then we call this helper + * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. 
*/ static inline bool bio_issue_as_root_blkg(struct bio *bio) @@ -457,7 +507,8 @@ struct blkcg_policy_data { struct blkcg_policy { }; -#ifdef CONFIG_BLOCK +struct blkcg { +}; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) @@ -471,8 +522,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } - static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } @@ -488,7 +537,6 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { r #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index bc0506772152..80fa73c419a9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -588,10 +588,9 @@ static inline int bio_check_eod(struct bio *bio) (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%pg: rw=%d, want=%llu, limit=%llu\n", - current->comm, - bio->bi_bdev, bio->bi_opf, - bio_end_sector(bio), maxsector); + "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", + current->comm, bio->bi_bdev, bio->bi_opf, + bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; @@ -816,11 +815,11 @@ void submit_bio_noacct(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: - if 
(!blk_queue_secure_erase(q)) + if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: @@ -889,19 +888,11 @@ void submit_bio(struct bio *bio) if (blkcg_punt_bio_submit(bio)) return; - /* - * If it's a regular read/write or a barrier with data attached, - * go through the normal accounting stuff before submission. - */ - if (bio_has_data(bio)) { - unsigned int count = bio_sectors(bio); - - if (op_is_write(bio_op(bio))) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_iter.bi_size); - count_vm_events(PGPGIN, count); - } + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); + } else if (bio_op(bio) == REQ_OP_WRITE) { + count_vm_events(PGPGOUT, bio_sectors(bio)); } /* @@ -1018,21 +1009,22 @@ again: } } -static unsigned long __part_start_io_acct(struct block_device *part, - unsigned int sectors, unsigned int op, - unsigned long start_time) +unsigned long bdev_start_io_acct(struct block_device *bdev, + unsigned int sectors, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); part_stat_lock(); - update_io_ticks(part, start_time, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, start_time, false); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); + part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } +EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct_time - start I/O accounting for bio based drivers @@ -1041,8 +1033,8 @@ static unsigned long __part_start_io_acct(struct block_device *part, */ void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) { - __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), start_time); + bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + 
bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_start_io_acct_time); @@ -1054,46 +1046,33 @@ EXPORT_SYMBOL_GPL(bio_start_io_acct_time); */ unsigned long bio_start_io_acct(struct bio *bio) { - return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), + bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) -{ - return __part_start_io_acct(disk->part0, sectors, op, jiffies); -} -EXPORT_SYMBOL(disk_start_io_acct); - -static void __part_end_io_acct(struct block_device *part, unsigned int op, - unsigned long start_time) +void bdev_end_io_acct(struct block_device *bdev, unsigned int op, + unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, now, true); + part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } +EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, - struct block_device *orig_bdev) + struct block_device *orig_bdev) { - __part_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) -{ - __part_end_io_acct(disk->part0, op, start_time); -} -EXPORT_SYMBOL(disk_end_io_acct); - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked diff --git a/block/blk-crypto-fallback.c 
b/block/blk-crypto-fallback.c index 7c854584b52b..621abd1b0e4d 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -152,23 +152,25 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) src_bio->bi_status = enc_bio->bi_status; - bio_put(enc_bio); + bio_uninit(enc_bio); + kfree(enc_bio); bio_endio(src_bio); } static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { + unsigned int nr_segs = bio_segments(bio_src); struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); + bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, + bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -177,7 +179,6 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio->bi_io_vec[bio->bi_vcnt++] = bv; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; } @@ -363,8 +364,8 @@ out_release_keyslot: blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) - bio_put(enc_bio); - + bio_uninit(enc_bio); + kfree(enc_bio); return ret; } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9bd670999d0a..33a11ba971ea 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -533,8 +533,7 @@ struct ioc_gq { /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; - struct iocg_stat local_stat; - struct iocg_stat desc_stat; + struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; @@ -1371,7 +1370,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) return true; } else { if (iocg->indelay_since) { - iocg->local_stat.indelay_us += now->now - 
iocg->indelay_since; + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; @@ -1419,7 +1418,7 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { - iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, @@ -1513,7 +1512,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; @@ -1641,11 +1640,30 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, } } +/* propagate the deltas to the parent */ +static void iocg_flush_stat_upward(struct ioc_gq *iocg) +{ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->stat; + + parent_stat->usage_us += + iocg->stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + iocg->stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + iocg->stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + iocg->stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = iocg->stat; +} + /* collect per-cpu counters and propagate the deltas to the parent */ -static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; int cpu; @@ -1661,34 +1679,9 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); - iocg->local_stat.usage_us += 
iocg->usage_delta_us; - - /* propagate upwards */ - new_stat.usage_us = - iocg->local_stat.usage_us + iocg->desc_stat.usage_us; - new_stat.wait_us = - iocg->local_stat.wait_us + iocg->desc_stat.wait_us; - new_stat.indebt_us = - iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; - new_stat.indelay_us = - iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; - - /* propagate the deltas to the parent */ - if (iocg->level > 0) { - struct iocg_stat *parent_stat = - &iocg->ancestors[iocg->level - 1]->desc_stat; + iocg->stat.usage_us += iocg->usage_delta_us; - parent_stat->usage_us += - new_stat.usage_us - iocg->last_stat.usage_us; - parent_stat->wait_us += - new_stat.wait_us - iocg->last_stat.wait_us; - parent_stat->indebt_us += - new_stat.indebt_us - iocg->last_stat.indebt_us; - parent_stat->indelay_us += - new_stat.indelay_us - iocg->last_stat.indelay_us; - } - - iocg->last_stat = new_stat; + iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ @@ -1699,13 +1692,13 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { - iocg_flush_stat_one(iocg, now); + iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } @@ -2152,16 +2145,16 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { - iocg->local_stat.indebt_us += + iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 
now->now; } if (iocg->indelay_since) { - iocg->local_stat.indelay_us += + iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } @@ -3005,13 +2998,13 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) - return false; + return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( @@ -3027,7 +3020,6 @@ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); - return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f33932e72e3..5b676c7cf2b6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -891,7 +891,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) +static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -914,17 +914,16 @@ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, iolat->rq_depth.max_depth); - return true; } -static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return false; + return; if (iolat->ssd) return iolatency_ssd_stat(iolat, s); @@ -937,7 +936,6 @@ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) else 
seq_printf(s, " depth=%u avg_lat=%llu win=%llu", iolat->rq_depth.max_depth, avg_lat, cur_win); - return true; } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, diff --git a/block/blk-lib.c b/block/blk-lib.c index 237d60d8b585..09b7e1200c0f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,30 +10,44 @@ #include "blk.h" +static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) +{ + unsigned int discard_granularity = bdev_discard_granularity(bdev); + sector_t granularity_aligned_sector; + + if (bdev_is_partition(bdev)) + sector += bdev->bd_start_sect; + + granularity_aligned_sector = + round_up(sector, discard_granularity >> SECTOR_SHIFT); + + /* + * Make sure subsequent bios start aligned to the discard granularity if + * it needs to be split. + */ + if (granularity_aligned_sector != sector) + return granularity_aligned_sector - sector; + + /* + * Align the bio size to the discard granularity to make splitting the bio + * at discard granularity boundaries easier in the driver if needed. 
+ */ + return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; +} + int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop) + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; - unsigned int op; - sector_t bs_mask, part_offset = 0; + sector_t bs_mask; if (bdev_read_only(bdev)) return -EPERM; - - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secure_erase(q)) - return -EOPNOTSUPP; - op = REQ_OP_SECURE_ERASE; - } else { - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - op = REQ_OP_DISCARD; - } + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!q->limits.discard_granularity)) { + if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { char dev_name[BDEVNAME_SIZE]; bdevname(bdev, dev_name); @@ -48,38 +62,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; - /* In case the discard request is in a partition */ - if (bdev_is_partition(bdev)) - part_offset = bdev->bd_start_sect; - while (nr_sects) { - sector_t granularity_aligned_lba, req_sects; - sector_t sector_mapped = sector + part_offset; - - granularity_aligned_lba = round_up(sector_mapped, - q->limits.discard_granularity >> SECTOR_SHIFT); - - /* - * Check whether the discard bio starts at a discard_granularity - * aligned LBA, - * - If no: set (granularity_aligned_lba - sector_mapped) to - * bi_size of the first split bio, then the second bio will - * start at a discard_granularity aligned LBA on the device. - * - If yes: use bio_aligned_discard_max_sectors() as the max - * possible bi_size of the first split bio. Then when this bio - * is split in device drive, the split ones are very probably - * to be aligned to discard_granularity of the device's queue. 
- */ - if (granularity_aligned_lba == sector_mapped) - req_sects = min_t(sector_t, nr_sects, - bio_aligned_discard_max_sectors(q)); - else - req_sects = min_t(sector_t, nr_sects, - granularity_aligned_lba - sector_mapped); - - WARN_ON_ONCE((req_sects << 9) > UINT_MAX); + sector_t req_sects = + min(nr_sects, bio_discard_limit(bdev, sector)); - bio = blk_next_bio(bio, bdev, 0, op, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; @@ -105,21 +92,19 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) @@ -316,3 +301,42 @@ retry: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); + +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp) +{ + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev); + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (max_sectors == 0) + return -EOPNOTSUPP; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; + + blk_start_plug(&plug); + for (;;) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + + bio = blk_next_bio(bio, bdev, 0, 
REQ_OP_SECURE_ERASE, gfp); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = len; + + sector += len << SECTOR_SHIFT; + nr_sects -= len << SECTOR_SHIFT; + if (!nr_sects) { + ret = submit_bio_wait(bio); + bio_put(bio); + break; + } + cond_resched(); + } + blk_finish_plug(&plug); + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_secure_erase); diff --git a/block/blk-map.c b/block/blk-map.c index c7f71d83eff1..df8b066cd548 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -152,10 +152,10 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1 << map_data->page_order; @@ -224,7 +224,8 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, cleanup: if (!map_data) bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out_bmd: kfree(bmd); return ret; @@ -234,6 +235,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; int j; @@ -241,10 +243,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return -ENOMEM; - bio->bi_opf |= req_op(rq); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); while (iov_iter_count(iter)) { struct page **pages; @@ -260,10 +262,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & 
queue_dma_alignment(rq->q))) { - ret = -EINVAL; + if (unlikely(offs & queue_dma_alignment(rq->q))) j = 0; - } else { + else { for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; @@ -303,7 +304,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, out_unmap: bio_release_pages(bio, false); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ret; } @@ -323,7 +325,8 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio) static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } /** @@ -348,9 +351,10 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); @@ -374,7 +378,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-EINVAL); } @@ -390,7 +395,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) @@ -435,9 +441,10 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, return ERR_PTR(-EINVAL); nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; @@ -471,7 +478,8 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, cleanup: bio_free_pages(bio); - bio_put(bio); 
+ bio_uninit(bio); + kfree(bio); return ERR_PTR(-ENOMEM); } @@ -602,7 +610,8 @@ int blk_rq_unmap_user(struct bio *bio) next_bio = bio; bio = bio->bi_next; - bio_put(next_bio); + bio_uninit(next_bio); + kfree(next_bio); } return ret; @@ -648,8 +657,10 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf |= req_op(rq); ret = blk_rq_append_bio(rq, bio); - if (unlikely(ret)) - bio_put(bio); + if (unlikely(ret)) { + bio_uninit(bio); + kfree(bio); + } return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index aa0349e9f083..7e4136a60e1c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -113,10 +113,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), - QUEUE_FLAG_NAME(DISCARD), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SECERASE), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), diff --git a/block/blk-mq.c b/block/blk-mq.c index 84d749511f55..ae116b755648 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1083,7 +1083,7 @@ bool blk_mq_complete_request_remote(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* - * For a polled request, always complete locallly, it's pointless + * For a polled request, always complete locally, it's pointless * to redirect the completion. */ if (rq->cmd_flags & REQ_POLLED) @@ -1169,6 +1169,62 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) complete(waiting); } +/* + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. 
+ */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + +static void __blk_execute_rq_nowait(struct request *rq, bool at_head, + rq_end_io_fn *done, bool use_plug) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io = done; + + blk_account_io_start(rq); + + if (use_plug && current->plug) { + blk_add_rq_to_plug(current->plug, rq); + return; + } + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} + + /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert @@ -1184,18 +1240,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) */ void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) { - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); + __blk_execute_rq_nowait(rq, at_head, done, true); - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 
@@ -1233,8 +1279,13 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; + /* + * iopoll requires request to be submitted to driver, so can't + * use plug + */ rq->end_io_data = &wait; - blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); + __blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq, + !blk_rq_is_poll(rq)); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; @@ -2676,40 +2727,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -/* - * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple - * queues. This is important for md arrays to benefit from merging - * requests. - */ -static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) -{ - if (plug->multiple_queues) - return BLK_MAX_REQUEST_COUNT * 2; - return BLK_MAX_REQUEST_COUNT; -} - -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - struct request *last = rq_list_peek(&plug->mq_list); - - if (!plug->rq_count) { - trace_block_plug(rq->q); - } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || - (!blk_queue_nomerges(rq->q) && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_mq_flush_plug_list(plug, false); - trace_block_plug(rq->q); - } - - if (!plug->multiple_queues && last && last->q != rq->q) - plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) - plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); - plug->rq_count++; -} - static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { diff --git a/block/blk-settings.c b/block/blk-settings.c index b83df3d2eebc..6ccceb421ed2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -46,6 +46,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_zone_append_sectors = 0; 
lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; + lim->max_secure_erase_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -177,6 +178,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** + * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase + * @q: the request queue for the device + * @max_sectors: maximum number of sectors to secure_erase + **/ +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors) +{ + q->limits.max_secure_erase_sectors = max_sectors; +} +EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); + +/** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes * @q: the request queue for the device @@ -468,6 +481,40 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); +static int queue_limit_alignment_offset(struct queue_limits *lim, + sector_t sector) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) + << SECTOR_SHIFT; + + return (granularity + lim->alignment_offset - alignment) % granularity; +} + +static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, + sector_t sector) +{ + unsigned int alignment, granularity, offset; + + if (!lim->max_discard_sectors) + return 0; + + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> SECTOR_SHIFT; + granularity = lim->discard_granularity >> SECTOR_SHIFT; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? 
*/ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << SECTOR_SHIFT; +} + static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); @@ -627,7 +674,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } - + t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, + b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); @@ -901,3 +949,27 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); + +int bdev_alignment_offset(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q->limits.misaligned) + return -1; + if (bdev_is_partition(bdev)) + return queue_limit_alignment_offset(&q->limits, + bdev->bd_start_sect); + return q->limits.alignment_offset; +} +EXPORT_SYMBOL_GPL(bdev_alignment_offset); + +unsigned int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev_is_partition(bdev)) + return queue_limit_discard_alignment(&q->limits, + bdev->bd_start_sect); + return q->limits.discard_alignment; +} +EXPORT_SYMBOL_GPL(bdev_discard_alignment); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 469c483719be..139b2d7a99e2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -227,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ - tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ @@ -2189,13 +2189,14 @@ 
again: } out_unlock: - spin_unlock_irq(&q->queue_lock); bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); return throttled; } diff --git a/block/blk.h b/block/blk.h index 8ccbc6e07636..434017701403 100644 --- a/block/blk.h +++ b/block/blk.h @@ -347,20 +347,6 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) } /* - * The max bio size which is aligned to q->limits.discard_granularity. This - * is a hint to split large discard bio in generic block layer, then if device - * driver needs to split the discard bio into smaller ones, their bi_size can - * be very probably and easily aligned to discard_granularity of the device's - * queue. - */ -static inline unsigned int bio_aligned_discard_max_sectors( - struct request_queue *q) -{ - return round_down(UINT_MAX, q->limits.discard_granularity) >> - SECTOR_SHIFT; -} - -/* * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); @@ -450,13 +436,6 @@ extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; -static inline void bio_clear_polled(struct bio *bio) -{ - /* can't support alloc cache if we turn off polling */ - bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_POLLED; -} - long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/bounce.c b/block/bounce.c index 467be46d0e65..8f7b6fe3b4db 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -191,7 +191,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) goto err_put; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; diff --git a/block/fops.c b/block/fops.c index 
06feb41d798b..d6b3276a6c68 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,14 +44,6 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) #define DIO_INLINE_BIO_VECS 4 -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { @@ -83,8 +75,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -97,18 +87,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); + submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) @@ -197,8 +177,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); - + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* @@ -320,7 +302,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool); + 
if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; @@ -670,7 +655,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, 0); + len >> SECTOR_SHIFT, GFP_KERNEL); break; default: error = -EOPNOTSUPP; diff --git a/block/genhd.c b/block/genhd.c index b8b6759d670f..36532b931841 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1019,7 +1019,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, diff --git a/block/ioctl.c b/block/ioctl.c index f8703db99c73..46949f1b0dba 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -83,18 +83,17 @@ static int compat_blkpg_ioctl(struct block_device *bdev, #endif static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, - unsigned long arg, unsigned long flags) + unsigned long arg) { uint64_t range[2]; uint64_t start, len; - struct request_queue *q = bdev_get_queue(bdev); struct inode *inode = bdev->bd_inode; int err; if (!(mode & FMODE_WRITE)) return -EBADF; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, (void __user *)arg, 
sizeof(range))) @@ -115,15 +114,43 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, flags); - + err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); fail: filemap_invalidate_unlock(inode->i_mapping); return err; } +static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, + void __user *argp) +{ + uint64_t start, len; + uint64_t range[2]; + int err; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + if (!bdev_max_secure_erase_sectors(bdev)) + return -EOPNOTSUPP; + if (copy_from_user(range, argp, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if ((start & 511) || (len & 511)) + return -EINVAL; + if (start + len > bdev_nr_bytes(bdev)) + return -EINVAL; + + filemap_invalidate_lock(bdev->bd_inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (!err) + err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, + GFP_KERNEL); + filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + return err; +} + + static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { @@ -451,10 +478,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKROSET: return blkdev_roset(bdev, mode, cmd, arg); case BLKDISCARD: - return blk_ioctl_discard(bdev, mode, arg, 0); + return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: - return blk_ioctl_discard(bdev, mode, arg, - BLKDEV_DISCARD_SECURE); + return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: @@ -489,7 +515,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, 
!blk_queue_nonrot(bdev_get_queue(bdev))); + return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3ed5eaf3446a..6ed602b2f80a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -742,6 +742,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, if (at_head) { list_add(&rq->queuelist, &per_prio->dispatch); + rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index 2c381c694c57..d2fc122d7426 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -282,13 +282,13 @@ int adfspart_check_ADFS(struct parsed_partitions *state) #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, + riscix_partition(state, start_sect, slot, nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, + linux_partition(state, start_sect, slot, nr_sects); break; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index da5994175416..9655c728262a 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -140,7 +140,6 @@ int atari_partition(struct parsed_partitions *state) /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; - part_fmt = 2; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); diff --git a/block/partitions/core.c b/block/partitions/core.c index 2ef8dfa1e5c8..8a0ec929023b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -200,21 +200,13 @@ static ssize_t part_ro_show(struct device *dev, static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - 
queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = dev_to_bdev(dev); - - return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits, - bdev->bd_start_sect)); + return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -486,7 +478,7 @@ int bdev_del_partition(struct gendisk *disk, int partno) goto out_unlock; ret = -EBUSY; - if (part->bd_openers) + if (atomic_read(&part->bd_openers)) goto out_unlock; delete_partition(part); diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 27f6c7d9c776..38e58960ae03 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -736,7 +736,6 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) len = r_cols; } else { r_stripe = 0; - r_cols = 0; len = r_parent; } if (len < 0) @@ -783,11 +782,8 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_diskid; - } if (len < 0) return false; @@ -826,11 +822,8 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_name; - } if (len < 0) return false; @@ -963,10 +956,8 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len = r_index; - } else { - r_index = 0; + } else len = r_diskid; - } if (len < 0) { ldm_error("len %d < 0", len); return false; |