Diffstat (limited to 'block/blk-core.c')
-rw-r--r-- | block/blk-core.c | 657
1 file changed, 467 insertions, 190 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index a0e3096c4bb5..ab51685988c2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,17 +32,18 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); @@ -50,7 +51,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation @@ -62,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -82,18 +108,14 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * Will return NULL if the request queue cannot be located. + * backing_dev_info. This function can only be called if @bdev is opened + * and the return value is never NULL. 
*/ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct backing_dev_info *ret = NULL; struct request_queue *q = bdev_get_queue(bdev); - if (q) - ret = &q->backing_dev_info; - return ret; + return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); @@ -121,18 +143,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = error; if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -146,8 +166,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg) printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); @@ -187,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) EXPORT_SYMBOL(blk_delay_queue); /** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + +/** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * @@ -239,7 +275,7 @@ EXPORT_SYMBOL(blk_stop_queue); * this function. * * This function does not cancel any asynchronous activity arising - * out of elevator or throttling code. That would require elevaotor_exit() + * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ @@ -251,8 +287,10 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->delayed_work); + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->delay_work); + } } else { cancel_delayed_work_sync(&q->delay_work); } @@ -286,6 +324,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue @@ -391,11 +430,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) * be drained. Check all the queues and counters. 
*/ if (drain_all) { + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; - drain |= !list_empty(&q->flush_queue[i]); + if (fq) + drain |= !list_empty(&fq->flush_queue[i]); } } @@ -435,14 +476,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) */ void blk_queue_bypass_start(struct request_queue *q) { - bool drain; - spin_lock_irq(q->queue_lock); - drain = !q->bypass_depth++; + q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); spin_unlock_irq(q->queue_lock); - if (drain) { + /* + * Queues start drained. Skip actual draining till init is + * complete. This avoids lenghty delays during queue init which + * can happen many times during boot. + */ + if (blk_queue_init_done(q)) { spin_lock_irq(q->queue_lock); __blk_drain_queue(q, false); spin_unlock_irq(q->queue_lock); @@ -469,6 +513,25 @@ void blk_queue_bypass_end(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_bypass_end); +void blk_set_queue_dying(struct request_queue *q) +{ + queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + + if (q->mq_ops) + blk_mq_wake_waiters(q); + else { + struct request_list *rl; + + blk_queue_for_each_rl(rl, q) { + if (rl->rq_pool) { + wake_up(&rl->wait[BLK_RW_SYNC]); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + } +} +EXPORT_SYMBOL_GPL(blk_set_queue_dying); + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -482,7 +545,7 @@ void blk_cleanup_queue(struct request_queue *q) /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); - queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + blk_set_queue_dying(q); spin_lock_irq(lock); /* @@ -507,30 +570,48 @@ void blk_cleanup_queue(struct request_queue *q) * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. 
*/ - if (q->mq_ops) { - blk_mq_drain_queue(q); - spin_lock_irq(lock); - } else { - spin_lock_irq(lock); + blk_freeze_queue(q); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* for synchronous bio-based driver finish in-flight integrity i/o */ + blk_flush_integrity(); + /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + percpu_ref_exit(&q->q_usage_counter); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); + bdi_unregister(&q->backing_dev_info); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); +/* Allocate memory local to the request queue */ +static void *alloc_request_struct(gfp_t gfp_mask, void *data) +{ + int nid = (int)(long)data; + return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); +} + +static void free_request_struct(void *element, void *unused) +{ + kmem_cache_free(request_cachep, element); +} + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { @@ -543,9 +624,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, - gfp_mask, q->node); + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct, + free_request_struct, + (void *)(long)q->node, gfp_mask, + q->node); if (!rl->rq_pool) return -ENOMEM; @@ -564,6 +646,47 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +int blk_queue_enter(struct request_queue *q, bool nowait) +{ + while (true) { + int ret; + + if (percpu_ref_tryget_live(&q->q_usage_counter)) + return 0; + + if (nowait) + return -EBUSY; + + ret = wait_event_interruptible(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + if (ret) + return ret; + } +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + wake_up_all(&q->mq_freeze_wq); +} + +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -574,23 +697,23 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; + + q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!q->bio_split) + goto fail_id; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) - goto fail_id; + goto fail_split; 
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -601,9 +724,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif - INIT_LIST_HEAD(&q->flush_queue[0]); - INIT_LIST_HEAD(&q->flush_queue[1]); - INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -628,17 +748,28 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_waitqueue_head(&q->mq_freeze_wq); - if (blkcg_init_queue(q)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; + if (blkcg_init_queue(q)) + goto fail_ref; + return q; +fail_ref: + percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); +fail_split: + bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -701,6 +832,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); + struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) @@ -708,13 +841,14 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); + if (!q->fq) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; @@ -745,7 +879,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - kfree(q->flush_rq); + blk_free_flush_queue(q->fq); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -810,13 +944,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. 
- */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -846,6 +975,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + int on_thresh, off_thresh; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); + + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. @@ -891,9 +1061,9 @@ static struct io_context *rq_ioc(struct bio *bio) * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -907,7 +1077,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, int may_queue; if (unlikely(blk_queue_dying(q))) - return NULL; + return ERR_PTR(-ENODEV); may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -932,16 +1102,11 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * process is not a "batcher", and not * exempted by the IO scheduler */ - return NULL; + return ERR_PTR(-ENOMEM); } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -950,7 +1115,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return NULL; + return ERR_PTR(-ENOMEM); q->nr_rqs[is_sync]++; rl->count[is_sync]++; @@ -1023,8 +1188,8 @@ fail_elvpriv: * shouldn't stall IO. Treat this request as !elvpriv. This will * disturb iosched and blkcg but weird is bettern than dead. 
*/ - printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", - dev_name(q->backing_dev_info.dev)); + printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", + __func__, dev_name(q->backing_dev_info.dev)); rq->cmd_flags &= ~REQ_ELVPRIV; rq->elv.icq = NULL; @@ -1055,7 +1220,7 @@ fail_alloc: rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; - return NULL; + return ERR_PTR(-ENOMEM); } /** @@ -1065,12 +1230,12 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -1083,12 +1248,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, rw_flags, bio, gfp_mask); - if (rq) + if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); - return NULL; + return rq; } /* wait on @rl and retry */ @@ -1125,7 +1290,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, spin_lock_irq(q->queue_lock); rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) + if (IS_ERR(rq)) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ @@ -1135,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1164,11 +1331,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. 
Partial allocation should not be an error, or you risk a live-lock. */ @@ -1177,8 +1344,10 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); - if (unlikely(!rq)) - return ERR_PTR(-ENOMEM); + if (IS_ERR(rq)) + return rq; + + blk_rq_set_block_pc(rq); for_each_bio(bio) { struct bio *bounce_bio = bio; @@ -1197,6 +1366,21 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, EXPORT_SYMBOL(blk_make_request); /** + * blk_rq_set_block_pc - initialize a request to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + +/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -1212,7 +1396,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (blk_rq_tagged(rq)) + if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1231,12 +1415,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1268,7 +1455,7 @@ void part_round_stats(int cpu, struct hd_struct *part) } EXPORT_SYMBOL_GPL(part_round_stats); -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) @@ -1360,7 +1547,6 @@ void blk_add_request_payload(struct request *rq, struct page *page, rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; - rq->buffer = bio_data(bio); } EXPORT_SYMBOL_GPL(blk_add_request_payload); @@ -1402,12 +1588,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bio->bi_next = req->bio; req->bio = bio; - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); @@ -1421,6 +1601,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, @@ -1432,18 +1615,18 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. 
In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1457,8 +1640,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1478,6 +1669,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1492,7 +1707,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1507,9 +1722,12 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + blk_queue_split(q, &bio, q->bio_split); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio->bi_error = -EIO; + bio_endio(bio); + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1522,8 +1740,11 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); @@ -1559,8 +1780,9 @@ get_rq: * Returns with the queue unlocked. 
*/ req = get_request(q, rw_flags, bio, GFP_NOIO); - if (unlikely(!req)) { - bio_endio(bio, -ENODEV); /* @q is dead */ + if (IS_ERR(req)) { + bio->bi_error = PTR_ERR(req); + bio_endio(bio); goto out_unlock; } @@ -1598,8 +1820,9 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } -EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location @@ -1630,8 +1853,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1654,7 +1875,7 @@ static int __init fail_make_request_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); @@ -1722,15 +1943,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1779,14 +1991,15 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; end_io: - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); return false; } @@ -1814,12 +2027,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -1833,7 +2047,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -1856,11 +2070,23 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, false) == 0)) { + ret = q->make_request_fn(q, bio); + + blk_queue_exit(q); + + bio = bio_list_pop(current->bio_list); + } else { + struct bio *bio_next = bio_list_pop(current->bio_list); - bio = bio_list_pop(current->bio_list); + bio_io_error(bio); + bio = bio_next; + } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -1874,7 +2100,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. 
* */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -1908,12 +2134,13 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -1924,20 +2151,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -1957,7 +2177,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -1969,13 +2188,20 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; + if (q->mq_ops) { + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); + blk_mq_insert_request(rq, false, true, true); + return 0; + } + spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); @@ -2084,7 +2310,7 @@ void blk_account_io_done(struct request *req) } } -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM /* * Don't process normal requests when queue is suspended * or in the process of suspending/resuming @@ -2350,11 +2576,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes; + trace_block_rq_complete(req->q, req, nr_bytes); + if (!req->bio) return false; - trace_block_rq_complete(req->q, req, nr_bytes); - /* * For fs requests, rq is just carrier of independent bio's * and each partial completion should be handled separately. @@ -2394,8 +2620,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) error_type = "I/O"; break; } - printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", - error_type, req->rq_disk ? + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", + __func__, error_type, req->rq_disk ? 
req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); @@ -2434,7 +2660,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } req->__data_len -= total_bytes; - req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (req->cmd_type == REQ_TYPE_FS) @@ -2503,9 +2728,9 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error) { - if (blk_rq_tagged(req)) + if (req->cmd_flags & REQ_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); @@ -2529,6 +2754,7 @@ static void blk_finish_request(struct request *req, int error) __blk_put_request(req->q, req); } } +EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request @@ -2752,10 +2978,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; - if (bio_has_data(bio)) { + if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->buffer = bio_data(bio); - } + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; @@ -2831,12 +3056,12 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2857,7 +3082,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. 
@@ -2874,10 +3099,8 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, if (!bs) bs = fs_bio_set; - blk_rq_init(NULL, rq); - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_bioset(bio_src, gfp_mask, bs); + bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; @@ -2904,20 +3127,25 @@ free_and_out: } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_delayed_work(struct request_queue *q, - struct delayed_work *dwork, unsigned long delay) +int kblockd_schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work(kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work); -#define PLUG_MAGIC 0x91827364 +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct @@ -2937,22 +3165,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; - plug->magic = PLUG_MAGIC; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3034,8 +3260,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) LIST_HEAD(list); unsigned int depth; - BUG_ON(plug->magic != PLUG_MAGIC); - flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) @@ -3101,14 +3325,56 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); -#ifdef CONFIG_PM_RUNTIME +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +#ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine * @q: the queue of 
the device @@ -3164,6 +3430,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3191,6 +3460,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3215,6 +3487,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3237,6 +3512,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; @@ -3254,19 +3532,18 @@ EXPORT_SYMBOL(blk_post_runtime_resume); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * - sizeof(((struct request *)0)->cmd_flags)); + FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", - WQ_MEM_RECLAIM | WQ_HIGHPRI | - WQ_POWER_EFFICIENT, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0;
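
Aside for readers of this change (not part of the commit): a minimal sketch of how the blk_queue_enter()/blk_queue_exit() pair added above is meant to bracket bio submission against the new q_usage_counter. The reworked generic_make_request() in this diff does effectively this for every bio; the helper name below is hypothetical and exists only to illustrate the pairing.

    #include <linux/blkdev.h>

    /* Hypothetical helper, for illustration only. */
    static blk_qc_t submit_bio_guarded(struct request_queue *q, struct bio *bio)
    {
            blk_qc_t cookie;

            /*
             * Take a submission reference on q->q_usage_counter.  With
             * nowait == false this sleeps while the queue is frozen and
             * returns -ENODEV once the queue is dying; with nowait == true
             * it fails fast with -EBUSY instead.
             */
            if (blk_queue_enter(q, false)) {
                    bio_io_error(bio);
                    return BLK_QC_T_NONE;
            }

            /* The queue cannot be torn down until the matching exit. */
            cookie = q->make_request_fn(q, bio);

            blk_queue_exit(q);

            /* The cookie can later be passed to blk_poll() for polled completion. */
            return cookie;
    }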