From d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Aug 2018 10:36:56 -0600 Subject: block: add REQ_HIPRI and inherit it from IOCB_HIPRI We use IOCB_HIPRI to poll for IO in the caller instead of scheduling. This information is not available for (or after) IO submission. The driver may make different queue choices based on the type of IO, so make the fact that we will poll for this IO known to the lower layers as well. Reviewed-by: Hannes Reinecke Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- fs/iomap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/iomap.c') diff --git a/fs/iomap.c b/fs/iomap.c index 64ce240217a1..f61d13dfdf09 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1553,6 +1553,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); + int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); @@ -1561,9 +1562,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->iocb->ki_flags & IOCB_HIPRI) + flags |= REQ_HIPRI; + get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); atomic_inc(&dio->ref); return submit_bio(bio); @@ -1662,6 +1666,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + iov_iter_advance(dio->submit.iter, n); dio->size += n; -- cgit v1.2.3 From 0619317ff8baa2da9238191ad5167ed3618c16d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Nov 2018 21:16:54 -0700 Subject: block: add polled wakeup task helper If we're polling for IO on a device that doesn't use interrupts, then IO completion loop (and wake of task) is done by submitting task itself. If that is the case, then we don't need to enter the wake_up_process() function, we can simply mark ourselves as TASK_RUNNING. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- fs/iomap.c | 2 +- include/linux/blkdev.h | 13 +++++++++++++ mm/page_io.c | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs/iomap.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index c039abfb2052..9fe56672cfe5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } static ssize_t @@ -305,7 +305,7 @@ static void blkdev_bio_end_io(struct bio *bio) struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } } diff --git a/fs/iomap.c b/fs/iomap.c index f61d13dfdf09..b0462b363bad 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1525,7 +1525,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (dio->wait_for_completion) { struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41aaa05e42c1..91c44f7a7f62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1772,4 +1772,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, #endif /* CONFIG_BLOCK */ +static inline void blk_wake_io_task(struct task_struct *waiter) +{ + /* + * If we're polling, the task itself is doing the completions. For + * that case, we don't need to signal a wakeup, it's enough to just + * mark us as RUNNING. + */ + if (waiter == current) + __set_current_state(TASK_RUNNING); + else + wake_up_process(waiter); +} + #endif diff --git a/mm/page_io.c b/mm/page_io.c index d4d1c89bcddd..57572ff46016 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -140,7 +140,7 @@ out: unlock_page(page); WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); - wake_up_process(waiter); + blk_wake_io_task(waiter); put_task_struct(waiter); } -- cgit v1.2.3 From 849a370016a5489c49253338507ee6cc4a08df4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Nov 2018 08:37:34 -0700 Subject: block: avoid ordered task state change for polled IO For the core poll helper, the task state setting don't need to imply any atomics, as it's the current task itself that is being modified and we're not going to sleep. For IRQ driven, the wakeup path have the necessary barriers to not need us using the heavy handed version of the task state setting. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- fs/block_dev.c | 7 +++++-- fs/iomap.c | 3 ++- mm/page_io.c | 3 ++- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs/iomap.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 32b246ed44c0..7fc4abb4cc36 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3331,12 +3331,12 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) ret = q->mq_ops->poll(hctx, rq->tag); if (ret > 0) { hctx->poll_success++; - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); return true; } if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (current->state == TASK_RUNNING) return true; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4d79bc80fb41..64ba27b8b754 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,9 +237,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, qc = submit_bio(&bio); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) break; + if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); @@ -415,7 +417,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) break; diff --git a/fs/iomap.c b/fs/iomap.c index b0462b363bad..c5df035ace6f 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1888,7 +1888,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) break; diff --git a/mm/page_io.c b/mm/page_io.c index 57572ff46016..a7271fa481f6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -405,7 +405,8 @@ int swap_readpage(struct page *page, bool synchronous) bio_get(bio); qc = submit_bio(bio); while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) break; -- cgit v1.2.3 From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Nov 2018 08:24:43 -0700 Subject: block: make blk_poll() take a parameter on whether to spin or not blk_poll() has always kept spinning until it found an IO. This is fine for SYNC polling, since we need to find one request we have pending, but in preparation for ASYNC polling it can be beneficial to just check if we have any entries available or not. Existing callers are converted to pass in 'spin == true', to retain the old behavior. Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++++--- block/blk-mq.c | 6 +++--- drivers/nvme/host/multipath.c | 4 ++-- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 ++-- mm/page_io.c | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs/iomap.c') diff --git a/block/blk-core.c b/block/blk-core.c index 03c4202b69bf..9af56dbb84f1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1277,19 +1277,22 @@ EXPORT_SYMBOL(submit_bio); * blk_poll - poll for IO completions * @q: the queue * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions * * Description: * Poll for completions on the passed in queue. Returns number of - * completed entries found. + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). */ -int blk_poll(struct request_queue *q, blk_qc_t cookie) +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { if (!q->poll_fn || !blk_qc_t_valid(cookie)) return 0; if (current->plug) blk_flush_plug_list(current->plug, false); - return q->poll_fn(q, cookie); + return q->poll_fn(q, cookie, spin); } EXPORT_SYMBOL_GPL(blk_poll); diff --git a/block/blk-mq.c b/block/blk-mq.c index b66cca3ce1e5..c2751f0a3ccc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -38,7 +38,7 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -3352,7 +3352,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, return blk_mq_poll_hybrid_sleep(q, hctx, rq); } -static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { struct blk_mq_hw_ctx *hctx; long state; @@ -3392,7 +3392,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie) if (current->state == TASK_RUNNING) return 1; - if (ret < 0) + if (ret < 0 || !spin) break; cpu_relax(); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f9eeb3b58632..ffebdd0ae34b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -220,7 +220,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin) { struct nvme_ns_head *head = q->queuedata; struct nvme_ns *ns; @@ -230,7 +230,7 @@ static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc); + found = ns->queue->poll_fn(q, qc, spin); srcu_read_unlock(&head->srcu, srcu_idx); return found; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 64ba27b8b754..d233a59ea364 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -243,7 +243,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -423,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index ea07d5a34317..a5a4e5a1423e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index c5df035ace6f..74c1f37f0fd6 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1896,7 +1896,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, - dio->submit.cookie)) + dio->submit.cookie, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3015e9b5ae3..e3c0a8ec16a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); -typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t); +typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin); struct bio_vec; typedef int (dma_drain_needed_fn)(struct request *); @@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie); +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index a7271fa481f6..5bdfd21c1bd9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -410,7 +410,7 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc, true)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3