From d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Aug 2018 10:36:56 -0600 Subject: block: add REQ_HIPRI and inherit it from IOCB_HIPRI We use IOCB_HIPRI to poll for IO in the caller instead of scheduling. This information is not available for (or after) IO submission. The driver may make different queue choices based on the type of IO, so make the fact that we will poll for this IO known to the lower layers as well. Reviewed-by: Hannes Reinecke Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 ++ fs/direct-io.c | 2 ++ fs/iomap.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index a80b4f0ee7c4..c039abfb2052 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -232,6 +232,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_HIPRI) + bio.bi_opf |= REQ_HIPRI; qc = submit_bio(&bio); for (;;) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 722d17c88edb..ea07d5a34317 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } + if (iocb->ki_flags & IOCB_HIPRI) + dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/iomap.c b/fs/iomap.c index 64ce240217a1..f61d13dfdf09 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1553,6 +1553,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); + int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); @@ -1561,9 +1562,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->iocb->ki_flags & IOCB_HIPRI) + flags |= REQ_HIPRI; + get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); atomic_inc(&dio->ref); return submit_bio(bio); @@ -1662,6 +1666,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + iov_iter_advance(dio->submit.iter, n); dio->size += n; -- cgit v1.2.3 From 0619317ff8baa2da9238191ad5167ed3618c16d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Nov 2018 21:16:54 -0700 Subject: block: add polled wakeup task helper If we're polling for IO on a device that doesn't use interrupts, then IO completion loop (and wake of task) is done by submitting task itself. If that is the case, then we don't need to enter the wake_up_process() function, we can simply mark ourselves as TASK_RUNNING. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- fs/iomap.c | 2 +- include/linux/blkdev.h | 13 +++++++++++++ mm/page_io.c | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index c039abfb2052..9fe56672cfe5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } static ssize_t @@ -305,7 +305,7 @@ static void blkdev_bio_end_io(struct bio *bio) struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } } diff --git a/fs/iomap.c b/fs/iomap.c index f61d13dfdf09..b0462b363bad 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1525,7 +1525,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (dio->wait_for_completion) { struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41aaa05e42c1..91c44f7a7f62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1772,4 +1772,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, #endif /* CONFIG_BLOCK */ +static inline void blk_wake_io_task(struct task_struct *waiter) +{ + /* + * If we're polling, the task itself is doing the completions. For + * that case, we don't need to signal a wakeup, it's enough to just + * mark us as RUNNING. + */ + if (waiter == current) + __set_current_state(TASK_RUNNING); + else + wake_up_process(waiter); +} + #endif diff --git a/mm/page_io.c b/mm/page_io.c index d4d1c89bcddd..57572ff46016 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -140,7 +140,7 @@ out: unlock_page(page); WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); - wake_up_process(waiter); + blk_wake_io_task(waiter); put_task_struct(waiter); } -- cgit v1.2.3 From d34513d384487e8022f143a3a6b791e6d7f0dad6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Nov 2018 14:29:11 -0700 Subject: block: for async O_DIRECT, mark us as polling if asked to Inherit the iocb IOCB_HIPRI flag, and pass on REQ_HIPRI for those kinds of requests. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9fe56672cfe5..e72b119ede84 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -383,6 +383,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { + if (iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + qc = submit_bio(bio); break; } -- cgit v1.2.3 From cb700eb3faa488fbb4b60689adec84032d7cf24a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Nov 2018 19:56:53 -0700 Subject: block: don't plug for aio/O_DIRECT HIPRI IO Those will go straight to issue inside blk-mq, so don't bother setting up a block plug for them. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index e72b119ede84..4d79bc80fb41 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -330,6 +330,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; @@ -353,7 +354,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->multi_bio = false; dio->should_dirty = is_read && iter_is_iovec(iter); - blk_start_plug(&plug); + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -400,7 +407,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - blk_finish_plug(&plug); + + if (!is_poll) + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; -- cgit v1.2.3 From 849a370016a5489c49253338507ee6cc4a08df4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Nov 2018 08:37:34 -0700 Subject: block: avoid ordered task state change for polled IO For the core poll helper, the task state setting don't need to imply any atomics, as it's the current task itself that is being modified and we're not going to sleep. For IRQ driven, the wakeup path have the necessary barriers to not need us using the heavy handed version of the task state setting. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- fs/block_dev.c | 7 +++++-- fs/iomap.c | 3 ++- mm/page_io.c | 3 ++- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/block/blk-mq.c b/block/blk-mq.c index 32b246ed44c0..7fc4abb4cc36 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3331,12 +3331,12 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) ret = q->mq_ops->poll(hctx, rq->tag); if (ret > 0) { hctx->poll_success++; - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); return true; } if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (current->state == TASK_RUNNING) return true; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4d79bc80fb41..64ba27b8b754 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,9 +237,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, qc = submit_bio(&bio); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) break; + if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); @@ -415,7 +417,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) break; diff --git a/fs/iomap.c b/fs/iomap.c index b0462b363bad..c5df035ace6f 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1888,7 +1888,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) break; diff --git a/mm/page_io.c b/mm/page_io.c index 57572ff46016..a7271fa481f6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -405,7 +405,8 @@ int swap_readpage(struct page *page, bool synchronous) bio_get(bio); qc = submit_bio(bio); while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) break; -- cgit v1.2.3 From 76dc891395dc61e92e2ff31b6161815ce5eb715b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 20 Nov 2018 10:52:36 +0900 Subject: aio: Fix fallback I/O priority value For cases when the application does not specify aio_reqprio for an aio, fallback to use get_current_ioprio() to obtain the task I/O priority last set using ioprio_set() rather than the hardcoded IOPRIO_CLASS_NONE value. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Adam Manzanares Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 301e6314183b..b984918be4b7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1441,7 +1441,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) req->ki_ioprio = iocb->aio_reqprio; } else - req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) -- cgit v1.2.3 From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Nov 2018 08:24:43 -0700 Subject: block: make blk_poll() take a parameter on whether to spin or not blk_poll() has always kept spinning until it found an IO. This is fine for SYNC polling, since we need to find one request we have pending, but in preparation for ASYNC polling it can be beneficial to just check if we have any entries available or not. Existing callers are converted to pass in 'spin == true', to retain the old behavior. Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++++--- block/blk-mq.c | 6 +++--- drivers/nvme/host/multipath.c | 4 ++-- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 ++-- mm/page_io.c | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 03c4202b69bf..9af56dbb84f1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1277,19 +1277,22 @@ EXPORT_SYMBOL(submit_bio); * blk_poll - poll for IO completions * @q: the queue * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions * * Description: * Poll for completions on the passed in queue. Returns number of - * completed entries found. + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). */ -int blk_poll(struct request_queue *q, blk_qc_t cookie) +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { if (!q->poll_fn || !blk_qc_t_valid(cookie)) return 0; if (current->plug) blk_flush_plug_list(current->plug, false); - return q->poll_fn(q, cookie); + return q->poll_fn(q, cookie, spin); } EXPORT_SYMBOL_GPL(blk_poll); diff --git a/block/blk-mq.c b/block/blk-mq.c index b66cca3ce1e5..c2751f0a3ccc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -38,7 +38,7 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -3352,7 +3352,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, return blk_mq_poll_hybrid_sleep(q, hctx, rq); } -static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { struct blk_mq_hw_ctx *hctx; long state; @@ -3392,7 +3392,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie) if (current->state == TASK_RUNNING) return 1; - if (ret < 0) + if (ret < 0 || !spin) break; cpu_relax(); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f9eeb3b58632..ffebdd0ae34b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -220,7 +220,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin) { struct nvme_ns_head *head = q->queuedata; struct nvme_ns *ns; @@ -230,7 +230,7 @@ static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc); + found = ns->queue->poll_fn(q, qc, spin); srcu_read_unlock(&head->srcu, srcu_idx); return found; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 64ba27b8b754..d233a59ea364 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -243,7 +243,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -423,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index ea07d5a34317..a5a4e5a1423e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index c5df035ace6f..74c1f37f0fd6 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1896,7 +1896,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, - dio->submit.cookie)) + dio->submit.cookie, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3015e9b5ae3..e3c0a8ec16a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); -typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t); +typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin); struct bio_vec; typedef int (dma_drain_needed_fn)(struct request *); @@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie); +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index a7271fa481f6..5bdfd21c1bd9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -410,7 +410,7 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc, true)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 531724abc3bfb556c1dd68086cf9cb51f76464e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Nov 2018 09:23:48 +0100 Subject: block: avoid extra bio reference for async O_DIRECT The bio referencing has a trick that doesn't do any actual atomic inc/dec on the reference count until we have to elevator to > 1. For the async IO O_DIRECT case, we can't use the simple DIO variants, so we use __blkdev_direct_IO(). It always grabs an extra reference to the bio after allocation, which means we then enter the slower path of actually having to do atomic_inc/dec on the count. We don't need to do that for the async case, unless we end up going multi-bio, in which case we're already doing huge amounts of IO. For the smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref. Based on an earlier patch (and commit log) from Jens Axboe. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index d233a59ea364..e1886cc7048f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -302,7 +302,8 @@ static void blkdev_bio_end_io(struct bio *bio) } dio->iocb->ki_complete(iocb, ret, 0); - bio_put(&dio->bio); + if (dio->multi_bio) + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; @@ -343,14 +344,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); - bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) + if (dio->is_sync) { dio->waiter = current; - else + bio_get(bio); + } else { dio->iocb = iocb; + } dio->size = 0; dio->multi_bio = false; @@ -400,6 +402,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. + */ + if (!is_sync) + bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { -- cgit v1.2.3 From 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Nov 2018 16:44:07 +0100 Subject: aio: clear IOCB_HIPRI No one is going to poll for aio (yet), so we must clear the HIPRI flag, as we would otherwise send it down the poll queues, where no one will be polling for completions. Signed-off-by: Christoph Hellwig IOCB_HIPRI, not RWF_HIPRI. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- fs/aio.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 205390c0c1bb..05647d352bf3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1436,8 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); - return ret; + goto out_fput; } req->ki_ioprio = iocb->aio_reqprio; @@ -1446,7 +1445,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); + goto out_fput; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; + +out_fput: + fput(req->ki_filp); return ret; } -- cgit v1.2.3 From fd42df305f804ddc0d5ac028e944784283b2f92d Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:34 -0500 Subject: blkcg: associate writeback bios with a blkg One of the goals of this series is to remove a separate reference to the css of the bio. This can and should be accessed via bio_blkcg(). In this patch, wbc_init_bio() now requires a bio to have a device associated with it. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 8 +++++--- block/bio.c | 18 ++++++++++++++++++ fs/buffer.c | 10 +++++----- fs/ext4/page-io.c | 2 +- include/linux/bio.h | 5 +++++ include/linux/writeback.h | 5 +++-- 6 files changed, 37 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 476722b7b636..baf19bf28385 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1879,8 +1879,10 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup. Can be - called anytime between bio allocation and submission. + associates the bio with the inode's owner cgroup and the + corresponding request queue. This must be called after + a queue (device) has been associated with the bio and + before submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1899,7 +1901,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() +cases by skipping wbc_init_bio() and using bio_associate_blkg() directly. diff --git a/block/bio.c b/block/bio.c index f0f069c1823c..b42477b6a225 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2034,6 +2034,24 @@ static void __bio_associate_blkg_from_css(struct bio *bio, rcu_read_unlock(); } +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. This takes a reference on the css that will + * be put upon freeing of @bio. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + css_get(css); + bio->bi_css = css; + __bio_associate_blkg_from_css(bio, css); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + #ifdef CONFIG_MEMCG /** * bio_associate_blkg_from_page - associate a bio with the page's blkg diff --git a/fs/buffer.c b/fs/buffer.c index 1286c2b95498..d60d61e8ed7d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/include/linux/bio.h b/include/linux/bio.h index f13572c254a7..f0438061a5a3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -515,6 +515,8 @@ static inline void bio_associate_blkg_from_page(struct bio *bio, int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); void bio_disassociate_blkg(struct bio *bio); void bio_associate_blkg(struct bio *bio); +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ @@ -522,6 +524,9 @@ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_blkg(struct bio *bio) { } static inline void bio_associate_blkg(struct bio *bio) { } +static inline void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fdfd04e348f6..738a0c24874f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup - * writeback context. + * writeback context. Must be called after the bio has been associated with + * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkcg(bio, wbc->wb->blkcg_css); + bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ -- cgit v1.2.3