diff options
Diffstat (limited to 'block')
47 files changed, 9555 insertions, 3789 deletions
diff --git a/block/Kconfig b/block/Kconfig index 2429515c05c2..161491d0a879 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,7 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF if BLK_DEV_INTEGRITY ---help--- Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer diff --git a/block/Makefile b/block/Makefile index 20645e88fb57..9eda2322b2d4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,13 +2,15 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + badblocks.o partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -18,5 +20,6 @@ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o + diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000000..7be53cb1cc3c --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,585 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/badblocks.h> +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/slab.h> + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error. + */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/bio-integrity.c b/block/bio-integrity.c new file mode 100644 index 000000000000..711e4d8de6fa --- /dev/null +++ b/block/bio-integrity.c @@ -0,0 +1,525 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/export.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/slab.h> + +#define BIP_INLINE_VECS 4 + +static struct kmem_cache *bip_slab; +static struct workqueue_struct *kintegrityd_wq; + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + struct bio_integrity_payload *bip; + struct bio_set *bs = bio->bi_pool; + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs || !bs->bio_integrity_pool) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + inline_vecs = BIP_INLINE_VECS; + } + + if (unlikely(!bip)) + return ERR_PTR(-ENOMEM); + + memset(bip, 0, sizeof(*bip)); + + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); + } else { + bip->bip_vec = bip->bip_inline_vecs; + bip->bip_max_vcnt = inline_vecs; + } + + bip->bip_slab = idx; + bip->bip_bio = bio; + bio->bi_integrity = bip; + bio->bi_rw |= REQ_INTEGRITY; + + return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + + if (bip->bip_flags & BIP_BLOCK_INTEGRITY) + kfree(page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset); + + if (bs && bs->bio_integrity_pool) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + + mempool_free(bip, bs->bio_integrity_pool); + } else { + kfree(bip); + } + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_vec *iv; + + if (bip->bip_vcnt >= bip->bip_max_vcnt) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip->bip_vec + bip->bip_vcnt; + + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +bool bio_integrity_enabled(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + if (!bio_is_rw(bio)) + return false; + + /* Already protected? */ + if (bio_integrity(bio)) + return false; + + if (bi == NULL) + return false; + + if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL && + (bi->flags & BLK_INTEGRITY_VERIFY)) + return true; + + if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL && + (bi->flags & BLK_INTEGRITY_GENERATE)) + return true; + + return false; +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. + */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + +/** + * bio_integrity_process - Process integrity metadata for a bio + * @bio: bio to generate/verify integrity metadata for + * @proc_fn: Pointer to the relevant processing function + */ +static int bio_integrity_process(struct bio *bio, + integrity_processing_fn *proc_fn) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned int ret = 0; + void *prot_buf = page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset; + + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bip_get_seed(bip); + iter.prot_buf = prot_buf; + + bio_for_each_segment(bv, bio, bviter) { + void *kaddr = kmap_atomic(bv.bv_page); + + iter.data_buf = kaddr + bv.bv_offset; + iter.data_size = bv.bv_len; + + ret = proc_fn(&iter); + if (ret) { + kunmap_atomic(kaddr); + return ret; + } + + kunmap_atomic(kaddr); + } + return ret; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int intervals; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + intervals = bio_integrity_intervals(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = intervals * bi->tuple_size; + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -ENOMEM; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (IS_ERR(bip)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return PTR_ERR(bip); + } + + bip->bip_flags |= BIP_BLOCK_INTEGRITY; + bip->bip_iter.bi_size = len; + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + bip->bip_flags |= BIP_IP_CHECKSUM; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_process(bio, bi->profile->generate_fn); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + BUG_ON(bip->bip_bio != bio); + + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. + */ + if (bio->bi_error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio); + + return; + } + + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + bio_integrity_advance(bio, offset << 9); + bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @gfp_mask: Memory allocation mask + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + struct bio_integrity_payload *bip_src = bio_integrity(bio_src); + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_iter = bip_src->bip_iter; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + if (bs->bio_integrity_pool) + return 0; + + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + if (!bs->bio_integrity_pool) + return -1; + + bs->bvec_integrity_pool = biovec_create_pool(pool_size); + if (!bs->bvec_integrity_pool) { + mempool_destroy(bs->bio_integrity_pool); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bvec_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} diff --git a/block/bio.c b/block/bio.c new file mode 100644 index 000000000000..dbabd48b1934 --- /dev/null +++ b/block/bio.c @@ -0,0 +1,2055 @@ +/* + * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/uio.h> +#include <linux/iocontext.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mempool.h> +#include <linux/workqueue.h> +#include <linux/cgroup.h> + +#include <trace/events/block.h> + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { + BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. + */ +struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); + +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + new_bio_slab_max = bio_slab_max << 1; + new_bio_slabs = krealloc(bio_slabs, + new_bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) + goto out_unlock; + bio_slab_max = new_bio_slab_max; + bio_slabs = new_bio_slabs; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) +{ + struct bio_vec *bvl; + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } + + return bvl; +} + +static void __bio_free(struct bio *bio) +{ + bio_disassociate_task(bio); + + if (bio_integrity(bio)) + bio_integrity_free(bio); +} + +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + void *p; + + __bio_free(bio); + + if (bs) { + if (bio_flagged(bio, BIO_OWNS_VEC)) + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + p -= bs->front_pad; + + mempool_free(p, bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } +} + +void bio_init(struct bio *bio) +{ + memset(bio, 0, sizeof(*bio)); + atomic_set(&bio->__bi_remaining, 1); + atomic_set(&bio->__bi_cnt, 1); +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned bio bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio. + */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + __bio_free(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags; + atomic_set(&bio->__bi_remaining, 1); +} +EXPORT_SYMBOL(bio_reset); + +static void bio_chain_endio(struct bio *bio) +{ + struct bio *parent = bio->bi_private; + + parent->bi_error = bio->bi_error; + bio_endio(parent); + bio_put(bio); +} + +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio_set_flag(bio, BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + +/** + * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + bio_inc_remaining(parent); +} +EXPORT_SYMBOL(bio_chain); + +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask: the GFP_ mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * + * Description: + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. + * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +{ + gfp_t saved_gfp = gfp_mask; + unsigned front_pad; + unsigned inline_vecs; + unsigned long idx = BIO_POOL_NONE; + struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0)) + return NULL; + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + + if (unlikely(!p)) + return NULL; + + bio = p + front_pad; + bio_init(bio); + + if (nr_iovecs > inline_vecs) { + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + } + + if (unlikely(!bvl)) + goto err_free; + + bio_set_flag(bio, BIO_OWNS_VEC); + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; + } + + bio->bi_pool = bs; + bio->bi_flags |= idx << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + return bio; + +err_free: + mempool_free(p, bs->bio_pool); + return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +void zero_fill_bio(struct bio *bio) +{ + unsigned long flags; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + flush_dcache_page(bv.bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio); + +/** + * bio_put - release a reference to a bio + * @bio: bio to release reference to + * + * Description: + * Put a reference to a &struct bio, either one you have gotten with + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. + **/ +void bio_put(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_REFFED)) + bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } +} +EXPORT_SYMBOL(bio_put); + +inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +{ + if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) + blk_recount_segments(q, bio); + + return bio->bi_phys_segments; +} +EXPORT_SYMBOL(bio_phys_segments); + +/** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. + */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ + BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); + + /* + * most users will be overriding ->bi_bdev with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_bdev = bio_src->bi_bdev; + bio_set_flag(bio, BIO_CLONED); + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio *b; + + b = bio_alloc_bioset(gfp_mask, 0, bs); + if (!b) + return NULL; + + __bio_clone_fast(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone_fast); + +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. + */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + + bio->bi_bdev = bio_src->bi_bdev; + bio->bi_rw = bio_src->bi_rw; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + +integrity_clone: + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + return bio; +} +EXPORT_SYMBOL(bio_clone_bioset); + +/** + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset) +{ + int retried_segments = 0; + struct bio_vec *bvec; + + /* + * cloned bio must not modify vec list + */ + if (unlikely(bio_flagged(bio, BIO_CLONED))) + return 0; + + if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + prev->bv_len += len; + bio->bi_iter.bi_size += len; + goto done; + } + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (bvec_gap_to_prev(q, prev, offset)) + return 0; + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + /* + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + bio->bi_iter.bi_size += len; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). + */ + + while (bio->bi_phys_segments > queue_max_segments(q)) { + + if (retried_segments) + goto failed; + + retried_segments = 1; + blk_recount_segments(q, bio); + } + + /* If we may be able to merge these biovecs, force a recount */ + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + bio_clear_flag(bio, BIO_SEG_VALID); + + done: + return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + bio->bi_iter.bi_size -= len; + blk_recount_segments(q, bio); + return 0; +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + * bio_add_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. + */ +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_vec *bv; + + /* + * cloned bio must not modify vec list + */ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == bv->bv_page && + offset == bv->bv_offset + bv->bv_len) { + bv->bv_len += len; + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + bv = &bio->bi_io_vec[bio->bi_vcnt]; + bv->bv_page = page; + bv->bv_len = len; + bv->bv_offset = offset; + + bio->bi_vcnt++; +done: + bio->bi_iter.bi_size += len; + return len; +} +EXPORT_SYMBOL(bio_add_page); + +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = bio->bi_error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. + */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio_advance_iter(bio, &bio->bi_iter, bytes); +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter, dst_iter; + struct bio_vec src_bv, dst_bv; + void *src_p, *dst_p; + unsigned bytes; + + src_iter = src->bi_iter; + dst_iter = dst->bi_iter; + + while (1) { + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; + + src_iter = src->bi_iter; + } + + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; + + dst_iter = dst->bi_iter; + } + + src_bv = bio_iter_iovec(src, src_iter); + dst_bv = bio_iter_iovec(dst, dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + bio_advance_iter(src, &src_iter, bytes); + bio_advance_iter(dst, &dst_iter, bytes); + } +} +EXPORT_SYMBOL(bio_copy_data); + +struct bio_map_data { + int is_our_pages; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, + gfp_t gfp_mask) +{ + if (iov_count > UIO_MAXIOV) + return NULL; + + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct iovec) * iov_count, gfp_mask); +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter iter) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +static void bio_free_pages(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free. + */ + if (current->mm && bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(bio_uncopy_user); + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + const struct iov_iter *iter, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio; + int i, ret; + int nr_pages = 0; + unsigned int len = iter->count; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; + + for (i = 0; i < iter->nr_segs; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long) iter->iov[i].iov_base; + end = (uaddr + iter->iov[i].iov_len + PAGE_SIZE - 1) + >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + } + + if (offset) + nr_pages++; + + bmd = bio_alloc_map_data(iter->nr_segs, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + memcpy(bmd->iov, iter->iov, sizeof(struct iovec) * iter->nr_segs); + iov_iter_init(&bmd->iter, iter->type, bmd->iov, + iter->nr_segs, iter->count); + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + if (iter->type & WRITE) + bio->bi_rw |= REQ_WRITE; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + break; + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + /* + * success + */ + if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, *iter); + if (ret) + goto cleanup; + } + + bio->bi_private = bmd; + return bio; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_user_iov(struct request_queue *q, + const struct iov_iter *iter, + gfp_t gfp_mask) +{ + int j; + int nr_pages = 0; + struct page **pages; + struct bio *bio; + int cur_page = 0; + int ret, offset; + struct iov_iter i; + struct iovec iov; + + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages += end - start; + /* + * buffer must be aligned to at least hardsector size for now + */ + if (uaddr & queue_dma_alignment(q)) + return ERR_PTR(-EINVAL); + } + + if (!nr_pages) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!pages) + goto out; + + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; + unsigned long len = iov.iov_len; + unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = uaddr >> PAGE_SHIFT; + const int local_nr_pages = end - start; + const int page_limit = cur_page + local_nr_pages; + + ret = get_user_pages_fast(uaddr, local_nr_pages, + (iter->type & WRITE) != WRITE, + &pages[cur_page]); + if (ret < local_nr_pages) { + ret = -EFAULT; + goto out_unmap; + } + + offset = offset_in_page(uaddr); + for (j = cur_page; j < page_limit; j++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + /* + * sorry... + */ + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) + break; + + len -= bytes; + offset = 0; + } + + cur_page = j; + /* + * release the pages we didn't map into the bio, if any + */ + while (j < page_limit) + page_cache_release(pages[j++]); + } + + kfree(pages); + + /* + * set data direction, and check if mapped pages need bouncing + */ + if (iter->type & WRITE) + bio->bi_rw |= REQ_WRITE; + + bio_set_flag(bio, BIO_USER_MAPPED); + + /* + * subtle -- if __bio_map_user() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + return bio; + + out_unmap: + for (j = 0; j < nr_pages; j++) { + if (!pages[j]) + break; + page_cache_release(pages[j]); + } + out: + kfree(pages); + bio_put(bio); + return ERR_PTR(ret); +} + +static void __bio_unmap_user(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + /* + * make sure we dirty pages we wrote to + */ + bio_for_each_segment_all(bvec, bio, i) { + if (bio_data_dir(bio) == READ) + set_page_dirty_lock(bvec->bv_page); + + page_cache_release(bvec->bv_page); + } + + bio_put(bio); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user(). Must be called with + * a process context. + * + * bio_unmap_user() may sleep. + */ +void bio_unmap_user(struct bio *bio) +{ + __bio_unmap_user(bio); + bio_put(bio); +} +EXPORT_SYMBOL(bio_unmap_user); + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} +EXPORT_SYMBOL(bio_map_kern); + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + bio->bi_rw |= REQ_WRITE; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(bio_copy_kern); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe. So what we can do is to + * mark the pages dirty _before_ performing IO. And in interrupt context, + * check that the pages are still dirty. If so, fine. If not, redirty them + * in process context. + * + * We special-case compound pages here: normally this means reads into hugetlb + * pages. The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all. So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages(). This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, flusher threads) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. + */ +void bio_set_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (page && !PageCompound(page)) + set_page_dirty_lock(page); + } +} + +static void bio_release_pages(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (page) + put_page(page); + } +} + +/* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. + * If they are, then fine. If, however, some pages are clean then they must + * have been written out during the direct-IO read. So we take another ref on + * the BIO and the offending pages and re-dirty the pages in process context. + * + * It is expected that bio_check_pages_dirty() will wholly own the BIO from + * here on. It will run one page_cache_release() against each page and will + * run one bio_put() against the BIO. + */ + +static void bio_dirty_fn(struct work_struct *work); + +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); +static DEFINE_SPINLOCK(bio_dirty_lock); +static struct bio *bio_dirty_list; + +/* + * This runs in process context + */ +static void bio_dirty_fn(struct work_struct *work) +{ + unsigned long flags; + struct bio *bio; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio = bio_dirty_list; + bio_dirty_list = NULL; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + + while (bio) { + struct bio *next = bio->bi_private; + + bio_set_pages_dirty(bio); + bio_release_pages(bio); + bio_put(bio); + bio = next; + } +} + +void bio_check_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + int nr_clean_pages = 0; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (PageDirty(page) || PageCompound(page)) { + page_cache_release(page); + bvec->bv_page = NULL; + } else { + nr_clean_pages++; + } + } + + if (nr_clean_pages) { + unsigned long flags; + + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } else { + bio_put(bio); + } +} + +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + int cpu = part_stat_lock(); + + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + part_inc_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_start_io_acct); + +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_end_io_acct); + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void bio_flush_dcache_pages(struct bio *bi) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + bio_for_each_segment(bvec, bi, iter) + flush_dcache_page(bvec.bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + bio_clear_flag(bio, BIO_CHAIN); + return true; + } + + return false; +} + +/** + * bio_endio - end I/O on a bio + * @bio: bio + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred + * way to end I/O on a bio. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io function. + **/ +void bio_endio(struct bio *bio) +{ + while (bio) { + if (unlikely(!bio_remaining_done(bio))) + break; + + /* + * Need to have a real endio function for chained bios, + * otherwise various corner cases will break (like stacking + * block devices that save/restore bi_end_io) - however, we want + * to avoid unbounded recursion and blowing the stack. Tail call + * optimization would handle this, but compiling with frame + * pointers also disables gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + struct bio *parent = bio->bi_private; + parent->bi_error = bio->bi_error; + bio_put(bio); + bio = parent; + } else { + if (bio->bi_end_io) + bio->bi_end_io(bio); + bio = NULL; + } + } +} +EXPORT_SYMBOL(bio_endio); + +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * Unless this is a discard request the newly allocated bio will point + * to @bio's bi_io_vec; it is the caller's responsibility to ensure that + * @bio is not freed before the split. + */ +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *split = NULL; + + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); + + /* + * Discards need a mutable bio_vec to accommodate the payload + * required by the DSM TRIM and UNMAP commands. + */ + if (bio->bi_rw & REQ_DISCARD) + split = bio_clone_bioset(bio, gfp, bs); + else + split = bio_clone_fast(bio, gfp, bs); + + if (!split) + return NULL; + + split->bi_iter.bi_size = sectors << 9; + + if (bio_integrity(split)) + bio_integrity_trim(split, 0, sectors); + + bio_advance(bio, split->bi_iter.bi_size); + + return split; +} +EXPORT_SYMBOL(bio_split); + +/** + * bio_trim - trim a bio + * @bio: bio to trim + * @offset: number of sectors to trim from the front of @bio + * @size: size we want to trim @bio to, in sectors + */ +void bio_trim(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + */ + + size <<= 9; + if (offset == 0 && size == bio->bi_iter.bi_size) + return; + + bio_clear_flag(bio, BIO_SEG_VALID); + + bio_advance(bio, offset << 9); + + bio->bi_iter.bi_size = size; +} +EXPORT_SYMBOL_GPL(bio_trim); + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. + */ +mempool_t *biovec_create_pool(int pool_entries) +{ + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; + + return mempool_create_slab_pool(pool_entries, bp->slab); +} + +void bioset_free(struct bio_set *bs) +{ + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + + bioset_integrity_free(bs); + bio_put_slab(bs); + + kfree(bs); +} +EXPORT_SYMBOL(bioset_free); + +static struct bio_set *__bioset_create(unsigned int pool_size, + unsigned int front_pad, + bool create_bvec_pool) +{ + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + struct bio_set *bs; + + bs = kzalloc(sizeof(*bs), GFP_KERNEL); + if (!bs) + return NULL; + + bs->front_pad = front_pad; + + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); + if (!bs->bio_pool) + goto bad; + + if (create_bvec_pool) { + bs->bvec_pool = biovec_create_pool(pool_size); + if (!bs->bvec_pool) + goto bad; + } + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + + return bs; +bad: + bioset_free(bs); + return NULL; +} + +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) +{ + return __bioset_create(pool_size, front_pad, true); +} +EXPORT_SYMBOL(bioset_create); + +/** + * bioset_create_nobvec - Create a bio_set without bio_vec mempool + * @pool_size: Number of bio to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Same functionality as bioset_create() except that mempool is not + * created for bio_vecs. Saving some memory for bio_clone_fast() users. + */ +struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad) +{ + return __bioset_create(pool_size, front_pad, false); +} +EXPORT_SYMBOL(bioset_create_nobvec); + +#ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. + */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} +EXPORT_SYMBOL_GPL(bio_associate_blkcg); + +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet. Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released. The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. + */ +int bio_associate_current(struct bio *bio) +{ + struct io_context *ioc; + + if (bio->bi_css) + return -EBUSY; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + get_io_context_active(ioc); + bio->bi_ioc = ioc; + bio->bi_css = task_get_css(current, io_cgrp_id); + return 0; +} +EXPORT_SYMBOL_GPL(bio_associate_current); + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ + if (bio->bi_ioc) { + put_io_context(bio->bi_ioc); + bio->bi_ioc = NULL; + } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } +} + +#endif /* CONFIG_BLK_CGROUP */ + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); + + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!fs_bio_set) + panic("bio: can't allocate bios\n"); + + if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + + return 0; +} +subsys_initcall(init_bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1039fb9ff5f5..5a37188b559f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -9,29 +9,46 @@ * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> + * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> -#include "blk-cgroup.h" +#include <linux/ctype.h> +#include <linux/blk-cgroup.h> #include "blk.h" #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -52,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -77,10 +99,14 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - blkg->refcnt = 1; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -97,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -113,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -150,62 +161,53 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) { - ret = -EINVAL; + if (!css_tryget_online(&blkcg->css)) { + ret = -ENODEV; goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_NOWAIT); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; - goto err_put_css; + ret = -ENODEV; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -215,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -229,24 +231,21 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -281,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -305,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -323,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -340,15 +345,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -374,6 +370,9 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } + + q->root_blkg = NULL; + q->root_rl.blkg = NULL; } /* @@ -387,23 +386,13 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) { - spin_lock_irq(blkg->q->queue_lock); + if (blkg->parent) blkg_put(blkg->parent); - spin_unlock_irq(blkg->q->queue_lock); - } + + wb_congested_put(blkg->wb_congested); blkg_free(blkg); } @@ -451,20 +440,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; - /* - * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex - * which ends up putting cgroup's internal cgroup_tree_mutex under - * it; however, cgroup_tree_mutex is nested above cgroup file - * active protection and grabbing blkcg_pol_mutex from a cgroup - * file operation creates a possible circular dependency. cgroup - * internal locking is planned to go through further simplification - * and this issue should go away soon. For now, let's trylock - * blkcg_pol_mutex and restart the write on failure. - * - * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com - */ - if (!mutex_trylock(&blkcg_pol_mutex)) - return restart_syscall(); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* @@ -473,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -487,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -582,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -621,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg + * + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -655,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -701,26 +778,35 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk || part) + body = input + key_len; + if (!isspace(*body)) return -EINVAL; + body = skip_spaces(body); + + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk) + return -ENODEV; + if (part) { + put_disk(disk); + return -ENODEV; + } rcu_read_lock(); spin_lock_irq(disk->queue->queue_lock); @@ -728,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -750,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -771,8 +857,56 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -812,40 +946,91 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; + + mutex_lock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) - kfree(blkcg); + list_del(&blkcg->all_blkcgs_node); + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { - static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; + + mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; - goto done; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_alloc_fn) + continue; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; + cpd->plid = i; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; - blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); +free_blkcg: + kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + return ret; } /** @@ -860,9 +1045,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); - return blk_throtl_init(q); + if (preloaded) + radix_tree_preload_end(); + + if (IS_ERR(blkg)) { + blkg_free(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -875,6 +1096,13 @@ void blkcg_drain_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); + /* + * @q could be exiting and already have destroyed all blkgs as + * indicated by NULL root_blkg. If so, don't confuse policies. + */ + if (!q->root_blkg) + return; + blk_throtl_drain(q); } @@ -899,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -919,14 +1147,45 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .base_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", +#ifdef CONFIG_MEMCG + /* + * This ensures that, if available, memcg is automatically enabled + * together on the default hierarchy so that the owner cgroup can + * be retrieved from writeback pages. + */ + .depends_on = 1 << memory_cgrp_id, +#endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -947,96 +1206,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; - int cnt = 0, ret; - bool preloaded; + struct blkg_policy_data *pd_prealloc = NULL; + struct blkcg_gq *blkg; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - - spin_unlock_irq(q->queue_lock); - - /* allocate policy_data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1062,21 +1279,16 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1095,11 +1307,10 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1108,18 +1319,55 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); + goto err_free_cpds; + } + + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; + cpd->plid = pol->plid; + pol->cpd_init_fn(cpd); + } + } + + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); - ret = 0; -out_unlock: + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1132,18 +1380,34 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - mutex_lock(&blkcg_pol_mutex); + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); + + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); - /* unregister and update blkgs */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } blkcg_policy[pol->plid] = NULL; -out_unlock: + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h deleted file mode 100644 index 371fe8e92ab5..000000000000 --- a/block/blk-cgroup.h +++ /dev/null @@ -1,609 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - * - * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> - * Paolo Valente <paolo.valente@unimore.it> - * - * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> - * Nauman Rafique <nauman@google.com> - */ - -#include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> -#include <linux/seq_file.h> -#include <linux/radix-tree.h> -#include <linux/blkdev.h> - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* for policies to test whether associated blkcg has changed */ - uint64_t id; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - int refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(css_parent(&blkcg->css)); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "<unavailable>", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding queue_lock and an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(!blkg->refcnt); - blkg->refcnt++; -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - * - * The caller should be holding queue_lock. - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(blkg->refcnt <= 0); - if (!--blkg->refcnt) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. - */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. - */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index a0e3096c4bb5..ab51685988c2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,17 +32,18 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); @@ -50,7 +51,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation @@ -62,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -82,18 +108,14 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * Will return NULL if the request queue cannot be located. + * backing_dev_info. This function can only be called if @bdev is opened + * and the return value is never NULL. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct backing_dev_info *ret = NULL; struct request_queue *q = bdev_get_queue(bdev); - if (q) - ret = &q->backing_dev_info; - return ret; + return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); @@ -121,18 +143,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = error; if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -146,8 +166,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg) printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); @@ -187,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) EXPORT_SYMBOL(blk_delay_queue); /** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + +/** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * @@ -239,7 +275,7 @@ EXPORT_SYMBOL(blk_stop_queue); * this function. * * This function does not cancel any asynchronous activity arising - * out of elevator or throttling code. That would require elevaotor_exit() + * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ @@ -251,8 +287,10 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->delayed_work); + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->delay_work); + } } else { cancel_delayed_work_sync(&q->delay_work); } @@ -286,6 +324,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue @@ -391,11 +430,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) * be drained. Check all the queues and counters. */ if (drain_all) { + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; - drain |= !list_empty(&q->flush_queue[i]); + if (fq) + drain |= !list_empty(&fq->flush_queue[i]); } } @@ -435,14 +476,17 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) */ void blk_queue_bypass_start(struct request_queue *q) { - bool drain; - spin_lock_irq(q->queue_lock); - drain = !q->bypass_depth++; + q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); spin_unlock_irq(q->queue_lock); - if (drain) { + /* + * Queues start drained. Skip actual draining till init is + * complete. This avoids lenghty delays during queue init which + * can happen many times during boot. + */ + if (blk_queue_init_done(q)) { spin_lock_irq(q->queue_lock); __blk_drain_queue(q, false); spin_unlock_irq(q->queue_lock); @@ -469,6 +513,25 @@ void blk_queue_bypass_end(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_bypass_end); +void blk_set_queue_dying(struct request_queue *q) +{ + queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + + if (q->mq_ops) + blk_mq_wake_waiters(q); + else { + struct request_list *rl; + + blk_queue_for_each_rl(rl, q) { + if (rl->rq_pool) { + wake_up(&rl->wait[BLK_RW_SYNC]); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + } +} +EXPORT_SYMBOL_GPL(blk_set_queue_dying); + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -482,7 +545,7 @@ void blk_cleanup_queue(struct request_queue *q) /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); - queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + blk_set_queue_dying(q); spin_lock_irq(lock); /* @@ -507,30 +570,48 @@ void blk_cleanup_queue(struct request_queue *q) * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. */ - if (q->mq_ops) { - blk_mq_drain_queue(q); - spin_lock_irq(lock); - } else { - spin_lock_irq(lock); + blk_freeze_queue(q); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* for synchronous bio-based driver finish in-flight integrity i/o */ + blk_flush_integrity(); + /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + percpu_ref_exit(&q->q_usage_counter); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); + bdi_unregister(&q->backing_dev_info); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); +/* Allocate memory local to the request queue */ +static void *alloc_request_struct(gfp_t gfp_mask, void *data) +{ + int nid = (int)(long)data; + return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); +} + +static void free_request_struct(void *element, void *unused) +{ + kmem_cache_free(request_cachep, element); +} + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { @@ -543,9 +624,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, - gfp_mask, q->node); + rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct, + free_request_struct, + (void *)(long)q->node, gfp_mask, + q->node); if (!rl->rq_pool) return -ENOMEM; @@ -564,6 +646,47 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +int blk_queue_enter(struct request_queue *q, bool nowait) +{ + while (true) { + int ret; + + if (percpu_ref_tryget_live(&q->q_usage_counter)) + return 0; + + if (nowait) + return -EBUSY; + + ret = wait_event_interruptible(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + if (ret) + return ret; + } +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + wake_up_all(&q->mq_freeze_wq); +} + +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -574,23 +697,23 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; + + q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!q->bio_split) + goto fail_id; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -601,9 +724,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif - INIT_LIST_HEAD(&q->flush_queue[0]); - INIT_LIST_HEAD(&q->flush_queue[1]); - INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -628,17 +748,28 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_waitqueue_head(&q->mq_freeze_wq); - if (blkcg_init_queue(q)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; + if (blkcg_init_queue(q)) + goto fail_ref; + return q; +fail_ref: + percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); +fail_split: + bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -701,6 +832,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); + struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) @@ -708,13 +841,14 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); - if (!q->flush_rq) + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); + if (!q->fq) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; @@ -745,7 +879,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, return q; fail: - kfree(q->flush_rq); + blk_free_flush_queue(q->fq); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -810,13 +944,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -846,6 +975,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + int on_thresh, off_thresh; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); + + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. @@ -891,9 +1061,9 @@ static struct io_context *rq_ioc(struct bio *bio) * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -907,7 +1077,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, int may_queue; if (unlikely(blk_queue_dying(q))) - return NULL; + return ERR_PTR(-ENODEV); may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -932,16 +1102,11 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * process is not a "batcher", and not * exempted by the IO scheduler */ - return NULL; + return ERR_PTR(-ENOMEM); } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -950,7 +1115,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return NULL; + return ERR_PTR(-ENOMEM); q->nr_rqs[is_sync]++; rl->count[is_sync]++; @@ -1023,8 +1188,8 @@ fail_elvpriv: * shouldn't stall IO. Treat this request as !elvpriv. This will * disturb iosched and blkcg but weird is bettern than dead. */ - printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", - dev_name(q->backing_dev_info.dev)); + printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", + __func__, dev_name(q->backing_dev_info.dev)); rq->cmd_flags &= ~REQ_ELVPRIV; rq->elv.icq = NULL; @@ -1055,7 +1220,7 @@ fail_alloc: rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; - return NULL; + return ERR_PTR(-ENOMEM); } /** @@ -1065,12 +1230,12 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * - * Must be callled with @q->queue_lock held and, - * Returns %NULL on failure, with @q->queue_lock held. - * Returns !%NULL on success, with @q->queue_lock *not held*. + * Must be called with @q->queue_lock held and, + * Returns ERR_PTR on failure, with @q->queue_lock held. + * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -1083,12 +1248,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, rw_flags, bio, gfp_mask); - if (rq) + if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); - return NULL; + return rq; } /* wait on @rl and retry */ @@ -1125,7 +1290,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, spin_lock_irq(q->queue_lock); rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) + if (IS_ERR(rq)) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ @@ -1135,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1164,11 +1331,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ @@ -1177,8 +1344,10 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); - if (unlikely(!rq)) - return ERR_PTR(-ENOMEM); + if (IS_ERR(rq)) + return rq; + + blk_rq_set_block_pc(rq); for_each_bio(bio) { struct bio *bounce_bio = bio; @@ -1197,6 +1366,21 @@ struct request *blk_make_request(struct request_queue *q, struct bio *bio, EXPORT_SYMBOL(blk_make_request); /** + * blk_rq_set_block_pc - initialize a request to type BLOCK_PC + * @rq: request to be initialized + * + */ +void blk_rq_set_block_pc(struct request *rq) +{ + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); +} +EXPORT_SYMBOL(blk_rq_set_block_pc); + +/** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted @@ -1212,7 +1396,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (blk_rq_tagged(rq)) + if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1231,12 +1415,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1268,7 +1455,7 @@ void part_round_stats(int cpu, struct hd_struct *part) } EXPORT_SYMBOL_GPL(part_round_stats); -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) @@ -1360,7 +1547,6 @@ void blk_add_request_payload(struct request *rq, struct page *page, rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; - rq->buffer = bio_data(bio); } EXPORT_SYMBOL_GPL(blk_add_request_payload); @@ -1402,12 +1588,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bio->bi_next = req->bio; req->bio = bio; - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); @@ -1421,6 +1601,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, @@ -1432,18 +1615,18 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1457,8 +1640,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1478,6 +1669,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1492,7 +1707,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1507,9 +1722,12 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + blk_queue_split(q, &bio, q->bio_split); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio->bi_error = -EIO; + bio_endio(bio); + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1522,8 +1740,11 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); @@ -1559,8 +1780,9 @@ get_rq: * Returns with the queue unlocked. */ req = get_request(q, rw_flags, bio, GFP_NOIO); - if (unlikely(!req)) { - bio_endio(bio, -ENODEV); /* @q is dead */ + if (IS_ERR(req)) { + bio->bi_error = PTR_ERR(req); + bio_endio(bio); goto out_unlock; } @@ -1598,8 +1820,9 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } -EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location @@ -1630,8 +1853,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1654,7 +1875,7 @@ static int __init fail_make_request_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); @@ -1722,15 +1943,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1779,14 +1991,15 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; end_io: - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); return false; } @@ -1814,12 +2027,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -1833,7 +2047,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -1856,11 +2070,23 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, false) == 0)) { + ret = q->make_request_fn(q, bio); + + blk_queue_exit(q); + + bio = bio_list_pop(current->bio_list); + } else { + struct bio *bio_next = bio_list_pop(current->bio_list); - bio = bio_list_pop(current->bio_list); + bio_io_error(bio); + bio = bio_next; + } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -1874,7 +2100,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -1908,12 +2134,13 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -1924,20 +2151,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -1957,7 +2177,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -1969,13 +2188,20 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; + if (q->mq_ops) { + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); + blk_mq_insert_request(rq, false, true, true); + return 0; + } + spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); @@ -2084,7 +2310,7 @@ void blk_account_io_done(struct request *req) } } -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM /* * Don't process normal requests when queue is suspended * or in the process of suspending/resuming @@ -2350,11 +2576,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes; + trace_block_rq_complete(req->q, req, nr_bytes); + if (!req->bio) return false; - trace_block_rq_complete(req->q, req, nr_bytes); - /* * For fs requests, rq is just carrier of independent bio's * and each partial completion should be handled separately. @@ -2394,8 +2620,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) error_type = "I/O"; break; } - printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", - error_type, req->rq_disk ? + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", + __func__, error_type, req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); @@ -2434,7 +2660,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } req->__data_len -= total_bytes; - req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (req->cmd_type == REQ_TYPE_FS) @@ -2503,9 +2728,9 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error) { - if (blk_rq_tagged(req)) + if (req->cmd_flags & REQ_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); @@ -2529,6 +2754,7 @@ static void blk_finish_request(struct request *req, int error) __blk_put_request(req->q, req); } } +EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request @@ -2752,10 +2978,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; - if (bio_has_data(bio)) { + if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->buffer = bio_data(bio); - } + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; @@ -2831,12 +3056,12 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2857,7 +3082,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. @@ -2874,10 +3099,8 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, if (!bs) bs = fs_bio_set; - blk_rq_init(NULL, rq); - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_bioset(bio_src, gfp_mask, bs); + bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; @@ -2904,20 +3127,25 @@ free_and_out: } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_delayed_work(struct request_queue *q, - struct delayed_work *dwork, unsigned long delay) +int kblockd_schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work(kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work); -#define PLUG_MAGIC 0x91827364 +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct @@ -2937,22 +3165,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; - plug->magic = PLUG_MAGIC; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3034,8 +3260,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) LIST_HEAD(list); unsigned int depth; - BUG_ON(plug->magic != PLUG_MAGIC); - flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) @@ -3101,14 +3325,56 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); -#ifdef CONFIG_PM_RUNTIME +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +#ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine * @q: the queue of the device @@ -3164,6 +3430,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3191,6 +3460,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3215,6 +3487,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3237,6 +3512,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; @@ -3254,19 +3532,18 @@ EXPORT_SYMBOL(blk_post_runtime_resume); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * - sizeof(((struct request *)0)->cmd_flags)); + FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", - WQ_MEM_RECLAIM | WQ_HIGHPRI | - WQ_POWER_EFFICIENT, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; diff --git a/block/blk-exec.c b/block/blk-exec.c index dbf4502b1d67..3fec8a29d0fa 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,9 +53,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); + WARN_ON(rq->cmd_type == REQ_TYPE_FS); rq->rq_disk = bd_disk; rq->end_io = done; @@ -69,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. - */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -87,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -132,6 +123,11 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, if (rq->errors) err = -EIO; + if (rq->sense == sense) { + rq->sense = NULL; + rq->sense_len = 0; + } + return err; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index 43e6b4755e9a..9c423e53324a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -28,7 +28,7 @@ * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at - * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * flush is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of FLUSH/FUA @@ -73,6 +73,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-mq-tag.h" /* FLUSH/FUA sequences */ enum { @@ -91,7 +92,8 @@ enum { FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static bool blk_kick_flush(struct request_queue *q); +static bool blk_kick_flush(struct request_queue *q, + struct blk_flush_queue *fq); static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { @@ -126,25 +128,15 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; - - blk_clear_rq_complete(rq); -} - -static void mq_flush_run(struct work_struct *work) -{ - struct request *rq; - - rq = container_of(work, struct request, mq_flush_work); - - memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_insert_request(rq, false, true, false); } static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - INIT_WORK(&rq->mq_flush_work, mq_flush_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_work); + struct request_queue *q = rq->q; + + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); return false; } else { if (add_front) @@ -158,6 +150,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) /** * blk_flush_complete_seq - complete flush sequence * @rq: FLUSH/FUA request being sequenced + * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * @@ -165,16 +158,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. */ -static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, - int error) +static bool blk_flush_complete_seq(struct request *rq, + struct blk_flush_queue *fq, + unsigned int seq, int error) { struct request_queue *q = rq->q; - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); @@ -190,12 +184,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) - q->flush_pending_since = jiffies; + fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending); break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); queued = blk_flush_queue_rq(rq, true); break; @@ -210,7 +204,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, list_del_init(&rq->flush.list); blk_flush_restore_request(rq); if (q->mq_ops) - blk_mq_end_io(rq, error); + blk_mq_end_request(rq, error); else __blk_end_request_all(rq, error); break; @@ -219,7 +213,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG(); } - kicked = blk_kick_flush(q); + kicked = blk_kick_flush(q, fq); return kicked | queued; } @@ -230,15 +224,23 @@ static void flush_end_io(struct request *flush_rq, int error) bool queued = false; struct request *rq, *n; unsigned long flags = 0; + struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - if (q->mq_ops) - spin_lock_irqsave(&q->mq_flush_lock, flags); + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); + hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } - running = &q->flush_queue[q->flush_running_idx]; - BUG_ON(q->flush_pending_idx == q->flush_running_idx); + running = &fq->flush_queue[fq->flush_running_idx]; + BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ - q->flush_running_idx ^= 1; + fq->flush_running_idx ^= 1; if (!q->mq_ops) elv_completed_request(q, flush_rq); @@ -248,7 +250,7 @@ static void flush_end_io(struct request *flush_rq, int error) unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); - queued |= blk_flush_complete_seq(rq, seq, error); + queued |= blk_flush_complete_seq(rq, fq, seq, error); } /* @@ -262,85 +264,88 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) { + if (queued || fq->flush_queue_delayed) { WARN_ON(q->mq_ops); blk_run_queue_async(q); } - q->flush_queue_delayed = 0; + fq->flush_queue_delayed = 0; if (q->mq_ops) - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked + * @fq: flush queue * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock or q->mq_flush_lock) + * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. */ -static bool blk_kick_flush(struct request_queue *q) +static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) { - struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, flush.list); + struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ - if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return false; /* C2 and C3 */ - if (!list_empty(&q->flush_data_in_flight) && + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, - q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return false; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ - q->flush_pending_idx ^= 1; + fq->flush_pending_idx ^= 1; + + blk_rq_init(q, flush_rq); + /* + * Borrow tag from the first request since they can't + * be in flight at the same time. And acquire the tag's + * ownership for flush req. + */ if (q->mq_ops) { - struct blk_mq_ctx *ctx = first_rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx; - blk_mq_rq_init(hctx, q->flush_rq); - q->flush_rq->mq_ctx = ctx; + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->tag = first_rq->tag; + fq->orig_rq = first_rq; - /* - * Reuse the tag value from the fist waiting request, - * with blk-mq the tag is generated during request - * allocation and drivers can rely on it being inside - * the range they asked for. - */ - q->flush_rq->tag = first_rq->tag; - } else { - blk_rq_init(q, q->flush_rq); + hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } - q->flush_rq->cmd_type = REQ_TYPE_FS; - q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq->rq_disk = first_rq->rq_disk; - q->flush_rq->end_io = flush_end_io; + flush_rq->cmd_type = REQ_TYPE_FS; + flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + flush_rq->rq_disk = first_rq->rq_disk; + flush_rq->end_io = flush_end_io; - return blk_flush_queue_rq(q->flush_rq, false); + return blk_flush_queue_rq(flush_rq, false); } static void flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ - if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) blk_run_queue_async(q); } @@ -348,20 +353,20 @@ static void mq_flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; + struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ - spin_lock_irqsave(&q->mq_flush_lock, flags); - if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + spin_lock_irqsave(&fq->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) blk_mq_run_hw_queue(hctx, true); - spin_unlock_irqrestore(&q->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -381,6 +386,7 @@ void blk_insert_flush(struct request *rq) struct request_queue *q = rq->q; unsigned int fflags = q->flush_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); + struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); /* * @policy now records what operations need to be done. Adjust @@ -398,7 +404,7 @@ void blk_insert_flush(struct request *rq) */ if (!policy) { if (q->mq_ops) - blk_mq_end_io(rq, 0); + blk_mq_end_request(rq, 0); else __blk_end_bidi_request(rq, 0, 0, 0); return; @@ -431,52 +437,14 @@ void blk_insert_flush(struct request *rq) if (q->mq_ops) { rq->end_io = mq_flush_data_end_io; - spin_lock_irq(&q->mq_flush_lock); - blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&q->mq_flush_lock); + spin_lock_irq(&fq->mq_flush_lock); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); return; } rq->end_io = flush_data_end_io; - blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); -} - -/** - * blk_abort_flushes - @q is being aborted, abort flush requests - * @q: request_queue being aborted - * - * To be called from elv_abort_queue(). @q is being aborted. Prepare all - * FLUSH/FUA requests for abortion. - * - * CONTEXT: - * spin_lock_irq(q->queue_lock) - */ -void blk_abort_flushes(struct request_queue *q) -{ - struct request *rq, *n; - int i; - - /* - * Requests in flight for data are already owned by the dispatch - * queue or the device driver. Just restore for normal completion. - */ - list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { - list_del_init(&rq->flush.list); - blk_flush_restore_request(rq); - } - - /* - * We need to give away requests on flush queues. Restore for - * normal completion and put them on the dispatch queue. - */ - for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { - list_for_each_entry_safe(rq, n, &q->flush_queue[i], - flush.list) { - list_del_init(&rq->flush.list); - blk_flush_restore_request(rq); - list_add_tail(&rq->queuelist, &q->queue_head); - } - } + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); } /** @@ -532,7 +500,43 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -void blk_mq_init_flush(struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size) { - spin_lock_init(&q->mq_flush_lock); + struct blk_flush_queue *fq; + int rq_sz = sizeof(struct request); + + fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node); + if (!fq) + goto fail; + + if (q->mq_ops) { + spin_lock_init(&fq->mq_flush_lock); + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); + } + + fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); + if (!fq->flush_rq) + goto fail_rq; + + INIT_LIST_HEAD(&fq->flush_queue[0]); + INIT_LIST_HEAD(&fq->flush_queue[1]); + INIT_LIST_HEAD(&fq->flush_data_in_flight); + + return fq; + + fail_rq: + kfree(fq); + fail: + return NULL; +} + +void blk_free_flush_queue(struct blk_flush_queue *fq) +{ + /* bio based request queue hasn't flush queue */ + if (!fq) + return; + + kfree(fq->flush_rq); + kfree(fq); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 7fbab84399e6..d69c5c79f98e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> @@ -29,10 +30,6 @@ #include "blk.h" -static struct kmem_cache *integrity_cachep; - -static const char *bi_unsupported_name = "unsupported"; - /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -145,40 +142,40 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); */ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) { - struct blk_integrity *b1 = gd1->integrity; - struct blk_integrity *b2 = gd2->integrity; + struct blk_integrity *b1 = &gd1->queue->integrity; + struct blk_integrity *b2 = &gd2->queue->integrity; - if (!b1 && !b2) + if (!b1->profile && !b2->profile) return 0; - if (!b1 || !b2) + if (!b1->profile || !b2->profile) return -1; - if (b1->sector_size != b2->sector_size) { - printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->sector_size, b2->sector_size); + if (b1->interval_exp != b2->interval_exp) { + pr_err("%s: %s/%s protection interval %u != %u\n", + __func__, gd1->disk_name, gd2->disk_name, + 1 << b1->interval_exp, 1 << b2->interval_exp); return -1; } if (b1->tuple_size != b2->tuple_size) { - printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tuple_size, b2->tuple_size); return -1; } if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + pr_err("%s: %s/%s tag sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tag_size, b2->tag_size); return -1; } - if (strcmp(b1->name, b2->name)) { - printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + if (b1->profile != b2->profile) { + pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->name, b2->name); + b1->profile->name, b2->profile->name); return -1; } @@ -186,37 +183,56 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); -int blk_integrity_merge_rq(struct request_queue *q, struct request *req, - struct request *next) +bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) { - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return -1; + if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) + return true; + + if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) + return false; + + if (bio_integrity(req->bio)->bip_flags != + bio_integrity(next->bio)->bip_flags) + return false; if (req->nr_integrity_segments + next->nr_integrity_segments > q->limits.max_integrity_segments) - return -1; + return false; - return 0; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); -int blk_integrity_merge_bio(struct request_queue *q, struct request *req, - struct bio *bio) +bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) { int nr_integrity_segs; struct bio *next = bio->bi_next; + if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) + return true; + + if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL) + return false; + + if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) + return false; + bio->bi_next = NULL; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); bio->bi_next = next; if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) - return -1; + return false; req->nr_integrity_segments += nr_integrity_segs; - return 0; + return true; } EXPORT_SYMBOL(blk_integrity_merge_bio); @@ -229,8 +245,8 @@ struct integrity_sysfs_entry { static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); @@ -241,8 +257,8 @@ static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); ssize_t ret = 0; @@ -255,56 +271,65 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { - if (bi != NULL && bi->name != NULL) - return sprintf(page, "%s\n", bi->name); + if (bi->profile && bi->profile->name) + return sprintf(page, "%s\n", bi->profile->name); else return sprintf(page, "none\n"); } static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) { - if (bi != NULL) - return sprintf(page, "%u\n", bi->tag_size); - else - return sprintf(page, "0\n"); + return sprintf(page, "%u\n", bi->tag_size); } -static ssize_t integrity_read_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + bi->interval_exp ? 1 << bi->interval_exp : 0); +} + +static ssize_t integrity_verify_store(struct blk_integrity *bi, + const char *page, size_t count) { char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); if (val) - bi->flags |= INTEGRITY_FLAG_READ; + bi->flags |= BLK_INTEGRITY_VERIFY; else - bi->flags &= ~INTEGRITY_FLAG_READ; + bi->flags &= ~BLK_INTEGRITY_VERIFY; return count; } -static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0); } -static ssize_t integrity_write_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t integrity_generate_store(struct blk_integrity *bi, + const char *page, size_t count) { char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); if (val) - bi->flags |= INTEGRITY_FLAG_WRITE; + bi->flags |= BLK_INTEGRITY_GENERATE; else - bi->flags &= ~INTEGRITY_FLAG_WRITE; + bi->flags &= ~BLK_INTEGRITY_GENERATE; return count; } -static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); +} + +static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -317,23 +342,35 @@ static struct integrity_sysfs_entry integrity_tag_size_entry = { .show = integrity_tag_size_show, }; -static struct integrity_sysfs_entry integrity_read_entry = { +static struct integrity_sysfs_entry integrity_interval_entry = { + .attr = { .name = "protection_interval_bytes", .mode = S_IRUGO }, + .show = integrity_interval_show, +}; + +static struct integrity_sysfs_entry integrity_verify_entry = { .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, - .show = integrity_read_show, - .store = integrity_read_store, + .show = integrity_verify_show, + .store = integrity_verify_store, }; -static struct integrity_sysfs_entry integrity_write_entry = { +static struct integrity_sysfs_entry integrity_generate_entry = { .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, - .show = integrity_write_show, - .store = integrity_write_store, + .show = integrity_generate_show, + .store = integrity_generate_store, +}; + +static struct integrity_sysfs_entry integrity_device_entry = { + .attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO }, + .show = integrity_device_show, }; static struct attribute *integrity_attrs[] = { &integrity_format_entry.attr, &integrity_tag_size_entry.attr, - &integrity_read_entry.attr, - &integrity_write_entry.attr, + &integrity_interval_entry.attr, + &integrity_verify_entry.attr, + &integrity_generate_entry.attr, + &integrity_device_entry.attr, NULL, }; @@ -342,115 +379,89 @@ static const struct sysfs_ops integrity_ops = { .store = &integrity_attr_store, }; -static int __init blk_dev_integrity_init(void) -{ - integrity_cachep = kmem_cache_create("blkdev_integrity", - sizeof(struct blk_integrity), - 0, SLAB_PANIC, NULL); - return 0; -} -subsys_initcall(blk_dev_integrity_init); - -static void blk_integrity_release(struct kobject *kobj) -{ - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); - - kmem_cache_free(integrity_cachep, bi); -} - static struct kobj_type integrity_ktype = { .default_attrs = integrity_attrs, .sysfs_ops = &integrity_ops, - .release = blk_integrity_release, }; -bool blk_integrity_is_initialized(struct gendisk *disk) +static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - struct blk_integrity *bi = blk_get_integrity(disk); - - return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); + return 0; } -EXPORT_SYMBOL(blk_integrity_is_initialized); + +static struct blk_integrity_profile nop_profile = { + .name = "nop", + .generate_fn = blk_integrity_nop_fn, + .verify_fn = blk_integrity_nop_fn, +}; /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: optional integrity profile to register + * @template: block integrity profile to register * - * Description: When a device needs to advertise itself as being able - * to send/receive integrity metadata it must use this function to - * register the capability with the block layer. The template is a - * blk_integrity struct with values appropriate for the underlying - * hardware. If template is NULL the new profile is allocated but - * not filled out. See Documentation/block/data-integrity.txt. + * Description: When a device needs to advertise itself as being able to + * send/receive integrity metadata it must use this function to register + * the capability with the block layer. The template is a blk_integrity + * struct with values appropriate for the underlying hardware. See + * Documentation/block/data-integrity.txt. */ -int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { - struct blk_integrity *bi; - - BUG_ON(disk == NULL); - - if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); - if (!bi) - return -1; - - if (kobject_init_and_add(&bi->kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, - "%s", "integrity")) { - kmem_cache_free(integrity_cachep, bi); - return -1; - } - - kobject_uevent(&bi->kobj, KOBJ_ADD); - - bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; - bi->sector_size = queue_logical_block_size(disk->queue); - disk->integrity = bi; - } else - bi = disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; - /* Use the provided profile as template */ - if (template != NULL) { - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; - bi->tuple_size = template->tuple_size; - bi->set_tag_fn = template->set_tag_fn; - bi->get_tag_fn = template->get_tag_fn; - bi->tag_size = template->tag_size; - } else - bi->name = bi_unsupported_name; + bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | + template->flags; + bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); + bi->profile = template->profile ? template->profile : &nop_profile; + bi->tuple_size = template->tuple_size; + bi->tag_size = template->tag_size; - disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; - - return 0; + blk_integrity_revalidate(disk); } EXPORT_SYMBOL(blk_integrity_register); /** - * blk_integrity_unregister - Remove block integrity profile - * @disk: disk whose integrity profile to deallocate + * blk_integrity_unregister - Unregister block integrity profile + * @disk: disk whose integrity profile to unregister * - * Description: This function frees all memory used by the block - * integrity profile. To be called at device teardown. + * Description: This function unregisters the integrity capability from + * a block device. */ void blk_integrity_unregister(struct gendisk *disk) { - struct blk_integrity *bi; + blk_integrity_revalidate(disk); + memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); +} +EXPORT_SYMBOL(blk_integrity_unregister); - if (!disk || !disk->integrity) +void blk_integrity_revalidate(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + if (!(disk->flags & GENHD_FL_UP)) return; - disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + if (bi->profile) + disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + disk->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; +} + +void blk_integrity_add(struct gendisk *disk) +{ + if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity")) + return; - bi = disk->integrity; + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); +} - kobject_uevent(&bi->kobj, KOBJ_REMOVE); - kobject_del(&bi->kobj); - kobject_put(&bi->kobj); - disk->integrity = NULL; +void blk_integrity_del(struct gendisk *disk) +{ + kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); + kobject_del(&disk->integrity_kobj); + kobject_put(&disk->integrity_kobj); } -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..381cb50a673c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c deleted file mode 100644 index c11d24e379e2..000000000000 --- a/block/blk-iopoll.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Functions related to interrupt-poll handling in the block layer. This - * is similar to NAPI for network devices. - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/blk-iopoll.h> -#include <linux/delay.h> - -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; - -static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); - -/** - * blk_iopoll_sched - Schedule a run of the iopoll handler - * @iop: The parent iopoll structure - * - * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. - **/ -void blk_iopoll_sched(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_sched); - -/** - * __blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * See blk_iopoll_complete(). This function must be called with interrupts - * disabled. - **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) -{ - list_del(&iop->list); - smp_mb__before_clear_bit(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(__blk_iopoll_complete); - -/** - * blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * If a driver consumes less than the assigned budget in its run of the - * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() - * is called. - **/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) -{ - unsigned long flags; - - local_irq_save(flags); - __blk_iopoll_complete(iopoll); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_complete); - -static void blk_iopoll_softirq(struct softirq_action *h) -{ - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; - unsigned long start_time = jiffies; - - local_irq_disable(); - - while (!list_empty(list)) { - struct blk_iopoll *iop; - int work, weight; - - /* - * If softirq window is exhausted then punt. - */ - if (budget <= 0 || time_after(jiffies, start_time)) { - rearm = 1; - break; - } - - local_irq_enable(); - - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - iop = list_entry(list->next, struct blk_iopoll, list); - - weight = iop->weight; - work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) - work = iop->poll(iop, weight); - - budget -= work; - - local_irq_disable(); - - /* - * Drivers must not modify the iopoll state, if they - * consume their assigned weight (or more, some drivers can't - * easily just stop processing, they have to complete an - * entire mask of commands).In such cases this code - * still "owns" the iopoll instance and therefore can - * move the instance around on the list at-will. - */ - if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); - else - list_move_tail(&iop->list, list); - } - } - - if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - - local_irq_enable(); -} - -/** - * blk_iopoll_disable - Disable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Disable io polling and wait for any pending callbacks to have completed. - **/ -void blk_iopoll_disable(struct blk_iopoll *iop) -{ - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) - msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_disable); - -/** - * blk_iopoll_enable - Enable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Enable iopoll on this @iop. Note that the handler run will not be - * scheduled, it will only mark it as active. - **/ -void blk_iopoll_enable(struct blk_iopoll *iop) -{ - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_clear_bit(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_enable); - -/** - * blk_iopoll_init - Initialize this @iop - * @iop: The parent iopoll structure - * @weight: The default weight (or command completion budget) - * @poll_fn: The handler to invoke - * - * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). - **/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) -{ - memset(iop, 0, sizeof(*iop)); - INIT_LIST_HEAD(&iop->list); - iop->weight = weight; - iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_init); - -static int blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, -}; - -static __init int blk_iopoll_setup(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); - return 0; -} -subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-lib.c b/block/blk-lib.c index 97a733cf3d5f..9ebf65379556 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -11,16 +11,16 @@ struct bio_batch { atomic_t done; - unsigned long flags; + int error; struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio) { struct bio_batch *bb = bio->bi_private; - if (err && (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, &bb->flags); + if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) + bb->error = bio->bi_error; if (atomic_dec_and_test(&bb->done)) complete(bb->wait); bio_put(bio); @@ -43,7 +43,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; + unsigned int granularity; int alignment; struct bio_batch bb; struct bio *bio; @@ -60,17 +60,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - /* - * Ensure that max_discard_sectors is of the proper - * granularity, so that requests stay aligned after a split. - */ - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; @@ -78,7 +67,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; blk_start_plug(&plug); @@ -92,7 +81,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, break; } - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); /* * If splitting a request, and the next starting sector would be @@ -134,9 +124,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); @@ -166,13 +155,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; - max_write_same_sectors = q->limits.max_write_same_sectors; - - if (max_write_same_sectors == 0) - return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ + max_write_same_sectors = UINT_MAX >> 9; atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; while (nr_sects) { @@ -208,9 +195,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -ENOTSUPP; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -226,8 +212,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); * Generate and issue number of bios with zerofiled pages. */ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; @@ -236,7 +222,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; ret = 0; @@ -270,10 +256,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - /* One of bios in the batch was completed with error.*/ - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } @@ -283,24 +267,34 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @discard: whether to discard the block range * * Description: - * Generate and issue number of bios with zerofiled pages. + * Zero-fill a block range. If the discard flag is set and the block + * device guarantees that subsequent READ operations to the block range + * in question will return zeroes, the blocks will be discarded. Should + * the discard request fail, if the discard flag is not set, or if + * discard_zeroes_data is not supported, this function will resort to + * zeroing the blocks manually, thus provisioning (allocating, + * anchoring) them. If the block device supports the WRITE SAME command + * blkdev_issue_zeroout() will use it to optimize the process of + * clearing the block range. Otherwise the zeroing will be performed + * using regular WRITE calls. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) + sector_t nr_sects, gfp_t gfp_mask, bool discard) { - if (bdev_write_same(bdev)) { - unsigned char bdn[BDEVNAME_SIZE]; + struct request_queue *q = bdev_get_queue(bdev); - if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0))) - return 0; + if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data && + blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0) + return 0; - bdevname(bdev, bdn); - pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); - } + if (bdev_write_same(bdev) && + blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0)) == 0) + return 0; return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); } diff --git a/block/blk-map.c b/block/blk-map.c index f7b22bc21518..f565e11f465a 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -5,10 +5,28 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <scsi/sg.h> /* for struct sg_iovec */ +#include <linux/uio.h> #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -39,139 +57,12 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, uaddr, len) && !map_data) - bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); - else - bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_iter.bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @ubuf: the user buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - * - * Description: - * Data will be mapped directly for zero copy I/O, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of I/O, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, void __user *ubuf, - unsigned long len, gfp_t gfp_mask) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (queue_max_hw_sectors(q) << 9)) - return -EINVAL; - if (!len) - return -EINVAL; - - if (!ubuf && (!map_data || !map_data->null_mapped)) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len, - gfp_mask); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - - if (map_data) - map_data->offset += ret; - } - - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; - - rq->buffer = NULL; - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - rq->bio = NULL; - return ret; -} -EXPORT_SYMBOL(blk_rq_map_user); - /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count + * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: @@ -188,46 +79,53 @@ EXPORT_SYMBOL(blk_rq_map_user); * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, const struct sg_iovec *iov, - int iov_count, unsigned int len, gfp_t gfp_mask) + struct rq_map_data *map_data, + const struct iov_iter *iter, gfp_t gfp_mask) { struct bio *bio; - int i, read = rq_data_dir(rq) == READ; int unaligned = 0; + struct iov_iter i; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - if (!iov || iov_count <= 0) + if (!iter || !iter->count) return -EINVAL; - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; + iov_for_each(iov, i, *iter) { + unsigned long uaddr = (unsigned long) iov.iov_base; - if (!iov[i].iov_len) + if (!iov.iov_len) return -EINVAL; /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & len) || map_data) - bio = bio_copy_user_iov(q, map_data, iov, iov_count, read, - gfp_mask); + if (unaligned || (q->dma_pad_mask & iter->count) || map_data) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else - bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask); + bio = bio_map_user_iov(q, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - if (bio->bi_iter.bi_size != len) { + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + if (bio->bi_iter.bi_size != iter->count) { /* * Grab an extra reference to this bio, as bio_unmap_user() * expects to be able to drop it twice as it happens on the * normal IO completion path */ bio_get(bio); - bio_endio(bio, 0); + bio_endio(bio); __blk_rq_unmap_user(bio); return -EINVAL; } @@ -238,11 +136,25 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); +int blk_rq_map_user(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, void __user *ubuf, + unsigned long len, gfp_t gfp_mask) +{ + struct iovec iov; + struct iov_iter i; + int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + + if (unlikely(ret < 0)) + return ret; + + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); +} +EXPORT_SYMBOL(blk_rq_map_user); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list @@ -325,7 +237,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, } blk_queue_bounce(q, &rq->bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6c583f9c5b65..888a7fec81f7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,13 +7,201 @@ #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <trace/events/block.h> + #include "blk.h" +static struct bio *blk_bio_discard_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + *nsegs = 1; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) <= max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* + * If the next starting sector would be misaligned, stop the discard at + * the previous aligned sector. + */ + alignment = (q->limits.discard_alignment >> 9) % granularity; + + tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = sector_div(tmp, granularity); + + if (split_sectors > tmp) + split_sectors -= tmp; + + return bio_split(bio, split_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_same_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + *nsegs = 1; + + if (!q->limits.max_write_same_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); +} + +static inline unsigned get_max_io_size(struct request_queue *q, + struct bio *bio) +{ + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned mask = queue_logical_block_size(q) - 1; + + /* aligned to logical block size */ + sectors &= ~(mask >> 9); + + return sectors; +} + +static struct bio *blk_bio_segment_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *segs) +{ + struct bio_vec bv, bvprv, *bvprvp = NULL; + struct bvec_iter iter; + unsigned seg_size = 0, nsegs = 0, sectors = 0; + unsigned front_seg_size = bio->bi_seg_front_size; + bool do_split = true; + struct bio *new = NULL; + const unsigned max_sectors = get_max_io_size(q, bio); + + bio_for_each_segment(bv, bio, iter) { + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + goto split; + + if (sectors + (bv.bv_len >> 9) > max_sectors) { + /* + * Consider this a new segment if we're splitting in + * the middle of this vector. + */ + if (nsegs < queue_max_segments(q) && + sectors < max_sectors) { + nsegs++; + sectors = max_sectors; + } + if (sectors) + goto split; + /* Make this single bvec as the 1st segment */ + } + + if (bvprvp && blk_queue_cluster(q)) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + goto new_segment; + + seg_size += bv.bv_len; + bvprv = bv; + bvprvp = &bvprv; + sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + continue; + } +new_segment: + if (nsegs == queue_max_segments(q)) + goto split; + + nsegs++; + bvprv = bv; + bvprvp = &bvprv; + seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + } + + do_split = false; +split: + *segs = nsegs; + + if (do_split) { + new = bio_split(bio, sectors, GFP_NOIO, bs); + if (new) + bio = new; + } + + bio->bi_seg_front_size = front_seg_size; + if (seg_size > bio->bi_seg_back_size) + bio->bi_seg_back_size = seg_size; + + return do_split ? new : NULL; +} + +void blk_queue_split(struct request_queue *q, struct bio **bio, + struct bio_set *bs) +{ + struct bio *split, *res; + unsigned nsegs; + + if ((*bio)->bi_rw & REQ_DISCARD) + split = blk_bio_discard_split(q, *bio, bs, &nsegs); + else if ((*bio)->bi_rw & REQ_WRITE_SAME) + split = blk_bio_write_same_split(q, *bio, bs, &nsegs); + else + split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + + /* physical segments can be figured out during splitting */ + res = split ? split : *bio; + res->bi_phys_segments = nsegs; + bio_set_flag(res, BIO_SEG_VALID); + + if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_rw |= REQ_NOMERGE; + + bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + generic_make_request(*bio); + *bio = split; + } +} +EXPORT_SYMBOL(blk_queue_split); + static unsigned int __blk_recalc_rq_segments(struct request_queue *q, - struct bio *bio) + struct bio *bio, + bool no_sg_merge) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1; + int cluster, prev = 0; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -38,12 +226,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { /* - * the trick here is making sure that a high page is - * never considered part of another segment, since that - * might change with the bounce page. + * If SG merging is disabled, each bio vector is + * a segment */ - high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); - if (!high && !highprv && cluster) { + if (no_sg_merge) + goto new_segment; + + if (prev && cluster) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -63,8 +252,8 @@ new_segment: nr_phys_segs++; bvprv = bv; + prev = 1; seg_size = bv.bv_len; - highprv = high; } bbio = bio; } @@ -79,17 +268,35 @@ new_segment: void blk_recalc_rq_segments(struct request *rq) { - rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); + bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, + &rq->q->queue_flags); + + rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, + no_sg_merge); } void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct bio *nxt = bio->bi_next; + unsigned short seg_cnt; + + /* estimate segment number by bi_vcnt for non-cloned bio */ + if (bio_flagged(bio, BIO_CLONED)) + seg_cnt = bio_segments(bio); + else + seg_cnt = bio->bi_vcnt; + + if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && + (seg_cnt < queue_max_segments(q))) + bio->bi_phys_segments = seg_cnt; + else { + struct bio *nxt = bio->bi_next; + + bio->bi_next = NULL; + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); + bio->bi_next = nxt; + } - bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); - bio->bi_next = nxt; - bio->bi_flags |= (1 << BIO_SEG_VALID); + bio_set_flag(bio, BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -239,7 +446,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, @@ -252,38 +459,15 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (sg) sg_mark_end(sg); - return nsegs; -} -EXPORT_SYMBOL(blk_rq_map_sg); - -/** - * blk_bio_map_sg - map a bio to a scatterlist - * @q: request_queue in question - * @bio: bio being mapped - * @sglist: scatterlist being mapped - * - * Note: - * Caller must make sure sg can hold bio->bi_phys_segments entries - * - * Will return the number of sg entries setup - */ -int blk_bio_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) -{ - struct scatterlist *sg = NULL; - int nsegs; - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - - nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); - bio->bi_next = next; - if (sg) - sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > rq->nr_phys_segments); - BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); return nsegs; } -EXPORT_SYMBOL(blk_bio_map_sg); +EXPORT_SYMBOL(blk_rq_map_sg); static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, @@ -294,7 +478,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) goto no_merge; - if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + if (blk_integrity_merge_bio(q, req, bio) == false) goto no_merge; /* @@ -314,6 +498,11 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -332,6 +521,12 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -372,6 +567,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; + if (req_gap_back_merge(req, next->bio)) + return 0; + /* * Will it become too large? */ @@ -391,7 +589,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; - if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + if (blk_integrity_merge_rq(q, req, next) == false) return 0; /* Merge is OK... */ @@ -569,7 +767,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* only merge integrity protected bio into ditto rq */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; /* must be using the same buffer */ diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index 136ef8643bba..bb3ed488f7b5 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -1,3 +1,8 @@ +/* + * CPU notifier helper code for blk-mq + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, { unsigned int cpu = (unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; raw_spin_lock(&blk_mq_cpu_notify_lock); - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) - notify->notify(notify->data, action, cpu); + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } raw_spin_unlock(&blk_mq_cpu_notify_lock); - return NOTIFY_OK; + return ret; } void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) @@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data) { notifier->notify = fn; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 097921329619..d0634bcf322f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -1,3 +1,8 @@ +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/module.h> @@ -12,21 +17,22 @@ static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, const int cpu) { - return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); + return cpu * nr_queues / nr_cpus; } static int get_first_sibling(unsigned int cpu) { unsigned int ret; - ret = cpumask_first(topology_thread_cpumask(cpu)); + ret = cpumask_first(topology_sibling_cpumask(cpu)); if (ret < nr_cpu_ids) return ret; return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask) { unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; @@ -36,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; - for_each_online_cpu(i) { + for_each_cpu(i, online_mask) { nr_cpus++; first_sibling = get_first_sibling(i); if (!cpumask_test_cpu(first_sibling, cpus)) @@ -46,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) queue = 0; for_each_possible_cpu(i) { - if (!cpu_online(i)) { + if (!cpumask_test_cpu(i, online_mask)) { map[i] = 0; continue; } @@ -80,19 +86,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) return 0; } -unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) { unsigned int *map; /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, - reg->numa_node); + map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, + set->numa_node); if (!map) return NULL; - if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) return map; kfree(map); return NULL; } + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. + */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return local_memory_node(cpu_to_node(i)); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index b0ba264b0522..1cf18784c5cf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -141,15 +141,26 @@ static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) { - char *start_page = page; struct request *rq; + int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg); + + list_for_each_entry(rq, list, queuelist) { + const int rq_len = 2 * sizeof(rq) + 2; + + /* if the output will be truncated */ + if (PAGE_SIZE - 1 < len + rq_len) { + /* backspacing if it can't hold '\t...\n' */ + if (PAGE_SIZE - 1 < len + 5) + len -= rq_len; + len += snprintf(page + len, PAGE_SIZE - 1 - len, + "\t...\n"); + break; + } + len += snprintf(page + len, PAGE_SIZE - 1 - len, + "\t%p\n", rq); + } - page += sprintf(page, "%s:\n", msg); - - list_for_each_entry(rq, list, queuelist) - page += sprintf(page, "\t%p\n", rq); - - return page - start_page; + return len; } static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) @@ -163,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) return ret; } +static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -203,59 +219,22 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } -static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t len) +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - struct blk_mq_ctx *ctx; - unsigned long ret; - unsigned int i; - - if (kstrtoul(page, 10, &ret)) { - pr_err("blk-mq-sysfs: invalid input '%s'\n", page); - return -EINVAL; - } - - spin_lock(&hctx->lock); - if (ret) - hctx->flags |= BLK_MQ_F_SHOULD_IPI; - else - hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; - spin_unlock(&hctx->lock); - - hctx_for_each_ctx(hctx, ctx, i) - ctx->ipi_redirect = !!ret; - - return len; + return blk_mq_tag_sysfs_show(hctx->tags, page); } -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) { - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { - unsigned int i, queue_num, first = 1; + unsigned int i, first = 1; ssize_t ret = 0; - blk_mq_disable_hotplug(); - - for_each_online_cpu(i) { - queue_num = hctx->queue->mq_map[i]; - if (queue_num != hctx->queue_num) - continue; - + for_each_cpu(i, hctx->cpumask) { if (first) ret += sprintf(ret + page, "%u", i); else @@ -264,8 +243,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) first = 0; } - blk_mq_enable_hotplug(); - ret += sprintf(ret + page, "\n"); return ret; } @@ -307,15 +284,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { - .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, - .show = blk_mq_hw_sysfs_ipi_show, - .store = blk_mq_hw_sysfs_ipi_store, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -324,15 +300,20 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { + .attr = {.name = "io_poll", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_poll_show, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_ipi.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, + &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -363,6 +344,42 @@ static struct kobj_type blk_mq_hw_ktype = { .release = blk_mq_sysfs_release, }; +static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + int i; + + if (!hctx->nr_ctx) + return; + + hctx_for_each_ctx(hctx, ctx, i) + kobject_del(&ctx->kobj); + + kobject_del(&hctx->kobj); +} + +static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + int i, ret; + + if (!hctx->nr_ctx) + return 0; + + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); + if (ret) + return ret; + + hctx_for_each_ctx(hctx, ctx, i) { + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + break; + } + + return ret; +} + void blk_mq_unregister_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -370,12 +387,14 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - kobject_del(&ctx->kobj); + blk_mq_unregister_hctx(hctx); + + hctx_for_each_ctx(hctx, ctx, j) kobject_put(&ctx->kobj); - } - kobject_del(&hctx->kobj); + kobject_put(&hctx->kobj); } @@ -384,6 +403,24 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); +} + +static void blk_mq_sysfs_init(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i; + + kobject_init(&q->mq_kobj, &blk_mq_ktype); + + queue_for_each_hw_ctx(q, hctx, i) + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); + + queue_for_each_ctx(q, ctx, i) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } int blk_mq_register_disk(struct gendisk *disk) @@ -391,38 +428,60 @@ int blk_mq_register_disk(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - int ret, i, j; + int ret, i; - kobject_init(&q->mq_kobj, &blk_mq_ktype); + blk_mq_disable_hotplug(); + + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); + ret = blk_mq_register_hctx(hctx); if (ret) break; - - if (!hctx->nr_ctx) - continue; - - hctx_for_each_ctx(hctx, ctx, j) { - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); - if (ret) - break; - } } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_register_disk); + +void blk_mq_sysfs_unregister(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + if (!q->mq_sysfs_init_done) + return; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); +} + +int blk_mq_sysfs_register(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i, ret = 0; + + if (!q->mq_sysfs_init_done) return ret; + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_register_hctx(hctx); + if (ret) + break; } - return 0; + return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 83ae96c51a27..abdbb47405cb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,129 +1,624 @@ +/* + * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread + * over multiple cachelines to avoid ping-pong between multiple submitters + * or submitter and completer. Uses rolling wakeups to avoid falling of + * the scaling cliff when we run out of tags and have to start putting + * submitters to sleep. + * + * Uses active queue tracking to support fairer distribution of tags + * between multiple submitters when a shared tag map is used. + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/percpu_ida.h> +#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); +} + +static inline int bt_index_inc(int index) +{ + return (index + 1) & (BT_WAIT_QUEUES - 1); +} + +static inline void bt_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = bt_index_inc(old); + atomic_cmpxchg(index, old, new); +} + +/* + * If a previously inactive queue goes active, bump the active user count. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + + return true; +} + +/* + * Wakeup all potentially sleeping on tags + */ +void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) +{ + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + bt = &tags->bitmap_tags; + wake_index = atomic_read(&bt->wake_index); + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + wake_index = bt_index_inc(wake_index); + } + + if (include_reserve) { + bt = &tags->breserved_tags; + if (waitqueue_active(&bt->bs[0].wait)) + wake_up(&bt->bs[0].wait); + } +} + /* - * Per tagged queue (tag address space) map + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - unsigned int nr_batch_move; - unsigned int nr_max_cache; +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; - struct percpu_ida free_tags; - struct percpu_ida reserved_tags; -}; + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags, false); +} -void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); - blk_mq_put_tag(tags, tag); + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; } -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, + bool nowrap) +{ + int tag, org_last_tag = last_tag; + + while (1) { + tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); + if (unlikely(tag >= bm->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag && !nowrap) { + last_tag = org_last_tag = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(tag, &bm->word)) + break; + + last_tag = tag + 1; + if (last_tag >= bm->depth - 1) + last_tag = 0; + } + + return tag; +} + +#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) + +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache, struct blk_mq_tags *tags) +{ + unsigned int last_tag, org_last_tag; + int index, i, tag; + + if (!hctx_may_queue(hctx, bt)) + return -1; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(bt, last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), + BT_ALLOC_RR(tags)); + if (tag != -1) { + tag += (index << bt->bits_per_word); + goto done; + } + + /* + * Jump to next index, and reset the last tag to be the + * first tag of that index + */ + index++; + last_tag = (index << bt->bits_per_word); + + if (index >= bt->map_nr) { + index = 0; + last_tag = 0; + } + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. + */ +done: + if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) { - return !tags || - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; + struct bt_wait_state *bs; + int wait_index; + + if (!hctx) + return &bt->bs[0]; + + wait_index = atomic_read(&hctx->wait_index); + bs = &bt->bs[wait_index]; + bt_index_atomic_inc(&hctx->wait_index); + return bs; } -static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +static int bt_get(struct blk_mq_alloc_data *data, + struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, struct blk_mq_tags *tags) { + struct bt_wait_state *bs; + DEFINE_WAIT(wait); int tag; - tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - return tag + tags->nr_reserved_tags; + tag = __bt_get(hctx, bt, last_tag, tags); + if (tag != -1) + return tag; + + if (data->flags & BLK_MQ_REQ_NOWAIT) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(hctx, bt, last_tag, tags); + if (tag != -1) + break; + + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. Note that hctx can be NULL here for + * reserved tag allocation. + */ + if (hctx) + blk_mq_run_hw_queue(hctx, false); + + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __bt_get(hctx, bt, last_tag, tags); + if (tag != -1) + break; + + blk_mq_put_ctx(data->ctx); + + io_schedule(); + + data->ctx = blk_mq_get_ctx(data->q); + data->hctx = data->q->mq_ops->map_queue(data->q, + data->ctx->cpu); + if (data->flags & BLK_MQ_REQ_RESERVED) { + bt = &data->hctx->tags->breserved_tags; + } else { + last_tag = &data->ctx->last_tag; + hctx = data->hctx; + bt = &hctx->tags->bitmap_tags; + } + finish_wait(&bs->wait, &wait); + bs = bt_wait_ptr(bt, hctx); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; } -static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, - gfp_t gfp) +static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) { int tag; - if (unlikely(!tags->nr_reserved_tags)) { + tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, + &data->ctx->last_tag, data->hctx->tags); + if (tag >= 0) + return tag + data->hctx->tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; +} + +static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) +{ + int tag, zero = 0; + + if (unlikely(!data->hctx->tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? - TASK_UNINTERRUPTIBLE : TASK_RUNNING); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; + return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!reserved) - return __blk_mq_get_tag(tags, gfp); - - return __blk_mq_get_reserved_tag(tags, gfp); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } -static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) { - BUG_ON(tag >= tags->nr_tags); + int i, wake_index; - percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); + wake_index = atomic_read(&bt->wake_index); + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + int o = atomic_read(&bt->wake_index); + if (wake_index != o) + atomic_cmpxchg(&bt->wake_index, o, wake_index); + + return bs; + } + + wake_index = bt_index_inc(wake_index); + } + + return NULL; } -static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, - unsigned int tag) +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) { - BUG_ON(tag >= tags->nr_reserved_tags); + const int index = TAG_TO_INDEX(bt, tag); + struct bt_wait_state *bs; + int wait_cnt; - percpu_ida_free(&tags->reserved_tags, tag); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + bs = bt_wake_ptr(bt); + if (!bs) + return; + + wait_cnt = atomic_dec_return(&bs->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&bs->wait_cnt); + if (wait_cnt == 0) { + atomic_add(bt->wake_cnt, &bs->wait_cnt); + bt_index_atomic_inc(&bt->wake_index); + wake_up(&bs->wait); + } } -void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, + unsigned int *last_tag) { - if (tag >= tags->nr_reserved_tags) - __blk_mq_put_tag(tags, tag); - else - __blk_mq_put_reserved_tag(tags, tag); + struct blk_mq_tags *tags = hctx->tags; + + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + BUG_ON(real_tag >= tags->nr_tags); + bt_clear_tag(&tags->bitmap_tags, real_tag); + if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) + *last_tag = real_tag; + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + bt_clear_tag(&tags->breserved_tags, tag); + } } -static int __blk_mq_tag_iter(unsigned id, void *data) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_iter_fn *fn, void *data, bool reserved) { - unsigned long *tag_map = data; - __set_bit(id, tag_map); - return 0; + struct request *rq; + int bit, i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = hctx->tags->rqs[off + bit]; + if (rq->q == hctx->queue) + fn(hctx, rq, data, reserved); + } + + off += (1 << bt->bits_per_word); + } } -void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, - void (*fn)(void *, unsigned long *), void *data) +static void bt_tags_for_each(struct blk_mq_tags *tags, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_tag_iter_fn *fn, void *data, bool reserved) { - unsigned long *tag_map; - size_t map_size; + struct request *rq; + int bit, i; - map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); - if (!tag_map) + if (!tags->rqs) return; + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = tags->rqs[off + bit]; + fn(rq, data, reserved); + } - percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + off += (1 << bt->bits_per_word); + } +} + +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv) +{ if (tags->nr_reserved_tags) - percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, - tag_map); + bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); +} +EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); + +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } + +} + +static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / BT_WAIT_QUEUES) + bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); + + bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + bt->bits_per_word = ilog2(BITS_PER_LONG); + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + unsigned int nr, tags_per_word; + + tags_per_word = (1 << bt->bits_per_word); - fn(data, tag_map); - kfree(tag_map); + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. + */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + bt->map = NULL; + return -ENOMEM; + } + + bt_update_count(bt, depth); + + for (i = 0; i < BT_WAIT_QUEUES; i++) { + init_waitqueue_head(&bt->bs[i].wait); + atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); + } + + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + tags->alloc_policy = alloc_policy; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, int node) + unsigned int reserved_tags, + int node, int alloc_policy) { - unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; - int ret; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); @@ -134,73 +629,93 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_possible_cpus(); - - if (nr_cache < BLK_MQ_TAG_CACHE_MIN) - nr_cache = BLK_MQ_TAG_CACHE_MIN; - else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) - nr_cache = BLK_MQ_TAG_CACHE_MAX; + if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { + kfree(tags); + return NULL; + } tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - tags->nr_max_cache = nr_cache; - tags->nr_batch_move = max(1u, nr_cache / 2); - ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - - tags->nr_reserved_tags, - tags->nr_max_cache, - tags->nr_batch_move); - if (ret) - goto err_free_tags; + return blk_mq_init_bitmap_tags(tags, node, alloc_policy); +} - if (reserved_tags) { - /* - * With max_cahe and batch set to 1, the allocator fallbacks to - * no cached. It's fine reserved tags allocation is slow. - */ - ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, - 1, 1); - if (ret) - goto err_reserved_tags; - } +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); + free_cpumask_var(tags->cpumask); + kfree(tags); +} - return tags; +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; -err_reserved_tags: - percpu_ida_destroy(&tags->free_tags); -err_free_tags: - kfree(tags); - return NULL; + *tag = prandom_u32() % depth; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { - percpu_ida_destroy(&tags->free_tags); - percpu_ida_destroy(&tags->reserved_tags); - kfree(tags); + tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. + */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags, false); + return 0; +} + +/** + * blk_mq_unique_tag() - return a tag that is unique queue-wide + * @rq: request for which to compute a unique tag + * + * The tag field in struct request is unique per hardware queue but not over + * all hardware queues. Hence this function that returns a tag with the + * hardware context index in the upper bits and the per hardware queue tag in + * the lower bits. + * + * Note: When called for a request that is queued on a non-multiqueue request + * queue, the hardware context index is set to zero. + */ +u32 blk_mq_unique_tag(struct request *rq) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + int hwq = 0; + + if (q->mq_ops) { + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hwq = hctx->queue_num; + } + + return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | + (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } +EXPORT_SYMBOL(blk_mq_unique_tag); ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - unsigned int cpu; + unsigned int free, res; if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," - " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->nr_batch_move, tags->nr_max_cache); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), - percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); - for_each_possible_cpu(cpu) { - page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, - percpu_ida_free_tags(&tags->free_tags, cpu)); - } + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 947ba2c6148e..d468a79f2c4a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,17 +1,65 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -struct blk_mq_tags; +#include "blk-mq.h" -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + unsigned int bits_per_word; + + unsigned int map_nr; + struct blk_align_bitmap *map; + + atomic_t wake_index; + struct bt_wait_state *bs; +}; + +/* + * Tag address space map. + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; + + struct request **rqs; + struct list_head page_list; + + int alloc_policy; + cpumask_var_t cpumask; +}; + + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); -extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, @@ -24,4 +72,35 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + +/* + * This helper should only be used for flush request to share tag + * with the request cloned from, and both the two requests can't be + * in flight at the same time. The caller has to make sure the tag + * can't be freed. + */ +static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, + unsigned int tag, struct request *rq) +{ + hctx->tags->rqs[tag] = rq; +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 1d2a9bdbee57..4c0622fae413 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1,8 +1,15 @@ +/* + * Block multiqueue core code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> @@ -14,6 +21,7 @@ #include <linux/cache.h> #include <linux/sched/sysctl.h> #include <linux/delay.h> +#include <linux/crash_dump.h> #include <trace/events/block.h> @@ -27,28 +35,6 @@ static LIST_HEAD(all_q_list); static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); -static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, - unsigned int cpu) -{ - return per_cpu_ptr(q->queue_ctx, cpu); -} - -/* - * This assumes per-cpu software queueing queues. They could be per-node - * as well, for instance. For now this is hardcoded as-is. Note that we don't - * care about preemption, since we know the ctx's are persistent. This does - * mean that we can't rely on ctx always matching the currently running CPU. - */ -static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) -{ - return __blk_mq_get_ctx(q, get_cpu()); -} - -static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) -{ - put_cpu(); -} - /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -56,122 +42,114 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { unsigned int i; - for (i = 0; i < hctx->nr_ctx_map; i++) - if (hctx->ctx_map[i]) + for (i = 0; i < hctx->ctx_map.size; i++) + if (hctx->ctx_map.map[i].word) return true; return false; } +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!test_bit(ctx->index_hw, hctx->ctx_map)) - set_bit(ctx->index_hw, hctx->ctx_map); -} - -static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) -{ - struct request *rq; - unsigned int tag; - - tag = blk_mq_get_tag(hctx->tags, gfp, reserved); - if (tag != BLK_MQ_TAG_FAIL) { - rq = hctx->rqs[tag]; - rq->tag = tag; + struct blk_align_bitmap *bm = get_bm(hctx, ctx); - return rq; - } - - return NULL; + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q) +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { - int ret; - - __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); - smp_wmb(); - /* we have problems to freeze the queue if it's initializing */ - if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) - return 0; + struct blk_align_bitmap *bm = get_bm(hctx, ctx); - __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); - - spin_lock_irq(q->queue_lock); - ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, - !blk_queue_bypass(q) || blk_queue_dying(q), - *q->queue_lock); - /* inc usage with lock hold to avoid freeze_queue runs here */ - if (!ret && !blk_queue_dying(q)) - __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); - else if (blk_queue_dying(q)) - ret = -ENODEV; - spin_unlock_irq(q->queue_lock); - - return ret; + clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static void blk_mq_queue_exit(struct request_queue *q) +void blk_mq_freeze_queue_start(struct request_queue *q) { - __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); + int freeze_depth; + + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { + percpu_ref_kill(&q->q_usage_counter); + blk_mq_run_hw_queues(q, false); + } } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); -static void __blk_mq_drain_queue(struct request_queue *q) +static void blk_mq_freeze_queue_wait(struct request_queue *q) { - while (true) { - s64 count; - - spin_lock_irq(q->queue_lock); - count = percpu_counter_sum(&q->mq_usage_counter); - spin_unlock_irq(q->queue_lock); - - if (count == 0) - break; - blk_mq_run_queues(q, false); - msleep(10); - } + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ -static void blk_mq_freeze_queue(struct request_queue *q) +void blk_freeze_queue(struct request_queue *q) { - bool drain; - - spin_lock_irq(q->queue_lock); - drain = !q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - spin_unlock_irq(q->queue_lock); - - if (drain) - __blk_mq_drain_queue(q); + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. + */ + blk_mq_freeze_queue_start(q); + blk_mq_freeze_queue_wait(q); } -void blk_mq_drain_queue(struct request_queue *q) +void blk_mq_freeze_queue(struct request_queue *q) { - __blk_mq_drain_queue(q); + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -static void blk_mq_unfreeze_queue(struct request_queue *q) +void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + int freeze_depth; - spin_lock_irq(q->queue_lock); - if (!--q->bypass_depth) { - queue_flag_clear(QUEUE_FLAG_BYPASS, q); - wake = true; - } - WARN_ON_ONCE(q->bypass_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { + percpu_ref_reinit(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); + } +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); + +void blk_mq_wake_waiters(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_wakeup_all(hctx->tags, true); + + /* + * If we are called because the queue has now been marked as + * dying, we need to ensure that processes currently waiting on + * the queue are notified as well. + */ + wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) @@ -186,78 +164,107 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; rq->mq_ctx = ctx; - rq->cmd_flags = rw_flags; + rq->cmd_flags |= rw_flags; + /* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->rq_disk = NULL; + rq->part = NULL; rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + + rq->cmd = rq->__cmd; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } -static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, - int rw, gfp_t gfp, - bool reserved) +static struct request * +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) { struct request *rq; + unsigned int tag; - do { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + tag = blk_mq_get_tag(data); + if (tag != BLK_MQ_TAG_FAIL) { + rq = data->hctx->tags->rqs[tag]; - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); - if (rq) { - blk_mq_rq_ctx_init(q, ctx, rq, rw); - break; + if (blk_mq_tag_busy(data->hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); } - blk_mq_put_ctx(ctx); - if (!(gfp & __GFP_WAIT)) - break; - - __blk_mq_run_hw_queue(hctx); - blk_mq_wait_for_tags(hctx->tags); - } while (1); + rq->tag = tag; + blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); + return rq; + } - return rq; + return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request *rq; + struct blk_mq_alloc_data alloc_data; + int ret; - if (blk_mq_queue_enter(q)) - return NULL; - - rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); - return rq; -} + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + if (ret) + return ERR_PTR(ret); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, - gfp_t gfp) -{ - struct request *rq; + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - if (blk_mq_queue_enter(q)) - return NULL; + rq = __blk_mq_alloc_request(&alloc_data, rw); + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); - rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + ctx = alloc_data.ctx; + } + blk_mq_put_ctx(ctx); + if (!rq) { + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); + } return rq; } -EXPORT_SYMBOL(blk_mq_alloc_reserved_request); - -/* - * Re-init and set pdu, if we have it - */ -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - blk_rq_init(hctx->queue, rq); - - if (hctx->cmd_size) - rq->special = blk_mq_rq_to_pdu(rq); -} +EXPORT_SYMBOL(blk_mq_alloc_request); static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) @@ -265,38 +272,56 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - blk_mq_rq_init(hctx, rq); - blk_mq_put_tag(hctx->tags, tag); + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + rq->cmd_flags = 0; - blk_mq_queue_exit(q); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_queue_exit(q); } -void blk_mq_free_request(struct request *rq) +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; ctx->rq_completed[rq_is_sync(rq)]++; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); + } +EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); -bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) +void blk_mq_free_request(struct request *rq) { - if (blk_update_request(rq, error, blk_rq_bytes(rq))) - return true; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + blk_mq_free_hctx_request(hctx, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_free_request); + +inline void __blk_mq_end_request(struct request *rq, int error) +{ blk_account_io_done(rq); - if (rq->end_io) + if (rq->end_io) { rq->end_io(rq, error); - else + } else { + if (unlikely(blk_bidi_rq(rq))) + blk_mq_free_request(rq->next_rq); blk_mq_free_request(rq); - return false; + } } -EXPORT_SYMBOL(blk_mq_end_io_partial); +EXPORT_SYMBOL(__blk_mq_end_request); + +void blk_mq_end_request(struct request *rq, int error) +{ + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + BUG(); + __blk_mq_end_request(rq, error); +} +EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { @@ -305,18 +330,22 @@ static void __blk_mq_complete_request_remote(void *data) rq->q->softirq_done_fn(rq); } -void __blk_mq_complete_request(struct request *rq) +static void blk_mq_ipi_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; int cpu; - if (!ctx->ipi_redirect) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); - if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; @@ -327,6 +356,16 @@ void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void __blk_mq_complete_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (!q->softirq_done_fn) + blk_mq_end_request(rq, rq->errors); + else + blk_mq_ipi_complete_request(rq); +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -335,28 +374,53 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { - if (unlikely(blk_should_fake_timeout(rq->q))) + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq, bool last) +int blk_mq_request_started(struct request *rq) +{ + return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} +EXPORT_SYMBOL_GPL(blk_mq_request_started); + +void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(q, rq); + rq->resid_len = blk_rq_bytes(rq); + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + + blk_add_timer(rq); + /* - * Just mark start time and set the started bit. Due to memory - * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. + * Ensure that ->deadline is visible before set the started + * flag and clear the completed flag. */ - rq->deadline = jiffies + q->rq_timeout; - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + smp_mb__before_atomic(); + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. + */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -366,94 +430,217 @@ static void blk_mq_start_request(struct request *rq, bool last) */ rq->nr_phys_segments++; } +} +EXPORT_SYMBOL(blk_mq_start_request); + +static void __blk_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_requeue(q, rq); + + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; + } +} + +void blk_mq_requeue_request(struct request *rq) +{ + __blk_mq_requeue_request(rq); + + BUG_ON(blk_queued_rq(rq)); + blk_mq_add_to_requeue_list(rq, true); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } /* - * Flag the last request in the series so that drivers know when IO - * should be kicked off, if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers should do kick off. - * If drive is busy, the last request might not have the bit set. + * Use the start variant of queue running here, so that running + * the requeue work will kick stopped queues. */ - if (last) - rq->cmd_flags |= REQ_END; + blk_mq_start_hw_queues(q); } -static void blk_mq_requeue_request(struct request *rq) +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) { struct request_queue *q = rq->q; + unsigned long flags; - trace_block_rq_requeue(q, rq); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. + */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_cancel_requeue_work(struct request_queue *q) +{ + cancel_work_sync(&q->requeue_work); +} +EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +void blk_mq_abort_requeue_list(struct request_queue *q) +{ + unsigned long flags; + LIST_HEAD(rq_list); + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); - rq->cmd_flags &= ~REQ_END; + while (!list_empty(&rq_list)) { + struct request *rq; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; + rq = list_first_entry(&rq_list, struct request, queuelist); + list_del_init(&rq->queuelist); + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } } +EXPORT_SYMBOL(blk_mq_abort_requeue_list); + +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +{ + return tags->rqs[tag]; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { - struct blk_mq_hw_ctx *hctx; - unsigned long *next; - unsigned int *next_set; + unsigned long next; + unsigned int next_set; }; -static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_timeout_data *data = __data; - struct blk_mq_hw_ctx *hctx = data->hctx; - unsigned int tag; + struct blk_mq_ops *ops = req->q->mq_ops; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - /* It may not be in flight yet (this is where - * the REQ_ATOMIC_STARTED flag comes in). The requests are - * statically allocated, so we know it's always safe to access the - * memory associated with a bit offset into ->rqs[]. + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. */ - tag = 0; - do { - struct request *rq; + if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) + return; - tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); - if (tag >= hctx->queue_depth) - break; + if (ops->timeout) + ret = ops->timeout(req, reserved); + + switch (ret) { + case BLK_EH_HANDLED: + __blk_mq_complete_request(req); + break; + case BLK_EH_RESET_TIMER: + blk_add_timer(req); + blk_clear_rq_complete(req); + break; + case BLK_EH_NOT_HANDLED: + break; + default: + printk(KERN_ERR "block: bad eh return: %d\n", ret); + break; + } +} - rq = hctx->rqs[tag++]; +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct blk_mq_timeout_data *data = priv; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - continue; + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + /* + * If a request wasn't started before the queue was + * marked dying, kill it here or it'll go unnoticed. + */ + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); + return; + } - blk_rq_check_expired(rq, data->next, data->next_set); - } while (1); + if (time_after_eq(jiffies, rq->deadline)) { + if (!blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq, reserved); + } else if (!data->next_set || time_after(data->next, rq->deadline)) { + data->next = rq->deadline; + data->next_set = 1; + } } -static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, - unsigned long *next, - unsigned int *next_set) +static void blk_mq_timeout_work(struct work_struct *work) { + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { - .hctx = hctx, - .next = next, - .next_set = next_set, + .next = 0, + .next_set = 0, }; + int i; - /* - * Ask the tagging code to iterate busy requests, so we can - * check them for timeout. - */ - blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); -} + if (blk_queue_enter(q, true)) + return; -static void blk_mq_rq_timer(unsigned long data) -{ - struct request_queue *q = (struct request_queue *) data; - struct blk_mq_hw_ctx *hctx; - unsigned long next = 0; - int i, next_set = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + if (data.next_set) { + data.next = blk_rq_timeout(round_jiffies_up(data.next)); + mod_timer(&q->timeout, data.next); + } else { + struct blk_mq_hw_ctx *hctx; - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); + queue_for_each_hw_ctx(q, hctx, i) { + /* the hctx may be unmapped, so check it here */ + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); + } + } + blk_queue_exit(q); } /* @@ -495,9 +682,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -void blk_mq_add_timer(struct request *rq) +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - __blk_add_timer(rq, NULL); + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } } /* @@ -509,10 +725,13 @@ void blk_mq_add_timer(struct request *rq) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct blk_mq_ctx *ctx; struct request *rq; LIST_HEAD(rq_list); - int bit, queued; + LIST_HEAD(driver_list); + struct list_head *dptr; + int queued; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; @@ -522,15 +741,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { - clear_bit(bit, hctx->ctx_map); - ctx = hctx->ctxs[bit]; - BUG_ON(bit != ctx->index_hw); - - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, &rq_list); - spin_unlock(&ctx->lock); - } + flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -544,45 +755,52 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* - * Delete and return all entries from our dispatch list + * Start off with dptr being NULL, so we start the first request + * immediately, even if we have more pending. */ - queued = 0; + dptr = NULL; /* * Now process all the entries, sending them to the driver. */ + queued = 0; while (!list_empty(&rq_list)) { + struct blk_mq_queue_data bd; int ret; rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq, list_empty(&rq_list)); + bd.rq = rq; + bd.list = dptr; + bd.last = list_empty(&rq_list); - ret = q->mq_ops->queue_rq(hctx, rq); + ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; continue; case BLK_MQ_RQ_QUEUE_BUSY: - /* - * FIXME: we should have a mechanism to stop the queue - * like blk_stop_queue, otherwise we will waste cpu - * time - */ list_add(&rq->queuelist, &rq_list); - blk_mq_requeue_request(rq); + __blk_mq_requeue_request(rq); break; default: pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); break; } if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; + + /* + * We've done the first request. If we have more than 1 + * left in the list, set dptr to defer issue. + */ + if (!dptr && rq_list.next != rq_list.prev) + dptr = &driver_list; } if (!queued) @@ -598,24 +816,68 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) spin_lock(&hctx->lock); list_splice(&rq_list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* + * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but + * it's possible the queue is stopped and restarted again + * before this. Queue restart will dispatch requests. And since + * requests in rq_list aren't added into hctx->dispatch yet, + * the requests in rq_list might get lost. + * + * blk_mq_run_hw_queue() already checks the STOPPED bit + **/ + blk_mq_run_hw_queue(hctx, true); } } +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->queue->nr_hw_queues == 1) + return WORK_CPU_UNBOUND; + + if (--hctx->next_cpu_batch <= 0) { + int cpu = hctx->next_cpu, next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + + return cpu; + } + + return hctx->next_cpu; +} + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || + !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) - __blk_mq_run_hw_queue(hctx); - else { - struct request_queue *q = hctx->queue; + if (!async) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + put_cpu(); + return; + } - kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); + put_cpu(); } + + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work, 0); } -void blk_mq_run_queues(struct request_queue *q, bool async) +void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; @@ -629,11 +891,12 @@ void blk_mq_run_queues(struct request_queue *q, bool async) blk_mq_run_hw_queue(hctx, async); } } -EXPORT_SYMBOL(blk_mq_run_queues); +EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->delayed_work); + cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); @@ -651,11 +914,22 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - __blk_mq_run_hw_queue(hctx); + + blk_mq_run_hw_queue(hctx, false); } EXPORT_SYMBOL(blk_mq_start_hw_queue); -void blk_mq_start_stopped_hw_queues(struct request_queue *q) +void blk_mq_start_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queues); + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; @@ -665,36 +939,60 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, true); + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -static void blk_mq_work_fn(struct work_struct *work) +static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + __blk_mq_run_hw_queue(hctx); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static void blk_mq_delay_work_fn(struct work_struct *work) { - struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + + if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) + __blk_mq_run_hw_queue(hctx); +} + +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + return; + + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delay_work, msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_queue); +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) +{ trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); - blk_mq_hctx_mark_pending(hctx, ctx); +} - /* - * We do this early, to ensure we are on the right CPU. - */ - blk_mq_add_timer(rq); +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); + blk_mq_hctx_mark_pending(hctx, ctx); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -710,19 +1008,14 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) && - !(rq->cmd_flags & (REQ_FLUSH_SEQ))) { - blk_insert_flush(rq); - } else { - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - } - - blk_mq_put_ctx(current_ctx); + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); if (run_queue) blk_mq_run_hw_queue(hctx, async); + + blk_mq_put_ctx(current_ctx); } static void blk_mq_insert_requests(struct request_queue *q, @@ -754,13 +1047,13 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - blk_mq_put_ctx(current_ctx); - blk_mq_run_hw_queue(hctx, from_schedule); + blk_mq_put_ctx(current_ctx); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -823,109 +1116,295 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - blk_account_io_start(rq, 1); + + if (blk_do_io_stat(rq)) + blk_account_io_start(rq, 1); } -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - const int is_sync = rw_is_sync(bio->bi_rw); - const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - int rw = bio_data_dir(bio); - struct request *rq; - unsigned int use_plug, request_count = 0; + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) +{ + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + struct request_queue *q = hctx->queue; - blk_queue_bounce(q, &bio); + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; } +} - if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) - return; +struct blk_map_ctx { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; +}; - if (blk_mq_queue_enter(q)) { - bio_endio(bio, -EIO); - return; - } +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + int rw = bio_data_dir(bio); + struct blk_mq_alloc_data alloc_data; + blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (is_sync) + if (rw_is_sync(bio->bi_rw)) rw |= REQ_SYNC; + trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); - if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); - else { + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); - ctx = rq->mq_ctx; + + ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw); + ctx = alloc_data.ctx; + hctx = alloc_data.hctx; } hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) { + *cookie = new_cookie; + return 0; + } + + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + *cookie = BLK_QC_T_NONE; + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + + return -1; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + struct blk_map_ctx data; + struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + blk_qc_t cookie; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_io_error(bio); + return BLK_QC_T_NONE; + } + + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); - blk_mq_put_ctx(ctx); blk_insert_flush(rq); goto run_queue; } + plug = current->plug; /* - * A task plug currently exists. Since this is completely lockless, - * utilize that to temporarily store requests until the task is - * either done or scheduled away. + * If the driver supports defer issued based on 'last', then + * queue it up like normal since we can potentially save some + * CPU this way. */ - if (use_plug) { - struct blk_plug *plug = current->plug; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; + blk_mq_bio_to_request(rq, bio); + + /* + * We do limited pluging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most + */ if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is + * empty + */ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(ctx); - return; - } + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + goto done; + if (!blk_mq_direct_issue_request(old_rq, &cookie)) + goto done; + blk_mq_insert_request(old_rq, false, true, true); + goto done; } - spin_lock(&ctx->lock); + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } + blk_mq_put_ctx(data.ctx); +done: + return cookie; +} - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - blk_mq_attempt_merge(q, ctx, bio)) - __blk_mq_free_request(hctx, ctx, rq); - else { - blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq, false); +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); + struct blk_plug *plug; + unsigned int request_count = 0; + struct blk_map_ctx data; + struct request *rq; + blk_qc_t cookie; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_io_error(bio); + return BLK_QC_T_NONE; } - spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } /* - * For a SYNC request, send it to the hardware immediately. For an - * ASYNC request, just ensure that we run it later on. The latter - * allows for merging opportunities and more efficient dispatching. + * A task plug currently exists. Since this is completely lockless, + * utilize that to temporarily store requests until the task is + * either done or scheduled away. */ + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (!request_count) + trace_block_plug(q); + + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); + } + + list_add_tail(&rq->queuelist, &plug->mq_list); + return cookie; + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ run_queue: - blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } + + blk_mq_put_ctx(data.ctx); + return cookie; } /* @@ -937,32 +1416,165 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) } EXPORT_SYMBOL(blk_mq_map_queue); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, - unsigned int hctx_index) +static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) { - return kmalloc_node(sizeof(struct blk_mq_hw_ctx), - GFP_KERNEL | __GFP_ZERO, reg->numa_node); + struct page *page; + + if (tags->rqs && set->ops->exit_request) { + int i; + + for (i = 0; i < tags->nr_tags; i++) { + if (!tags->rqs[i]) + continue; + set->ops->exit_request(set->driver_data, tags->rqs[i], + hctx_idx, i); + tags->rqs[i] = NULL; + } + } + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + + kfree(tags->rqs); + + blk_mq_free_tags(tags); } -EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); -void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, - unsigned int hctx_index) +static size_t order_to_size(unsigned int order) { - kfree(hctx); + return (size_t)PAGE_SIZE << order; } -EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static void blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + struct blk_mq_tags *tags; + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + set->numa_node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + if (!tags) + return NULL; + + INIT_LIST_HEAD(&tags->page_list); + + tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); + if (!tags->rqs) { + blk_mq_free_tags(tags); + return NULL; + } + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + set->cmd_size, + cache_line_size()); + left = rq_size * set->queue_depth; + + for (i = 0; i < set->queue_depth; ) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (left < order_to_size(this_order - 1) && this_order) + this_order--; + + do { + page = alloc_pages_node(set->numa_node, + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, + this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); + + if (!page) + goto fail; + + page->private = this_order; + list_add_tail(&page->lru, &tags->page_list); + + p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, set->queue_depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + tags->rqs[i] = p; + if (set->ops->init_request) { + if (set->ops->init_request(set->driver_data, + tags->rqs[i], hctx_idx, i, + set->numa_node)) { + tags->rqs[i] = NULL; + goto fail; + } + } + + p += rq_size; + i++; + } + } + return tags; + +fail: + blk_mq_free_rq_map(set, tags, hctx_idx); + return NULL; +} + +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ + kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ + unsigned int bpw = 8, total, num_maps, i; + + bitmap->bits_per_word = bpw; + + num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; + bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bitmap->map) + return -ENOMEM; + + total = nr_cpu_ids; + for (i = 0; i < num_maps; i++) { + bitmap->map[i].depth = min(total, bitmap->bits_per_word); + total -= bitmap->map[i].depth; + } + + return 0; +} + +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) { - struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return; - /* * Move ctx entries to new CPU, if this one is going away. */ @@ -971,12 +1583,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_list)) { list_splice_init(&ctx->rq_list, &tmp); - clear_bit(ctx->index_hw, hctx->ctx_map); + blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return; + return NOTIFY_OK; ctx = blk_mq_get_ctx(q); spin_lock(&ctx->lock); @@ -993,231 +1605,154 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); blk_mq_run_hw_queue(hctx, true); + blk_mq_put_ctx(ctx); + return NOTIFY_OK; } -static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static int blk_mq_hctx_notify(void *data, unsigned long action, + unsigned int cpu) { - unsigned int i; - int ret = 0; + struct blk_mq_hw_ctx *hctx = data; - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + return blk_mq_hctx_cpu_offline(hctx, cpu); - ret = init(data, hctx, rq, i); - if (ret) - break; - } + /* + * In case of CPU online, tags may be reallocated + * in blk_mq_map_swqueue() after mapping is updated. + */ - return ret; + return NOTIFY_OK; } -int blk_mq_init_commands(struct request_queue *q, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +/* hctx->ctxs will be freed in queue's release handler */ +static void blk_mq_exit_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - int ret = 0; + unsigned flush_start_tag = set->queue_depth; - queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_init_hw_commands(hctx, init, data); - if (ret) - break; - } + blk_mq_tag_idle(hctx); - return ret; + if (set->ops->exit_request) + set->ops->exit_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx); + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_free_flush_queue(hctx->fq); + blk_mq_free_bitmap(&hctx->ctx_map); } -EXPORT_SYMBOL(blk_mq_init_commands); -static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) { + struct blk_mq_hw_ctx *hctx; unsigned int i; - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; - - free(data, hctx, rq, i); + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; + blk_mq_exit_hctx(q, set, hctx, i); } } -void blk_mq_free_commands(struct request_queue *q, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static void blk_mq_free_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; unsigned int i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_free_hw_commands(hctx, free, data); + free_cpumask_var(hctx->cpumask); } -EXPORT_SYMBOL(blk_mq_free_commands); -static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) +static int blk_mq_init_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - struct page *page; + int node; + unsigned flush_start_tag = set->queue_depth; - while (!list_empty(&hctx->page_list)) { - page = list_first_entry(&hctx->page_list, struct page, lru); - list_del_init(&page->lru); - __free_pages(page, page->private); - } - - kfree(hctx->rqs); - - if (hctx->tags) - blk_mq_free_tags(hctx->tags); -} - -static size_t order_to_size(unsigned int order) -{ - size_t ret = PAGE_SIZE; - - while (order--) - ret *= 2; - - return ret; -} + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = set->numa_node; -static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, - unsigned int reserved_tags, int node) -{ - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = hctx_idx; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; - INIT_LIST_HEAD(&hctx->page_list); + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); - hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), - GFP_KERNEL, node); - if (!hctx->rqs) - return -ENOMEM; + hctx->tags = set->tags[hctx_idx]; /* - * rq_size is the size of the request plus driver payload, rounded - * to the cacheline size + * Allocate space for all possible cpus to avoid allocation at + * runtime */ - rq_size = round_up(sizeof(struct request) + hctx->cmd_size, - cache_line_size()); - left = rq_size * hctx->queue_depth; - - for (i = 0; i < hctx->queue_depth;) { - int this_order = max_order; - struct page *page; - int to_do; - void *p; + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + goto unregister_cpu_notifier; - while (left < order_to_size(this_order - 1) && this_order) - this_order--; + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + goto free_ctxs; - do { - page = alloc_pages_node(node, GFP_KERNEL, this_order); - if (page) - break; - if (!this_order--) - break; - if (order_to_size(this_order) < rq_size) - break; - } while (1); + hctx->nr_ctx = 0; - if (!page) - break; + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto free_bitmap; - page->private = this_order; - list_add_tail(&page->lru, &hctx->page_list); + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + if (!hctx->fq) + goto exit_hctx; - p = page_address(page); - entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, hctx->queue_depth - i); - left -= to_do * rq_size; - for (j = 0; j < to_do; j++) { - hctx->rqs[i] = p; - blk_mq_rq_init(hctx, hctx->rqs[i]); - p += rq_size; - i++; - } - } + if (set->ops->init_request && + set->ops->init_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx, node)) + goto free_fq; - if (i < (reserved_tags + BLK_MQ_TAG_MIN)) - goto err_rq_map; - else if (i != hctx->queue_depth) { - hctx->queue_depth = i; - pr_warn("%s: queue depth set to %u because of low memory\n", - __func__, i); - } + return 0; - hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); - if (!hctx->tags) { -err_rq_map: - blk_mq_free_rq_map(hctx); - return -ENOMEM; - } + free_fq: + kfree(hctx->fq); + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + free_bitmap: + blk_mq_free_bitmap(&hctx->ctx_map); + free_ctxs: + kfree(hctx->ctxs); + unregister_cpu_notifier: + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - return 0; + return -1; } static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_reg *reg, void *driver_data) + struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; - unsigned int i, j; + unsigned int i; /* * Initialize hardware queues */ queue_for_each_hw_ctx(q, hctx, i) { - unsigned int num_maps; - int node; - - node = hctx->numa_node; - if (node == NUMA_NO_NODE) - node = hctx->numa_node = reg->numa_node; - - INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); - spin_lock_init(&hctx->lock); - INIT_LIST_HEAD(&hctx->dispatch); - hctx->queue = q; - hctx->queue_num = i; - hctx->flags = reg->flags; - hctx->queue_depth = reg->queue_depth; - hctx->cmd_size = reg->cmd_size; - - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); - - if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) - break; - - /* - * Allocate space for all possible cpus to avoid allocation in - * runtime - */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), - GFP_KERNEL, node); - if (!hctx->ctxs) - break; - - num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; - hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), - GFP_KERNEL, node); - if (!hctx->ctx_map) - break; - - hctx->nr_ctx_map = num_maps; - hctx->nr_ctx = 0; - - if (reg->ops->init_hctx && - reg->ops->init_hctx(hctx, driver_data, i)) + if (blk_mq_init_hctx(q, set, hctx, i)) break; } @@ -1227,17 +1762,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, /* * Init failed */ - queue_for_each_hw_ctx(q, hctx, j) { - if (i == j) - break; - - if (reg->ops->exit_hctx) - reg->ops->exit_hctx(hctx, j); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - blk_mq_free_rq_map(hctx); - kfree(hctx->ctxs); - } + blk_mq_exit_hw_queues(q, set, i); return 1; } @@ -1258,28 +1783,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, __ctx->queue = q; /* If the cpu isn't online, the cpu is mapped to first hctx */ - hctx = q->mq_ops->map_queue(q, i); - hctx->nr_ctx++; - if (!cpu_online(i)) continue; + hctx = q->mq_ops->map_queue(q, i); + /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } -static void blk_mq_map_swqueue(struct request_queue *q) +static void blk_mq_map_swqueue(struct request_queue *q, + const struct cpumask *online_mask) { unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; + struct blk_mq_tag_set *set = q->tag_set; + + /* + * Avoid others reading imcomplete hctx->cpumask through sysfs + */ + mutex_lock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { + cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; } @@ -1288,156 +1820,288 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ + if (!cpumask_test_cpu(i, online_mask)) + continue; + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } + + mutex_unlock(&q->sysfs_lock); + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_ctxmap *map = &hctx->ctx_map; + + /* + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. + */ + if (!hctx->nr_ctx) { + if (set->tags[i]) { + blk_mq_free_rq_map(set, set->tags[i], i); + set->tags[i] = NULL; + } + hctx->tags = NULL; + continue; + } + + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[i]) + set->tags[i] = blk_mq_init_rq_map(set, i); + hctx->tags = set->tags[i]; + WARN_ON(!hctx->tags); + + cpumask_copy(hctx->tags->cpumask, hctx->cpumask); + /* + * Set the map size to the number of mapped software queues. + * This is more accurate and more efficient than looping + * over all possibly mapped software queues. + */ + map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + + /* + * Initialize batch roundrobin counts + */ + hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } } -struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, - void *driver_data) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx *ctx; - struct request_queue *q; + struct blk_mq_hw_ctx *hctx; int i; - if (!reg->nr_hw_queues || - !reg->ops->queue_rq || !reg->ops->map_queue || - !reg->ops->alloc_hctx || !reg->ops->free_hctx) - return ERR_PTR(-EINVAL); + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); + queue_set_hctx_shared(q, shared); + blk_mq_unfreeze_queue(q); + } +} - if (!reg->queue_depth) - reg->queue_depth = BLK_MQ_MAX_DEPTH; - else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { - pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); - reg->queue_depth = BLK_MQ_MAX_DEPTH; +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + mutex_lock(&set->tag_list_lock); + list_del_init(&q->tag_set_list); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); } + mutex_unlock(&set->tag_list_lock); +} - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) - return ERR_PTR(-EINVAL); +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->tag_set = set; + + mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); + list_add_tail(&q->tag_set_list, &set->tag_list); + + mutex_unlock(&set->tag_list_lock); +} + +/* + * It is the actual release handler for mq, but we do it from + * request queue's release handler for avoiding use-after-free + * and headache because q->mq_kobj shouldn't have been introduced, + * but we can't group ctx/kctx kobj without it. + */ +void blk_mq_release(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + /* hctx kobj stays in hctx */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx) + continue; + kfree(hctx->ctxs); + kfree(hctx); + } + + kfree(q->mq_map); + q->mq_map = NULL; + + kfree(q->queue_hw_ctx); + + /* ctx kobj stays in queue_ctx */ + free_percpu(q->queue_ctx); +} + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + struct request_queue *uninit_q, *q; + + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + if (!uninit_q) + return ERR_PTR(-ENOMEM); + + q = blk_mq_init_allocated_queue(set, uninit_q); + if (IS_ERR(q)) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL(blk_mq_init_queue); + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + struct blk_mq_hw_ctx **hctxs; + struct blk_mq_ctx __percpu *ctx; + unsigned int *map; + int i; ctx = alloc_percpu(struct blk_mq_ctx); if (!ctx) return ERR_PTR(-ENOMEM); - hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - reg->numa_node); + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, + set->numa_node); if (!hctxs) goto err_percpu; - for (i = 0; i < reg->nr_hw_queues; i++) { - hctxs[i] = reg->ops->alloc_hctx(reg, i); + map = blk_mq_make_queue_map(set); + if (!map) + goto err_map; + + for (i = 0; i < set->nr_hw_queues; i++) { + int node = blk_mq_hw_queue_to_node(map, i); + + hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), + GFP_KERNEL, node); if (!hctxs[i]) goto err_hctxs; - hctxs[i]->numa_node = NUMA_NO_NODE; + if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, + node)) + goto err_hctxs; + + atomic_set(&hctxs[i]->nr_active, 0); + hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; } - q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); - if (!q) - goto err_hctxs; - - q->mq_map = blk_mq_make_queue_map(reg); - if (!q->mq_map) - goto err_map; - - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); - blk_queue_rq_timeout(q, 30000); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); + blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = reg->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + q->mq_map = map; q->queue_ctx = ctx; q->queue_hw_ctx = hctxs; - q->mq_ops = reg->ops; + q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (!(set->flags & BLK_MQ_F_SG_MERGE)) + q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; + q->sg_reserved_size = INT_MAX; - blk_queue_make_request(q, blk_mq_make_request); - blk_queue_rq_timed_out(q, reg->ops->timeout); - if (reg->timeout) - blk_queue_rq_timeout(q, reg->timeout); + INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); - if (reg->ops->complete) - blk_queue_softirq_done(q, reg->ops->complete); + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); - blk_mq_init_flush(q); - blk_mq_init_cpu_queues(q, reg->nr_hw_queues); + /* + * Do this after blk_queue_make_request() overrides it... + */ + q->nr_requests = set->queue_depth; - q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, - cache_line_size()), GFP_KERNEL); - if (!q->flush_rq) - goto err_hw; + if (set->ops->complete) + blk_queue_softirq_done(q, set->ops->complete); - if (blk_mq_init_hw_queues(q, reg, driver_data)) - goto err_flush_rq; + blk_mq_init_cpu_queues(q, set->nr_hw_queues); - blk_mq_map_swqueue(q); + if (blk_mq_init_hw_queues(q, set)) + goto err_hctxs; + get_online_cpus(); mutex_lock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); + blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q, cpu_online_mask); + mutex_unlock(&all_q_mutex); + put_online_cpus(); return q; -err_flush_rq: - kfree(q->flush_rq); -err_hw: - kfree(q->mq_map); -err_map: - blk_cleanup_queue(q); err_hctxs: - for (i = 0; i < reg->nr_hw_queues; i++) { + kfree(map); + for (i = 0; i < set->nr_hw_queues; i++) { if (!hctxs[i]) break; - reg->ops->free_hctx(hctxs[i], i); + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); } +err_map: kfree(hctxs); err_percpu: free_percpu(ctx); return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_init_allocated_queue); void blk_mq_free_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - int i; - - queue_for_each_hw_ctx(q, hctx, i) { - kfree(hctx->ctx_map); - kfree(hctx->ctxs); - blk_mq_free_rq_map(hctx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (q->mq_ops->exit_hctx) - q->mq_ops->exit_hctx(hctx, i); - q->mq_ops->free_hctx(hctx, i); - } - - free_percpu(q->queue_ctx); - kfree(q->queue_hw_ctx); - kfree(q->mq_map); - - q->queue_ctx = NULL; - q->queue_hw_ctx = NULL; - q->mq_map = NULL; + struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&all_q_mutex); list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + blk_mq_free_hw_queues(q, set); } /* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q, + const struct cpumask *online_mask) { - blk_mq_freeze_queue(q); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + blk_mq_sysfs_unregister(q); + + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe @@ -1445,33 +2109,236 @@ static void blk_mq_queue_reinit(struct request_queue *q) * involves free and re-allocate memory, worthy doing?) */ - blk_mq_map_swqueue(q); + blk_mq_map_swqueue(q, online_mask); - blk_mq_unfreeze_queue(q); + blk_mq_sysfs_register(q); } static int blk_mq_queue_reinit_notify(struct notifier_block *nb, unsigned long action, void *hcpu) { struct request_queue *q; + int cpu = (unsigned long)hcpu; + /* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ + static struct cpumask online_new; /* - * Before new mapping is established, hotadded cpu might already start - * handling requests. This doesn't break anything as we map offline - * CPUs to first hardware queue. We will re-init queue below to get - * optimal settings. + * Before hotadded cpu starts handling requests, new mappings must + * be established. Otherwise, these requests in hw queue might + * never be dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues + * (ctx0 for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into + * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is + * still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is + * set in pending bitmap and tries to retrieve requests in + * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, + * so the request in ctx1->rq_list is ignored. */ - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DEAD: + case CPU_UP_CANCELED: + cpumask_copy(&online_new, cpu_online_mask); + break; + case CPU_UP_PREPARE: + cpumask_copy(&online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &online_new); + break; + default: return NOTIFY_OK; + } mutex_lock(&all_q_mutex); + + /* + * We need to freeze and reinit all existing queues. Freezing + * involves synchronous wait for an RCU grace period and doing it + * one by one may take a long time. Start freezing all queues in + * one swoop and then wait for the completions so that freezing can + * take place in parallel. + */ + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_freeze_queue_start(q); + list_for_each_entry(q, &all_q_list, all_q_node) { + blk_mq_freeze_queue_wait(q); + + /* + * timeout handler can't touch hw queue during the + * reinitialization + */ + del_timer_sync(&q->timeout); + } + + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_queue_reinit(q, &online_new); + list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q); + blk_mq_unfreeze_queue(q); + mutex_unlock(&all_q_mutex); return NOTIFY_OK; } +static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); + + return -ENOMEM; +} + +/* + * Allocate the request maps associated with this tag_set. Note that this + * may reduce the depth asked for, if memory is tight. set->queue_depth + * will be updated to reflect the allocated depth. + */ +static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + unsigned int depth; + int err; + + depth = set->queue_depth; + do { + err = __blk_mq_alloc_rq_maps(set); + if (!err) + break; + + set->queue_depth >>= 1; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { + err = -ENOMEM; + break; + } + } while (set->queue_depth); + + if (!set->queue_depth || err) { + pr_err("blk-mq: failed to allocate request map\n"); + return -ENOMEM; + } + + if (depth != set->queue_depth) + pr_info("blk-mq: reduced tag depth (%u -> %u)\n", + depth, set->queue_depth); + + return 0; +} + +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if if it too large. In that case, the set + * value will be stored in set->queue_depth. + */ +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) +{ + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); + + if (!set->nr_hw_queues) + return -EINVAL; + if (!set->queue_depth) + return -EINVAL; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) + return -EINVAL; + + if (!set->ops->queue_rq || !set->ops->map_queue) + return -EINVAL; + + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_info("blk-mq: reduced tag depth to %u\n", + BLK_MQ_MAX_DEPTH); + set->queue_depth = BLK_MQ_MAX_DEPTH; + } + + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + + set->tags = kmalloc_node(set->nr_hw_queues * + sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!set->tags) + return -ENOMEM; + + if (blk_mq_alloc_rq_maps(set)) + goto enomem; + + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + + return 0; +enomem: + kfree(set->tags); + set->tags = NULL; + return -ENOMEM; +} +EXPORT_SYMBOL(blk_mq_alloc_tag_set); + +void blk_mq_free_tag_set(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + if (set->tags[i]) + blk_mq_free_rq_map(set, set->tags[i], i); + } + + kfree(set->tags); + set->tags = NULL; +} +EXPORT_SYMBOL(blk_mq_free_tag_set); + +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set || nr > set->queue_depth) + return -EINVAL; + + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_tag_update_depth(hctx->tags, nr); + if (ret) + break; + } + + if (!ret) + q->nr_requests = nr; + + return ret; +} + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); @@ -1486,8 +2353,7 @@ static int __init blk_mq_init(void) { blk_mq_cpu_init(); - /* Must be called after percpu_counter_hotcpu_callback() */ - hotcpu_notifier(blk_mq_queue_reinit_notify, -10); + hotcpu_notifier(blk_mq_queue_reinit_notify, 0); return 0; } diff --git a/block/blk-mq.h b/block/blk-mq.h index ebbe6bac9d61..eaede8e45c9c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +struct blk_mq_tag_set; + struct blk_mq_ctx { struct { spinlock_t lock; @@ -9,7 +11,8 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int ipi_redirect; + + unsigned int last_tag ____cacheline_aligned_in_smp; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -20,21 +23,20 @@ struct blk_mq_ctx { struct request_queue *queue; struct kobject kobj; -}; +} ____cacheline_aligned_in_smp; -void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -void blk_mq_init_flush(struct request_queue *q); -void blk_mq_drain_queue(struct request_queue *q); +void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +void blk_mq_wake_waiters(struct request_queue *q); /* * CPU hotplug helpers */ struct blk_mq_cpu_notifier; void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data); void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); @@ -45,10 +47,75 @@ void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -struct blk_mq_reg; -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); + +/* + * sysfs helpers + */ +extern int blk_mq_sysfs_register(struct request_queue *q); +extern void blk_mq_sysfs_unregister(struct request_queue *q); + +extern void blk_mq_rq_timed_out(struct request *req, bool reserved); + +void blk_mq_release(struct request_queue *q); + +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; + +static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, + unsigned int cpu) +{ + return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. + */ +static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ + return __blk_mq_get_ctx(q, get_cpu()); +} + +static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) +{ + put_cpu(); +} + +struct blk_mq_alloc_data { + /* input parameter */ + struct request_queue *q; + unsigned int flags; + + /* input & output parameter */ + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; +}; + +static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) +{ + data->q = q; + data->flags = flags; + data->ctx = ctx; + data->hctx = hctx; +} -void blk_mq_add_timer(struct request *rq); +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 5d21239bc859..dd4973583978 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) } EXPORT_SYMBOL(blk_queue_unprep_rq); -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. - */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} -EXPORT_SYMBOL(blk_queue_merge_bvec); - void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) { q->softirq_done_fn = fn; @@ -111,10 +89,14 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segments = BLK_MAX_SEGMENTS; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = + BLK_SAFE_MAX_SECTORS; + lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; + lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -146,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -233,23 +216,29 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: * Enables a low level driver to set a hard upper limit, * max_hw_sectors, on the size of requests. max_hw_sectors is set by - * the device driver based upon the combined capabilities of I/O - * controller and storage device. + * the device driver based upon the capabilities of the I/O + * controller. + * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block/<device>/queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", @@ -257,24 +246,31 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ } limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } -EXPORT_SYMBOL(blk_limits_max_hw_sectors); +EXPORT_SYMBOL(blk_queue_max_hw_sectors); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * blk_queue_chunk_sectors - set size of the chunk for this queue * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @chunk_sectors: chunk sectors in the usual 512b unit * * Description: - * See description for blk_limits_max_hw_sectors(). + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the chunk size + * must currently be a power-of-2 in sectors. Also note that the block + * layer must accept a page worth of data at any offset. So if the + * crossing of chunks is a hard limitation in the driver, it must still be + * prepared to split single page bios. **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + BUG_ON(!is_power_of_2(chunk_sectors)); + q->limits.chunk_sectors = chunk_sectors; } -EXPORT_SYMBOL(blk_queue_max_hw_sectors); +EXPORT_SYMBOL(blk_queue_chunk_sectors); /** * blk_queue_max_discard_sectors - set max sectors for a single discard @@ -284,6 +280,7 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { + q->limits.max_hw_discard_sectors = max_discard_sectors; q->limits.max_discard_sectors = max_discard_sectors; } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -525,12 +522,15 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); + t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, + b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, @@ -553,7 +553,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) { + if (max(top, bottom) % min(top, bottom)) { t->misaligned = 1; ret = -1; } @@ -566,7 +566,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); - t->io_opt = lcm(t->io_opt, b->io_opt); + t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; @@ -597,8 +597,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->raid_partial_stripes_expensive); /* Find lowest common alignment_offset */ - t->alignment_offset = lcm(t->alignment_offset, alignment) - & (max(t->physical_block_size, t->io_min) - 1); + t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) + % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { @@ -622,9 +622,11 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); + t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, + b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); - t->discard_alignment = lcm(t->discard_alignment, alignment) % + t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } @@ -769,6 +771,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) EXPORT_SYMBOL(blk_queue_segment_boundary); /** + * blk_queue_virt_boundary - set boundary rules for bio merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) +{ + q->limits.virt_boundary_mask = mask; +} +EXPORT_SYMBOL(blk_queue_virt_boundary); + +/** * blk_queue_dma_alignment - set dma length and memory alignment * @q: the request queue for the device * @mask: alignment mask diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f876dae4..e140cc487ce1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -6,11 +6,12 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { @@ -48,11 +49,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -62,40 +62,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); + + if (err) + return err; - spin_unlock_irq(q->queue_lock); return ret; } @@ -171,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag return queue_var_show(q->limits.discard_granularity, page); } +static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) +{ + unsigned long long val; + + val = q->limits.max_hw_discard_sectors << 9; + return sprintf(page, "%llu\n", val); +} + static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { return sprintf(page, "%llu\n", (unsigned long long)q->limits.max_discard_sectors << 9); } +static ssize_t queue_discard_max_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long max_discard; + ssize_t ret = queue_var_store(&max_discard, page, count); + + if (ret < 0) + return ret; + + if (max_discard & (q->limits.discard_granularity - 1)) + return -EINVAL; + + max_discard >>= 9; + if (max_discard > UINT_MAX) + return -EINVAL; + + if (max_discard > q->limits.max_hw_discard_sectors) + max_discard = q->limits.max_hw_discard_sectors; + + q->limits.max_discard_sectors = max_discard; + return ret; +} + static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { return queue_var_show(queue_discard_zeroes_data(q), page); @@ -200,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; @@ -312,6 +320,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + spin_lock_irq(q->queue_lock); + if (poll_on) + queue_flag_set(QUEUE_FLAG_POLL, q); + else + queue_flag_clear(QUEUE_FLAG_POLL, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -386,9 +422,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = { .show = queue_discard_granularity_show, }; +static struct queue_sysfs_entry queue_discard_max_hw_entry = { + .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_hw_show, +}; + static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR }, .show = queue_discard_max_show, + .store = queue_discard_max_store, }; static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { @@ -431,6 +473,12 @@ static struct queue_sysfs_entry queue_random_entry = { .store = queue_store_random, }; +static struct queue_sysfs_entry queue_poll_entry = { + .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_show, + .store = queue_poll_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -447,6 +495,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, @@ -454,6 +503,7 @@ static struct attribute *default_attrs[] = { &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_random_entry.attr, + &queue_poll_entry.attr, NULL, }; @@ -519,17 +569,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * Currently, its primary task it to free all the &struct request * structures that were allocated to the queue and the queue itself. * - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... + * Note: + * The low level driver must have finished any outstanding requests first + * via blk_cleanup_queue(). **/ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - blk_sync_queue(q); - + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -544,16 +593,15 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - percpu_counter_destroy(&q->mq_usage_counter); - - if (q->mq_ops) - blk_mq_free_queue(q); - - kfree(q->flush_rq); + if (!q->mq_ops) + blk_free_flush_queue(q->fq); + else + blk_mq_release(q); blk_trace_shutdown(q); - bdi_destroy(&q->backing_dev_info); + if (q->bio_split) + bioset_free(q->bio_split); ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); @@ -580,11 +628,19 @@ int blk_register_queue(struct gendisk *disk) return -ENXIO; /* - * Initialization must be complete by now. Finish the initial - * bypass from queue allocation. + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. */ - blk_queue_bypass_end(q); - queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + if (!blk_queue_init_done(q)) { + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); + blk_queue_bypass_end(q); + } ret = blk_trace_init_sysfs(dev); if (ret) diff --git a/block/blk-tag.c b/block/blk-tag.c index 3f33d8672268..f0344e6939d5 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -27,18 +27,15 @@ struct request *blk_queue_find_tag(struct request_queue *q, int tag) EXPORT_SYMBOL(blk_queue_find_tag); /** - * __blk_free_tags - release a given set of tag maintenance info + * blk_free_tags - release a given set of tag maintenance info * @bqt: the tag map to free * - * Tries to free the specified @bqt. Returns true if it was - * actually freed and false if there are still references using it + * Drop the reference count on @bqt and frees it when the last reference + * is dropped. */ -static int __blk_free_tags(struct blk_queue_tag *bqt) +void blk_free_tags(struct blk_queue_tag *bqt) { - int retval; - - retval = atomic_dec_and_test(&bqt->refcnt); - if (retval) { + if (atomic_dec_and_test(&bqt->refcnt)) { BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < bqt->max_depth); @@ -50,9 +47,8 @@ static int __blk_free_tags(struct blk_queue_tag *bqt) kfree(bqt); } - - return retval; } +EXPORT_SYMBOL(blk_free_tags); /** * __blk_queue_free_tags - release tag maintenance info @@ -69,28 +65,13 @@ void __blk_queue_free_tags(struct request_queue *q) if (!bqt) return; - __blk_free_tags(bqt); + blk_free_tags(bqt); q->queue_tags = NULL; queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); } /** - * blk_free_tags - release a given set of tag maintenance info - * @bqt: the tag map to free - * - * For externally managed @bqt frees the map. Callers of this - * function must guarantee to have released all the queues that - * might have been using this tag map. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ - if (unlikely(!__blk_free_tags(bqt))) - BUG(); -} -EXPORT_SYMBOL(blk_free_tags); - -/** * blk_queue_free_tags - release tag maintenance info * @q: the request queue for the device * @@ -138,7 +119,7 @@ fail: } static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth) + int depth, int alloc_policy) { struct blk_queue_tag *tags; @@ -150,6 +131,8 @@ static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, goto fail; atomic_set(&tags->refcnt, 1); + tags->alloc_policy = alloc_policy; + tags->next_tag = 0; return tags; fail: kfree(tags); @@ -159,10 +142,11 @@ fail: /** * blk_init_tags - initialize the tag info for an external tag map * @depth: the maximum queue depth supported + * @alloc_policy: tag allocation policy **/ -struct blk_queue_tag *blk_init_tags(int depth) +struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) { - return __blk_queue_init_tags(NULL, depth); + return __blk_queue_init_tags(NULL, depth, alloc_policy); } EXPORT_SYMBOL(blk_init_tags); @@ -171,19 +155,20 @@ EXPORT_SYMBOL(blk_init_tags); * @q: the request queue for the device * @depth: the maximum queue depth supported * @tags: the tag to use + * @alloc_policy: tag allocation policy * * Queue lock must be held here if the function is called to resize an * existing map. **/ int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags) + struct blk_queue_tag *tags, int alloc_policy) { int rc; BUG_ON(tags && q->queue_tags && tags != q->queue_tags); if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth); + tags = __blk_queue_init_tags(q, depth, alloc_policy); if (!tags) return -ENOMEM; @@ -363,9 +348,21 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) } do { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; + if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { + tag = find_first_zero_bit(bqt->tag_map, max_depth); + if (tag >= max_depth) + return 1; + } else { + int start = bqt->next_tag; + int size = min_t(int, bqt->max_depth, max_depth + start); + tag = find_next_zero_bit(bqt->tag_map, size, start); + if (tag >= size && start + size > bqt->max_depth) { + size = start + size - bqt->max_depth; + tag = find_first_zero_bit(bqt->tag_map, size); + } + if (tag >= size) + return 1; + } } while (test_and_set_bit_lock(tag, bqt->tag_map)); /* @@ -373,6 +370,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * See blk_queue_end_tag for details. */ + bqt->next_tag = (tag + 1) % bqt->max_depth; rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..2149a1ddbacf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,49 +314,25 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) -{ - del_timer_sync(&sq->pending_timer); -} - -static void throtl_pd_init(struct blkcg_gq *blkg) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; + struct throtl_grp *tg; int rw; - /* - * If sane_hierarchy is enabled, we switch to properly hierarchical - * behavior where limits on a given throtl_grp are applied to the - * whole subtree rather than just the group itself. e.g. If 16M - * read_bps limit is set on the root group, the whole system can't - * exceed 16M for the device. - * - * If sane_hierarchy is not enabled, the broken flat hierarchy - * behavior is retained where all throtl_grps are treated as if - * they're all separate root groups right below throtl_data. - * Limits of a group don't interact with limits of other groups - * regardless of the position of the group in the hierarchy. - */ - parent_sq = &td->service_queue; - - if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; - throtl_service_queue_init(&tg->service_queue, parent_sq); + throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); @@ -437,22 +340,38 @@ static void throtl_pd_init(struct blkcg_gq *blkg) } RB_CLEAR_NODE(&tg->rb_node); - tg->td = td; - tg->bps[READ] = -1; tg->bps[WRITE] = -1; tg->iops[READ] = -1; tg->iops[WRITE] = -1; + return &tg->pd; +} + +static void throtl_pd_init(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td = blkg->q->td; + struct throtl_service_queue *sq = &tg->service_queue; + /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. + * If on the default hierarchy, we switch to properly hierarchical + * behavior where limits on a given throtl_grp are applied to the + * whole subtree rather than just the group itself. e.g. If 16M + * read_bps limit is set on the root group, the whole system can't + * exceed 16M for the device. + * + * If not on the default hierarchy, the broken flat hierarchy + * behavior is retained where all throtl_grps are treated as if + * they're all separate root groups right below throtl_data. + * Limits of a group don't interact with limits of other groups + * regardless of the position of the group in the hierarchy. */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); + sq->parent_sq = &td->service_queue; + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; + tg->td = td; } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -744,7 +601,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } @@ -842,7 +699,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -880,7 +737,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -923,7 +780,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1258,7 +1081,7 @@ out_unlock: * of throtl_data->service_queue. Those bio's are ready and issued by this * function. */ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); @@ -1285,31 +1108,6 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1346,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + cft->private) = ctx.v; - else - *(unsigned int *)((void *)tg + cft->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1384,7 +1162,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1402,57 +1180,198 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return 0; + return ret ?: nbytes; } -static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, true); + return tg_set_conf(of, buf, nbytes, off, true); } -static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, false); + return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1465,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1577,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1664,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index d96f7061c6fd..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -90,17 +90,10 @@ static void blk_rq_timed_out(struct request *req) switch (ret) { case BLK_EH_HANDLED: /* Can we use req->errors here? */ - if (q->mq_ops) - __blk_mq_complete_request(req); - else - __blk_complete_request(req); + __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - if (q->mq_ops) - blk_mq_add_timer(req); - else - blk_add_timer(req); - + blk_add_timer(req); blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: @@ -117,7 +110,7 @@ static void blk_rq_timed_out(struct request *req) } } -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, +static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { if (time_after_eq(jiffies, rq->deadline)) { @@ -134,13 +127,16 @@ void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -150,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -165,17 +162,43 @@ void blk_abort_request(struct request *req) { if (blk_mark_rq_complete(req)) return; - blk_delete_timer(req); - blk_rq_timed_out(req); + + if (req->q->mq_ops) { + blk_mq_rq_timed_out(req, false); + } else { + blk_delete_timer(req); + blk_rq_timed_out(req); + } } EXPORT_SYMBOL_GPL(blk_abort_request); -void __blk_add_timer(struct request *req, struct list_head *timeout_list) +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. + */ +void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; - if (!q->rq_timed_out_fn) + /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ + if (!q->mq_ops && !q->rq_timed_out_fn) return; BUG_ON(!list_empty(&req->timeout_list)); @@ -188,32 +211,34 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - if (timeout_list) - list_add_tail(&req->timeout_list, timeout_list); + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || - time_before(expiry, q->timeout.expires)) - mod_timer(&q->timeout, expiry); + time_before(expiry, q->timeout.expires)) { + unsigned long diff = q->timeout.expires - expiry; -} + /* + * Due to added timer slack to group timers, the timer + * will often be a little in front of what we asked for. + * So apply some tolerance here too, otherwise we keep + * modifying the timer because expires for value X + * will be X + something. + */ + if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) + mod_timer(&q->timeout, expiry); + } -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) -{ - __blk_add_timer(req, &req->q->timeout_list); } - diff --git a/block/blk.h b/block/blk.h index 1d880f1f957f..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,6 +2,8 @@ #define BLK_INTERNAL_H #include <linux/idr.h> +#include <linux/blk-mq.h> +#include "blk-mq.h" /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -9,16 +11,53 @@ /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + +struct blk_flush_queue { + unsigned int flush_queue_delayed:1; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; + struct request *flush_rq; + + /* + * flush_rq shares tag with this rq, both can't be active + * at the same time + */ + struct request *orig_rq; + spinlock_t mq_flush_lock; +}; + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; +static inline struct blk_flush_queue *blk_get_flush_queue( + struct request_queue *q, struct blk_mq_ctx *ctx) +{ + struct blk_mq_hw_ctx *hctx; + + if (!q->mq_ops) + return q->fq; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + return hctx->fq; +} + static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); } +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size); +void blk_free_flush_queue(struct blk_flush_queue *q); + int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_list *rl); @@ -33,13 +72,31 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); +void blk_freeze_queue(struct request_queue *q); -void blk_rq_timed_out_timer(unsigned long data); -void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, - unsigned int *next_set); -void __blk_add_timer(struct request *req, struct list_head *timeout_list); +static inline void blk_queue_enter_live(struct request_queue *q) +{ + /* + * Given that running in generic_make_request() context + * guarantees that a live reference against q_usage_counter has + * been established, further references under that same context + * need not check that the queue has been frozen (marked dead). + */ + percpu_ref_get(&q->q_usage_counter); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_flush_integrity(void); +#else +static inline void blk_flush_integrity(void) +{ +} +#endif + +void blk_timeout_work(struct work_struct *work); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); bool bio_attempt_front_merge(struct request_queue *q, struct request *req, @@ -47,7 +104,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); +unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); @@ -81,11 +140,11 @@ static inline void blk_clear_rq_complete(struct request *rq) #define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) void blk_insert_flush(struct request *rq); -void blk_abort_flushes(struct request_queue *q); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); while (1) { if (!list_empty(&q->queue_head)) { @@ -108,9 +167,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) * should be restarted later. Please see flush_end_io() for * details. */ - if (q->flush_pending_idx != q->flush_running_idx && + if (fq->flush_pending_idx != fq->flush_running_idx && !queue_flush_queueable(q)) { - q->flush_queue_delayed = 1; + fq->flush_queue_delayed = 1; return NULL; } if (unlikely(blk_queue_bypass(q)) || @@ -162,8 +221,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); @@ -185,6 +242,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); + /* * Contribute to IO statistics IFF: * @@ -234,15 +293,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/bounce.c b/block/bounce.c new file mode 100644 index 000000000000..1cb5dd3a5da1 --- /dev/null +++ b/block/bounce.c @@ -0,0 +1,268 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/export.h> +#include <linux/swap.h> +#include <linux/gfp.h> +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/printk.h> +#include <asm/tlbflush.h> + +#include <trace/events/block.h> + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static mempool_t *page_pool, *isa_page_pool; + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) +static __init int init_emergency_pool(void) +{ +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + if (max_pfn <= max_low_pfn) + return 0; +#endif + + page_pool = mempool_create_page_pool(POOL_SIZE, 0); + BUG_ON(!page_pool); + pr_info("pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); +#endif + +#ifdef CONFIG_HIGHMEM +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(!isa_page_pool); + + pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bvec_iter iter; + + bio_for_each_segment(tovec, to, iter) { + if (tovec.bv_page != fromvec->bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + flush_dcache_page(tovec.bv_page); + } + + fromvec++; + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + int start = bio_orig->bi_iter.bi_idx; + + /* + * free up bounce indirect pages used + */ + bio_for_each_segment_all(bvec, bio, i) { + org_vec = bio_orig->bi_io_vec + i + start; + + if (bvec->bv_page == org_vec->bv_page) + continue; + + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + + bio_orig->bi_error = bio->bi_error; + bio_endio(bio_orig); + bio_put(bio); +} + +static void bounce_end_io_write(struct bio *bio) +{ + bounce_end_io(bio, page_pool); +} + +static void bounce_end_io_write_isa(struct bio *bio) +{ + + bounce_end_io(bio, isa_page_pool); +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +{ + struct bio *bio_orig = bio->bi_private; + + if (!bio->bi_error) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool); +} + +static void bounce_end_io_read(struct bio *bio) +{ + __bounce_end_io_read(bio, page_pool); +} + +static void bounce_end_io_read_isa(struct bio *bio) +{ + __bounce_end_io_read(bio, isa_page_pool); +} + +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct bio *bio; + int rw = bio_data_dir(*bio_orig); + struct bio_vec *to, from; + struct bvec_iter iter; + unsigned i; + + bio_for_each_segment(from, *bio_orig, iter) + if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) + goto bounce; + + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + + bio_for_each_segment_all(to, bio, i) { + struct page *page = to->bv_page; + + if (page_to_pfn(page) <= queue_bounce_pfn(q)) + continue; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + inc_zone_page_state(to->bv_page, NR_BOUNCE); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(page); + + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap_atomic(vfrom); + } + } + + trace_block_bio_bounce(q, *bio_orig); + + bio->bi_flags |= (1 << BIO_BOUNCED); + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * Data-less bio, nothing to bounce + */ + if (!bio_has_data(*bio_orig)) + return; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (queue_bounce_pfn(q) >= blk_max_pfn) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/bsg.c b/block/bsg.c index 420a5a9f1b23..d214e929ce18 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; } -static int bsg_io_schedule(struct bsg_device *bd) -{ - DEFINE_WAIT(wait); - int ret = 0; - - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no - * work to do", even though we return -ENOSPC after this same test - * during bsg_write() -- there, it means our buffer can't have more - * bsg_commands added to it, thus has no space left. - */ - if (bd->done_cmds == bd->queued_cmds) { - ret = -ENODATA; - goto unlock; - } - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - ret = -EAGAIN; - goto unlock; - } - - prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bd->lock); - io_schedule(); - finish_wait(&bd->wq_done, &wait); - - return ret; -unlock: - spin_unlock_irq(&bd->lock); - return ret; -} - static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, fmode_t has_write_perm) @@ -196,7 +160,6 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->request_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -271,8 +234,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, * map scatter-gather elements separately and string them to request */ rq = blk_get_request(q, rw, GFP_KERNEL); - if (!rq) - return ERR_PTR(-ENOMEM); + if (IS_ERR(rq)) + return rq; + blk_rq_set_block_pc(rq); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; @@ -284,8 +249,9 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, } next_rq = blk_get_request(q, READ, GFP_KERNEL); - if (!next_rq) { - ret = -ENOMEM; + if (IS_ERR(next_rq)) { + ret = PTR_ERR(next_rq); + next_rq = NULL; goto out; } rq->next_rq = next_rq; @@ -480,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } +static bool bsg_complete(struct bsg_device *bd) +{ + bool ret = false; + bool spin; + + do { + spin_lock_irq(&bd->lock); + + BUG_ON(bd->done_cmds > bd->queued_cmds); + + /* + * All commands consumed. + */ + if (bd->done_cmds == bd->queued_cmds) + ret = true; + + spin = !test_bit(BSG_F_BLOCK, &bd->flags); + + spin_unlock_irq(&bd->lock); + } while (!ret && spin); + + return ret; +} + static int bsg_complete_all_commands(struct bsg_device *bd) { struct bsg_command *bc; @@ -490,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) /* * wait for all commands to complete */ - ret = 0; - do { - ret = bsg_io_schedule(bd); - /* - * look for -ENODATA specifically -- we'll sometimes get - * -ERESTARTSYS when we've taken a signal, but we can't - * return until we're done freeing the queue, so ignore - * it. The signal will get handled when we're done freeing - * the bsg_device. - */ - } while (ret != -ENODATA); + io_wait_event(bd->wq_done, bsg_complete(bd)); /* * discard done commands @@ -1008,7 +988,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!q->request_fn) + if (!queue_is_rq_based(q)) return 0; bcd = &q->bsg_dev; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e0985f1955e7..1f9093e901da 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" /* * tunables @@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool; #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +/* blkio-related constants */ +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 + struct cfq_ttime { unsigned long last_end_request; @@ -172,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -184,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -212,6 +211,15 @@ struct cfqg_stats { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; +/* Per-cgroup data */ +struct cfq_group_data { + /* must be the first member */ + struct blkcg_policy_data cpd; + + unsigned int weight; + unsigned int leaf_weight; +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ @@ -290,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -299,7 +311,7 @@ struct cfq_io_cq { struct cfq_ttime ttime; int ioprio; /* the current ioprio */ #ifdef CONFIG_CFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ + uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif }; @@ -356,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -387,6 +393,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -446,16 +453,6 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* cfqg stats flags */ @@ -600,6 +597,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static struct cfq_group_data +*cpd_to_cfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + static struct blkcg_policy blkcg_policy_cfq; static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) @@ -607,6 +620,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) +{ + return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); +} + static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; @@ -668,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -693,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -711,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -745,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -770,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -858,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -908,7 +909,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { if (cfqd->busy_queues) { cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_work(&cfqd->unplug_work); } } @@ -1272,15 +1273,22 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) rb_insert_color(&cfqg->rb_node, &st->rb); } +/* + * This has to be called only on activation of cfqg + */ static void cfq_update_group_weight(struct cfq_group *cfqg) { - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } +} + +static void +cfq_update_group_leaf_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); if (cfqg->new_leaf_weight) { cfqg->leaf_weight = cfqg->new_leaf_weight; @@ -1299,7 +1307,12 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - cfq_update_group_weight(cfqg); + /* + * Update leaf_weight. We cannot update weight at this point + * because cfqg might already have been activated and is + * contributing its current weight to the parent's child_weight. + */ + cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); /* @@ -1323,6 +1336,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) */ while ((parent = cfqg_parent(pos))) { if (propagate) { + cfq_update_group_weight(pos); propagate = !parent->nr_active++; parent->children_weight += pos->weight; } @@ -1508,115 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; +} + +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; - cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + + cfqg->weight = cgd->weight; + cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); -} - -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; + cfqg_stats_xfer_dead(cfqg); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. - */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1660,73 +1730,114 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->leaf_weight; + + seq_printf(sf, "%u\n", val); return 0; } -static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf, - bool is_leaf_weight) +static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool on_dfl, bool is_leaf_weight) { - struct blkcg *blkcg = css_to_blkcg(css); + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; + struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + cfqgd = blkcg_to_cfqgd(blkcg); + + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); - return ret; + return ret ?: nbytes; } -static int cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } -static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + struct cfq_group_data *cfqgd; + int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqgd) { + ret = -EINVAL; + goto out; + } if (!is_leaf_weight) - blkcg->cfq_weight = val; + cfqgd->weight = val; else - blkcg->cfq_leaf_weight = val; + cfqgd->leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1735,28 +1846,33 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = cfqgd->leaf_weight; } } +out: spin_unlock_irq(&blkcg->lock); - return 0; + return ret; } static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1776,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1805,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1831,13 +1981,13 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "weight", @@ -1851,7 +2001,7 @@ static struct cftype cfq_blkcg_files[] = { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, - .write_string = cfqg_set_weight_device, + .write = cfqg_set_weight_device, }, { .name = "weight", @@ -1863,7 +2013,7 @@ static struct cftype cfq_blkcg_files[] = { { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "leaf_weight", @@ -1879,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -1921,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -1987,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2792,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3425,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3491,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3533,107 +3718,57 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; - uint64_t id; + struct cfq_queue *cfqq; + uint64_t serial_nr; rcu_read_lock(); - id = bio_blkcg(bio)->id; + serial_nr = bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ - if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); - } - - cic->blkcg_id = id; -} -#else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - cfqq = cic_to_cfqq(cic, is_sync); - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); } - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); + } - rcu_read_unlock(); - return cfqq; + cic->blkcg_serial_nr = serial_nr; } +#else +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3641,30 +3776,53 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { - const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); + int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + if (!ioprio_valid(cic->ioprio)) { + struct task_struct *tsk = current; + ioprio = task_nice_ioprio(tsk); + ioprio_class = task_nice_ioclass(tsk); + } + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4198,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4207,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4313,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4340,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4395,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4408,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. Lose @@ -4454,13 +4595,25 @@ out_free: return ret; } +static void cfq_registered_queue(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + struct cfq_data *cfqd = e->elevator_data; + + /* + * Default to IOPS mode with no idling for SSDs + */ + if (blk_queue_nonrot(q)) + cfqd->cfq_slice_idle = 0; +} + /* * sysfs parts below --> */ static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t @@ -4569,6 +4722,7 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .elevator_registered_fn = cfq_registered_queue, }, .icq_size = sizeof(struct cfq_io_cq), .icq_align = __alignof__(struct cfq_io_cq), @@ -4579,11 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + + .cpd_alloc_fn = cfq_cpd_alloc, + .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index fbd5a67cb773..f678c733df40 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -663,6 +663,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) fmode_t mode = file->f_mode; struct backing_dev_info *bdi; loff_t size; + unsigned int max_sectors; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have @@ -690,6 +691,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKROSET: case BLKDISCARD: case BLKSECDISCARD: + case BLKZEROOUT: /* * the ones below are implemented in blkdev_locked_ioctl, * but we call blkdev_ioctl, which gets the lock for us @@ -707,8 +709,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return compat_put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: /* compatible */ @@ -718,8 +718,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKSSZGET: /* get block device hardware sector size */ return compat_put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: - return compat_put_ushort(arg, - queue_max_sectors(bdev_get_queue(bdev))); + max_sectors = min_t(unsigned int, USHRT_MAX, + queue_max_sectors(bdev_get_queue(bdev))); + return compat_put_ushort(arg, max_sectors); case BLKROTATIONAL: return compat_put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); @@ -728,8 +729,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: diff --git a/block/elevator.c b/block/elevator.c index 1e01b66a0b92..c3555c9c672f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,11 +35,11 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) - goto err; + return NULL; eq->type = e; kobject_init(&eq->kobj, &elv_ktype); @@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, hash_init(eq->hash); return eq; -err: - kfree(eq); - elevator_put(e); - return NULL; } EXPORT_SYMBOL(elevator_alloc); @@ -229,7 +225,9 @@ int elevator_init(struct request_queue *q, char *name) } err = e->ops.elevator_init_fn(q, e); - return 0; + if (err) + elevator_put(e); + return err; } EXPORT_SYMBOL(elevator_init); @@ -422,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ - if (blk_queue_nomerges(q)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* @@ -537,7 +535,7 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, e->type->ops.elevator_bio_merged_fn(q, rq, bio); } -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) @@ -729,26 +727,6 @@ int elv_may_queue(struct request_queue *q, int rw) return ELV_MQUEUE_MAY; } -void elv_abort_queue(struct request_queue *q) -{ - struct request *rq; - - blk_abort_flushes(q); - - while (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - rq->cmd_flags |= REQ_QUIET; - trace_block_rq_abort(q, rq); - /* - * Mark this request as started so we don't trigger - * any debug logic in the end I/O path. - */ - blk_start_request(rq); - __blk_end_request_all(rq, -EIO); - } -} -EXPORT_SYMBOL(elv_abort_queue); - void elv_completed_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; @@ -828,6 +806,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; + if (e->type->ops.elevator_registered_fn) + e->type->ops.elevator_registered_fn(q); } return error; } diff --git a/block/genhd.c b/block/genhd.c index 791f41943132..9f42526b4d62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,6 +8,7 @@ #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> @@ -19,6 +20,7 @@ #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -28,10 +30,10 @@ struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) -/* For extended devt allocation. ext_devt_mutex prevents look up +/* For extended devt allocation. ext_devt_lock prevents look up * results from going away underneath its user. */ -static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; @@ -420,9 +422,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - mutex_lock(&ext_devt_mutex); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); - mutex_unlock(&ext_devt_mutex); + idr_preload(GFP_KERNEL); + + spin_lock_bh(&ext_devt_lock); + idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); + spin_unlock_bh(&ext_devt_lock); + + idr_preload_end(); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -441,15 +447,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - might_sleep(); - if (devt == MKDEV(0, 0)) return; if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(&ext_devt_mutex); + spin_lock_bh(&ext_devt_lock); idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(&ext_devt_mutex); + spin_unlock_bh(&ext_devt_lock); } } @@ -627,6 +631,7 @@ void add_disk(struct gendisk *disk) WARN_ON(retval); disk_add_events(disk); + blk_integrity_add(disk); } EXPORT_SYMBOL(add_disk); @@ -635,6 +640,7 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + blk_integrity_del(disk); disk_del_events(disk); /* invalidate stuff */ @@ -651,7 +657,6 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); @@ -660,15 +665,38 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)->devt); } EXPORT_SYMBOL(del_gendisk); +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -690,13 +718,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } else { struct hd_struct *part; - mutex_lock(&ext_devt_mutex); + spin_lock_bh(&ext_devt_lock); part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); } - mutex_unlock(&ext_devt_mutex); + spin_unlock_bh(&ext_devt_lock); } return disk; @@ -987,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1008,6 +1038,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1069,9 +1100,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. + */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) @@ -1098,11 +1136,11 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - free_part_stats(&disk->part0); - free_part_info(&disk->part0); + hd_free_part(&disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1276,7 +1314,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id) * converted to make use of bd_mutex and sequence counters. */ seqcount_init(&disk->part0.nr_sects_seq); - hd_ref_init(&disk->part0); + if (hd_ref_init(&disk->part0)) { + hd_free_part(&disk->part0); + kfree(disk); + return NULL; + } disk->minors = minors; rand_initialize_disk(disk); @@ -1407,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { @@ -1543,7 +1585,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) /** * disk_clear_events - synchronously check, clear and return pending events * @disk: disk to fetch and clear events from - * @mask: mask of events to be fetched and clearted + * @mask: mask of events to be fetched and cleared * * Disk events are synchronously checked and pending events in @mask * are cleared and returned. This ignores the block count. diff --git a/block/ioctl.c b/block/ioctl.c index 7d5c3b20af45..77f5d17779d6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,9 +4,11 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> +#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> +#include <linux/pr.h> #include <asm/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -150,26 +152,63 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; - res = rescan_partitions(disk, bdev); + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). + * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + + mutex_lock(&bdev->bd_mutex); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); -static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, - uint64_t len, int secure) +static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, + unsigned long arg, unsigned long flags) { - unsigned long flags = 0; + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; if (start & 511) return -EINVAL; @@ -180,14 +219,24 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - if (secure) - flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } -static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, - uint64_t len) +static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, + unsigned long arg) { + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if (start & 511) return -EINVAL; if (len & 511) @@ -198,7 +247,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); + return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); } static int put_ushort(unsigned long arg, unsigned short val) @@ -248,6 +297,96 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); +static int blkdev_pr_register(struct block_device *bdev, + struct pr_registration __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_registration reg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_register) + return -EOPNOTSUPP; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); +} + +static int blkdev_pr_reserve(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_reserve) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); +} + +static int blkdev_pr_release(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_release) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags) + return -EOPNOTSUPP; + return ops->pr_release(bdev, rsv.key, rsv.type); +} + +static int blkdev_pr_preempt(struct block_device *bdev, + struct pr_preempt __user *arg, bool abort) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_preempt p; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_preempt) + return -EOPNOTSUPP; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + + if (p.flags) + return -EOPNOTSUPP; + return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); +} + +static int blkdev_pr_clear(struct block_device *bdev, + struct pr_clear __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_clear c; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_clear) + return -EOPNOTSUPP; + if (copy_from_user(&c, arg, sizeof(c))) + return -EFAULT; + + if (c.flags) + return -EOPNOTSUPP; + return ops->pr_clear(bdev, c.key); +} + /* * Is it an unrecognized ioctl? The correct returns are either * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a @@ -268,95 +407,185 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -/* - * always keep this in sync with compat_blkdev_ioctl() - */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - struct backing_dev_info *bdi; - loff_t size; - int ret, n; - switch(cmd) { - case BLKFLSBUF: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; +static int blkdev_daxset(struct block_device *bdev, unsigned long argp) +{ + unsigned long arg; + int rc = 0; - fsync_bdev(bdev); - invalidate_bdev(bdev); - return 0; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; - case BLKROSET: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (get_user(n, (int __user *)(arg))) - return -EFAULT; - set_device_ro(bdev, n); + if (get_user(arg, (int __user *)(argp))) + return -EFAULT; + arg = !!arg; + if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) return 0; - case BLKDISCARD: - case BLKSECDISCARD: { - uint64_t range[2]; + if (arg) + arg = S_DAX; - if (!(mode & FMODE_WRITE)) - return -EBADF; + if (arg && !blkdev_dax_capable(bdev)) + return -ENOTTY; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; + inode_lock(bdev->bd_inode); + if (bdev->bd_map_count == 0) + inode_set_flags(bdev->bd_inode, arg, S_DAX); + else + rc = -EBUSY; + inode_unlock(bdev->bd_inode); + return rc; +} +#else +static int blkdev_daxset(struct block_device *bdev, int arg) +{ + if (arg) + return -ENOTTY; + return 0; +} +#endif - return blk_ioctl_discard(bdev, range[0], range[1], - cmd == BLKSECDISCARD); - } - case BLKZEROOUT: { - uint64_t range[2]; +static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret; - if (!(mode & FMODE_WRITE)) - return -EBADF; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; - return blk_ioctl_zeroout(bdev, range[0], range[1]); - } + fsync_bdev(bdev); + invalidate_bdev(bdev); + return 0; +} - case HDIO_GETGEO: { - struct hd_geometry geo; +static int blkdev_roset(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret, n; - if (!arg) - return -EINVAL; - if (!disk->fops->getgeo) - return -ENOTTY; - - /* - * We need to set the startsect first, the driver may - * want to override it. - */ - memset(&geo, 0, sizeof(geo)); - geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); - if (ret) - return ret; - if (copy_to_user((struct hd_geometry __user *)arg, &geo, - sizeof(geo))) - return -EFAULT; - return 0; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (get_user(n, (int __user *)arg)) + return -EFAULT; + set_device_ro(bdev, n); + return 0; +} + +static int blkdev_getgeo(struct block_device *bdev, + struct hd_geometry __user *argp) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; + + if (!argp) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + memset(&geo, 0, sizeof(geo)); + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + if (copy_to_user(argp, &geo, sizeof(geo))) + return -EFAULT; + return 0; +} + +/* set the logical block size */ +static int blkdev_bszset(struct block_device *bdev, fmode_t mode, + int __user *argp) +{ + int ret, n; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!argp) + return -EINVAL; + if (get_user(n, argp)) + return -EFAULT; + + if (!(mode & FMODE_EXCL)) { + bdgrab(bdev); + if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) + return -EBUSY; } + + ret = set_blocksize(bdev, n); + if (!(mode & FMODE_EXCL)) + blkdev_put(bdev, mode | FMODE_EXCL); + return ret; +} + +/* + * always keep this in sync with compat_blkdev_ioctl() + */ +int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, + unsigned long arg) +{ + struct backing_dev_info *bdi; + void __user *argp = (void __user *)arg; + loff_t size; + unsigned int max_sectors; + + switch (cmd) { + case BLKFLSBUF: + return blkdev_flushbuf(bdev, mode, cmd, arg); + case BLKROSET: + return blkdev_roset(bdev, mode, cmd, arg); + case BLKDISCARD: + return blk_ioctl_discard(bdev, mode, arg, 0); + case BLKSECDISCARD: + return blk_ioctl_discard(bdev, mode, arg, + BLKDEV_DISCARD_SECURE); + case BLKZEROOUT: + return blk_ioctl_zeroout(bdev, mode, arg); + case HDIO_GETGEO: + return blkdev_getgeo(bdev, argp); case BLKRAGET: case BLKFRAGET: if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); @@ -375,7 +604,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKDISCARDZEROES: return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: - return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); + max_sectors = min_t(unsigned int, USHRT_MAX, + queue_max_sectors(bdev_get_queue(bdev))); + return put_ushort(arg, max_sectors); case BLKROTATIONAL: return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); case BLKRASET: @@ -383,33 +614,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: - /* set the logical block size */ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!arg) - return -EINVAL; - if (get_user(n, (int __user *) arg)) - return -EFAULT; - if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; - } - ret = set_blocksize(bdev, n); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); - return ret; + return blkdev_bszset(bdev, mode, argp); case BLKPG: - ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); - break; + return blkpg_ioctl(bdev, argp); case BLKRRPART: - ret = blkdev_reread_part(bdev); - break; + return blkdev_reread_part(bdev); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -421,11 +633,26 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESTOP: case BLKTRACESETUP: case BLKTRACETEARDOWN: - ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg); + return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXSET: + return blkdev_daxset(bdev, arg); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); break; + case IOC_PR_REGISTER: + return blkdev_pr_register(bdev, argp); + case IOC_PR_RESERVE: + return blkdev_pr_reserve(bdev, argp); + case IOC_PR_RELEASE: + return blkdev_pr_release(bdev, argp); + case IOC_PR_PREEMPT: + return blkdev_pr_preempt(bdev, argp, false); + case IOC_PR_PREEMPT_ABORT: + return blkdev_pr_preempt(bdev, argp, true); + case IOC_PR_CLEAR: + return blkdev_pr_clear(bdev, argp); default: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + return __blkdev_driver_ioctl(bdev, mode, cmd, arg); } - return ret; } EXPORT_SYMBOL_GPL(blkdev_ioctl); diff --git a/block/ioprio.c b/block/ioprio.c new file mode 100644 index 000000000000..cc7800e9eb44 --- /dev/null +++ b/block/ioprio.c @@ -0,0 +1,245 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/ioprio.h> +#include <linux/blkdev.h> +#include <linux/capability.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/pid_namespace.h> + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret; + + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + case IOPRIO_CLASS_NONE: + if (data) + return -EINVAL; + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!uid_valid(uid)) + break; + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + do_each_thread(g, p) { + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; + } while_each_thread(g, p); +free_uid: + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} + +static int get_task_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + if (p->io_context) + ret = p->io_context->ioprio; +out: + return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass; + unsigned short bclass; + + if (!ioprio_valid(aprio)) + aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + if (!ioprio_valid(bprio)) + bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + + aclass = IOPRIO_PRIO_CLASS(aprio); + bclass = IOPRIO_PRIO_CLASS(bprio); + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + +SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret = -ESRCH; + int tmpio; + + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = get_task_ioprio(p); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + do_each_thread(g, p) { + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) + continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3de89d4690f3..a163c487cf38 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { struct noop_data *nd = q->elevator->elevator_data; + struct request *rq; - if (!list_empty(&nd->queue)) { - struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); + if (rq) { list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq) if (rq->queuelist.prev == &nd->queue) return NULL; - return list_entry(rq->queuelist.prev, struct request, queuelist); + return list_prev_entry(rq, queuelist); } static struct request * @@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq) if (rq->queuelist.next == &nd->queue) return NULL; - return list_entry(rq->queuelist.next, struct request, queuelist); + return list_next_entry(rq, queuelist); } static int noop_init_queue(struct request_queue *q, struct elevator_type *e) diff --git a/block/partition-generic.c b/block/partition-generic.c index 789cdea05893..746935a5973c 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -211,8 +211,8 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); - free_part_stats(p); - free_part_info(p); + blk_free_devt(dev->devt); + hd_free_part(p); kfree(p); } @@ -232,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct hd_struct *part) +void __delete_partition(struct percpu_ref *ref) { + struct hd_struct *part = container_of(ref, struct hd_struct, ref); call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -253,9 +254,8 @@ void delete_partition(struct gendisk *disk, int partno) rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); - blk_free_devt(part_devt(part)); - hd_struct_put(part); + hd_struct_kill(part); } static ssize_t whole_disk_show(struct device *dev, @@ -356,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - hd_ref_init(p); - return p; + if (!hd_ref_init(p)) + return p; out_free_info: free_part_info(p); @@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) struct hd_struct *part; int res; - if (bdev->bd_part_count) + if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); if (res) @@ -428,6 +428,7 @@ rescan: if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + blk_integrity_revalidate(disk); check_disk_size_change(disk, bdev); bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 43be471d9b1d..f3ed7b2d89bf 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -215,7 +215,7 @@ int aix_partition(struct parsed_partitions *state) numlvs = be16_to_cpu(p->numlvs); put_dev_sector(sect); } - lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL); if (!lvip) return 0; if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { @@ -253,7 +253,7 @@ int aix_partition(struct parsed_partitions *state) continue; } lv_ix = be16_to_cpu(p->lv_ix) - 1; - if (lv_ix > state->limit) { + if (lv_ix >= state->limit) { cur_lv_ix = -1; continue; } diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 70cbf44a1560..2b13533d60a2 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -7,6 +7,8 @@ * Re-organised Feb 1998 Russell King */ +#define pr_fmt(fmt) fmt + #include <linux/types.h> #include <linux/affs_hardblocks.h> @@ -40,7 +42,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) - printk("Dev %s: unable to read RDB block %d\n", + pr_err("Dev %s: unable to read RDB block %d\n", bdevname(state->bdev, b), blk); res = -1; goto rdb_done; @@ -57,12 +59,12 @@ int amiga_partition(struct parsed_partitions *state) *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - printk("Warning: Trashed word at 0xd0 in block %d " - "ignored in checksum calculation\n",blk); + pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n", + blk); break; } - printk("Dev %s: RDB in block %d has bad checksum\n", + pr_err("Dev %s: RDB in block %d has bad checksum\n", bdevname(state->bdev, b), blk); } @@ -83,7 +85,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) - printk("Dev %s: unable to read partition block %d\n", + pr_err("Dev %s: unable to read partition block %d\n", bdevname(state->bdev, b), blk); res = -1; goto rdb_done; diff --git a/block/partitions/check.c b/block/partitions/check.c index 9ac1df74f699..16118d11dbfc 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -184,12 +184,12 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; - if (!res) - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); - else if (warn_no_part) - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); - - printk(KERN_INFO "%s", state->pp_buf); + if (res) { + if (warn_no_part) + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } free_page((unsigned long)state->pp_buf); free_partitions(state); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index dc51f467a560..26cb624ace05 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -121,7 +121,7 @@ __setup("gpt", force_gpt_fn); /** * efi_crc32() - EFI version of crc32 function * @buf: buffer to calculate crc32 of - * @len - length of buf + * @len: length of buf * * Description: Returns EFI-style CRC32 value for @buf * @@ -240,10 +240,10 @@ done: /** * read_lba(): Read bytes from disk, starting at given LBA - * @state - * @lba - * @buffer - * @size_t + * @state: disk parsed partitions + * @lba: the Logical Block Address of the partition table + * @buffer: destination buffer + * @count: bytes to read * * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. @@ -277,8 +277,8 @@ static size_t read_lba(struct parsed_partitions *state, /** * alloc_read_gpt_entries(): reads partition entries from disk - * @state - * @gpt - GPT header + * @state: disk parsed partitions + * @gpt: GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. @@ -312,8 +312,8 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @state - * @lba is the Logical Block Address of the partition table + * @state: disk parsed partitions + * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates * and fills a GPT header starting at @ from @state->bdev. @@ -340,10 +340,10 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, /** * is_gpt_valid() - tests one GPT header and PTEs for validity - * @state - * @lba is the logical block address of the GPT header to test - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. + * @state: disk parsed partitions + * @lba: logical block address of the GPT header to test + * @gpt: GPT header ptr, filled on return. + * @ptes: PTEs ptr, filled on return. * * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. @@ -461,8 +461,8 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /** * is_pte_valid() - tests one PTE for validity - * @pte is the pte to check - * @lastlba is last lba of the disk + * @pte:pte to check + * @lastlba: last lba of the disk * * Description: returns 1 if valid, 0 on error. */ @@ -478,9 +478,10 @@ is_pte_valid(const gpt_entry *pte, const u64 lastlba) /** * compare_gpts() - Search disk for valid GPT headers and PTEs - * @pgpt is the primary GPT header - * @agpt is the alternate GPT header - * @lastlba is the last LBA number + * @pgpt: primary GPT header + * @agpt: alternate GPT header + * @lastlba: last LBA number + * * Description: Returns nothing. Sanity checks pgpt and agpt fields * and prints warnings on discrepancies. * @@ -572,9 +573,10 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @state - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. + * @state: disk parsed partitions + * @gpt: GPT header ptr, filled on return. + * @ptes: PTEs ptr, filled on return. + * * Description: Returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. * Validity depends on PMBR being valid (or being overridden by the @@ -663,7 +665,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, /** * efi_partition(struct parsed_partitions *state) - * @state + * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. @@ -713,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; - efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); + efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, diff --git a/block/partitions/mac.c b/block/partitions/mac.c index 76d8ba6379a9..621317ac4d59 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state) Sector sect; unsigned char *data; int slot, blocks_in_map; - unsigned secsize; + unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; @@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_part_sector(state, secsize/512, §); + datasize = round_down(secsize, 512); + data = read_part_sector(state, datasize / 512, §); if (!data) return -1; - part = (struct mac_partition *) (data + secsize%512); + partoffset = secsize % 512; + if (partoffset + sizeof(*part) > datasize) + return -1; + part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ @@ -81,7 +85,7 @@ int mac_partition(struct parsed_partitions *state) be32_to_cpu(part->start_block) * (secsize/512), be32_to_cpu(part->block_count) * (secsize/512)); - if (!strnicmp(part->type, "Linux_RAID", 10)) + if (!strncasecmp(part->type, "Linux_RAID", 10)) state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* @@ -100,7 +104,7 @@ int mac_partition(struct parsed_partitions *state) goodness++; if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 - || (strnicmp(part->type, "Linux", 5) == 0 + || (strncasecmp(part->type, "Linux", 5) == 0 && strcasecmp(part->type, "Linux_swap") != 0)) { int i, l; @@ -109,13 +113,13 @@ int mac_partition(struct parsed_partitions *state) if (strcmp(part->name, "/") == 0) goodness++; for (i = 0; i <= l - 4; ++i) { - if (strnicmp(part->name + i, "root", + if (strncasecmp(part->name + i, "root", 4) == 0) { goodness += 2; break; } } - if (strnicmp(part->name, "swap", 4) == 0) + if (strncasecmp(part->name, "swap", 4) == 0) goodness--; } diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 9123f250b425..93e7c1b32edd 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -159,8 +159,9 @@ static void parse_extended(struct parsed_partitions *state, /* * First process the data partition(s) */ - for (i=0; i<4; i++, p++) { + for (i = 0; i < 4; i++, p++) { sector_t offs, size, next; + if (!nr_sects(p) || is_extended_partition(p)) continue; @@ -194,7 +195,7 @@ static void parse_extended(struct parsed_partitions *state, * It should be a link to the next logical partition. */ p -= 4; - for (i=0; i<4; i++, p++) + for (i = 0; i < 4; i++, p++) if (nr_sects(p) && is_extended_partition(p)) break; if (i == 4) @@ -243,8 +244,8 @@ static void parse_solaris_x86(struct parsed_partitions *state, return; } /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ - max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; - for (i=0; i<max_nparts && state->next<state->limit; i++) { + max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; + for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; char tmp[3 + 10 + 1 + 1]; @@ -409,7 +410,7 @@ static void parse_minix(struct parsed_partitions *state, /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or * the normal boot sector. */ - if (msdos_magic_present (data + 510) && + if (msdos_magic_present(data + 510) && SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; @@ -527,6 +528,7 @@ int msdos_partition(struct parsed_partitions *state) for (slot = 1 ; slot <= 4 ; slot++, p++) { sector_t start = start_sect(p)*sector_size; sector_t size = nr_sects(p)*sector_size; + if (!size) continue; if (is_extended_partition(p)) { @@ -537,6 +539,7 @@ int msdos_partition(struct parsed_partitions *state) * sector, although it may not be enough/proper. */ sector_t n = 2; + n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 26487972ac54..0774799942e0 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -82,9 +82,18 @@ static int sg_set_timeout(struct request_queue *q, int __user *p) return err; } +static int max_sectors_bytes(struct request_queue *q) +{ + unsigned int max_sectors = queue_max_sectors(q); + + max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); + + return max_sectors << 9; +} + static int sg_get_reserved_size(struct request_queue *q, int __user *p) { - unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9); + int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); return put_user(val, p); } @@ -98,10 +107,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p) if (size < 0) return -EINVAL; - if (size > (queue_max_sectors(q) << 9)) - size = queue_max_sectors(q) << 9; - q->sg_reserved_size = size; + q->sg_reserved_size = min(size, max_sectors_bytes(q)); return 0; } @@ -135,7 +142,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_VERIFY_10, filter->read_ok); __set_bit(VERIFY_16, filter->read_ok); __set_bit(REPORT_LUNS, filter->read_ok); - __set_bit(SERVICE_ACTION_IN, filter->read_ok); + __set_bit(SERVICE_ACTION_IN_16, filter->read_ok); __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); __set_bit(MAINTENANCE_IN, filter->read_ok); __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); @@ -205,10 +212,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) if (capable(CAP_SYS_RAWIO)) return 0; - /* if there's no filter set, assume we're filtering everything out */ - if (!filter) - return -EPERM; - /* Anybody who can open the device can do a read-safe command */ if (test_bit(cmd[0], filter->read_ok)) return 0; @@ -233,7 +236,6 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, * fill in request structure */ rq->cmd_len = hdr->cmd_len; - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) @@ -277,7 +279,6 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, r = blk_rq_unmap_user(bio); if (!ret) ret = r; - blk_put_request(rq); return ret; } @@ -288,14 +289,13 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, unsigned long start_time; ssize_t ret = 0; int writing = 0; + int at_head = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; if (hdr->interface_id != 'S') return -EINVAL; - if (hdr->cmd_len > BLK_MAX_CDB) - return -EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; @@ -311,48 +311,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, case SG_DXFER_FROM_DEV: break; } + if (hdr->flags & SG_FLAG_Q_AT_HEAD) + at_head = 1; + ret = -ENOMEM; rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); - if (!rq) - return -ENOMEM; - - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { - blk_put_request(rq); - return -EFAULT; + if (IS_ERR(rq)) + return PTR_ERR(rq); + blk_rq_set_block_pc(rq); + + if (hdr->cmd_len > BLK_MAX_CDB) { + rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!rq->cmd) + goto out_put_request; } + ret = blk_fill_sghdr_rq(q, rq, hdr, mode); + if (ret < 0) + goto out_free_cdb; + + ret = 0; if (hdr->iovec_count) { - size_t iov_data_len; + struct iov_iter i; struct iovec *iov = NULL; - ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, - 0, NULL, &iov); - if (ret < 0) { - kfree(iov); - goto out; - } - - iov_data_len = ret; - ret = 0; + ret = import_iovec(rq_data_dir(rq), + hdr->dxferp, hdr->iovec_count, + 0, &iov, &i); + if (ret < 0) + goto out_free_cdb; /* SG_IO howto says that the shorter of the two wins */ - if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten(iov, - hdr->iovec_count, - hdr->dxfer_len); - iov_data_len = hdr->dxfer_len; - } + iov_iter_truncate(&i, hdr->dxfer_len); - ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, - hdr->iovec_count, - iov_data_len, GFP_KERNEL); + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); kfree(iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); if (ret) - goto out; + goto out_free_cdb; bio = rq->bio; memset(sense, 0, sizeof(sense)); @@ -366,12 +365,16 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, * (if he doesn't check that is his problem). * N.B. a non-zero SCSI status is _not_ necessarily an error. */ - blk_execute_rq(q, bd_disk, rq, 0); + blk_execute_rq(q, bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); - return blk_complete_sghdr_rq(rq, hdr, bio); -out: + ret = blk_complete_sghdr_rq(rq, hdr, bio); + +out_free_cdb: + if (rq->cmd != rq->__cmd) + kfree(rq->cmd); +out_put_request: blk_put_request(rq); return ret; } @@ -441,7 +444,12 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto error_free_buffer; + } + blk_rq_set_block_pc(rq); cmdlen = COMMAND_SIZE(opcode); @@ -487,19 +495,17 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; - goto out; + goto error; } memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - rq->cmd_type = REQ_TYPE_BLOCK_PC; blk_execute_rq(q, disk, rq, 0); -out: err = rq->errors & 0xff; /* only 8 bit SCSI status */ if (err) { if (rq->sense_len && rq->sense) { @@ -514,8 +520,11 @@ out: } error: - kfree(buffer); blk_put_request(rq); + +error_free_buffer: + kfree(buffer); + return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl); @@ -527,8 +536,10 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); + if (IS_ERR(rq)) + return PTR_ERR(rq); + blk_rq_set_block_pc(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; diff --git a/block/t10-pi.c b/block/t10-pi.c new file mode 100644 index 000000000000..2c97912335a9 --- /dev/null +++ b/block/t10-pi.c @@ -0,0 +1,189 @@ +/* + * t10_pi.c - Functions for generating and verifying T10 Protection + * Information. + * + * Copyright (C) 2007, 2008, 2014 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/t10-pi.h> +#include <linux/blkdev.h> +#include <linux/crc-t10dif.h> +#include <net/checksum.h> + +typedef __be16 (csum_fn) (void *, unsigned int); + +static const __be16 APP_ESCAPE = (__force __be16) 0xffff; +static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff; + +static __be16 t10_pi_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} + +static __be16 t10_pi_ip_fn(void *data, unsigned int len) +{ + return (__force __be16)ip_compute_csum(data, len); +} + +/* + * Type 1 and Type 2 protection use the same format: 16 bit guard tag, + * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref + * tag. + */ +static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, + unsigned int type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + + pi->guard_tag = fn(iter->data_buf, iter->interval); + pi->app_tag = 0; + + if (type == 1) + pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); + else + pi->ref_tag = 0; + + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return 0; +} + +static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, + unsigned int type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + __be16 csum; + + switch (type) { + case 1: + case 2: + if (pi->app_tag == APP_ESCAPE) + goto next; + + if (be32_to_cpu(pi->ref_tag) != + lower_32_bits(iter->seed)) { + pr_err("%s: ref tag error at location %llu " \ + "(rcvd %u)\n", iter->disk_name, + (unsigned long long) + iter->seed, be32_to_cpu(pi->ref_tag)); + return -EILSEQ; + } + break; + case 3: + if (pi->app_tag == APP_ESCAPE && + pi->ref_tag == REF_ESCAPE) + goto next; + break; + } + + csum = fn(iter->data_buf, iter->interval); + + if (pi->guard_tag != csum) { + pr_err("%s: guard tag error at sector %llu " \ + "(rcvd %04x, want %04x)\n", iter->disk_name, + (unsigned long long)iter->seed, + be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); + return -EILSEQ; + } + +next: + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return 0; +} + +static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, 1); +} + +static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, 1); +} + +static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, 1); +} + +static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, 1); +} + +static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, 3); +} + +static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, 3); +} + +static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, 3); +} + +static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, 3); +} + +struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); |