Diffstat (limited to 'block')
-rw-r--r--  block/Makefile                2
-rw-r--r--  block/badblocks.c           585
-rw-r--r--  block/blk-cgroup.c            6
-rw-r--r--  block/blk-core.c             53
-rw-r--r--  block/blk-merge.c            35
-rw-r--r--  block/blk-mq.c               14
-rw-r--r--  block/blk-settings.c         36
-rw-r--r--  block/blk-sysfs.c             3
-rw-r--r--  block/blk-timeout.c           8
-rw-r--r--  block/blk.h                   2
-rw-r--r--  block/genhd.c                30
-rw-r--r--  block/ioctl.c                71
-rw-r--r--  block/noop-iosched.c         10
-rw-r--r--  block/partition-generic.c     2
-rw-r--r--  block/partitions/mac.c       10
15 files changed, 802 insertions(+), 65 deletions(-)
diff --git a/block/Makefile b/block/Makefile
index 00ecc97629db..db5f622c9d67 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
+ badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
diff --git a/block/badblocks.c b/block/badblocks.c
new file mode 100644
index 000000000000..7be53cb1cc3c
--- /dev/null
+++ b/block/badblocks.c
@@ -0,0 +1,585 @@
+/*
+ * Bad block management
+ *
+ * - Heavily based on MD badblocks code from Neil Brown
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/badblocks.h>
+#include <linux/seqlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad blocks which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo;
+ u64 *p = bb->page;
+ int rv;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ lo = 0;
+ rv = 0;
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not.
+ */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all ranges that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
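
As an illustrative aside (not part of the patch), a minimal sketch of how a hypothetical caller might consult the table before issuing I/O; the 64-bit entry encoding described in the comment above is internal, and callers only deal in sectors. The function name my_range_is_clean is invented.

static bool my_range_is_clean(struct badblocks *bb, sector_t s, int sectors)
{
	sector_t first_bad;
	int bad_sectors;

	/* 0: no known bad blocks, 1: bad but acknowledged, -1: unacknowledged */
	return badblocks_check(bb, s, sectors, &first_bad, &bad_sectors) == 0;
}
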
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: whether to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 0;
+ unsigned long flags;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irqsave(&bb->lock, flags);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range
+ */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi'
+ */
+ if (bb->count >= MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 1;
+ break;
+ } else {
+ int this_sectors = sectors;
+
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irqrestore(&bb->lock, flags);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
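
Another hedged sketch (hypothetical caller, not from the patch): recording a media error reported by hardware as an unacknowledged range. A return value of 1 from badblocks_set() means the table was full and the range was not stored.

static void my_record_media_error(struct gendisk *disk, sector_t s, int sectors)
{
	if (badblocks_set(disk->bb, s, sectors, 0 /* not acknowledged */))
		pr_warn("badblocks: table full, %d sectors at %llu not recorded\n",
			sectors, (unsigned long long)s);
}
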
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks from the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to clear
+ * @sectors: number of sectors to clear
+ *
+ * This may involve extending the table if we split a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * -ENOSPC: failed to clear badblocks (no room to split an existing range)
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better to think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MAX_BADBLOCKS) {
+ rv = -ENOSPC;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now p[lo] doesn't overlap, so we can stop */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_clear);
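
A matching sketch under the same assumptions: after successfully rewriting a previously-bad region, the hypothetical driver drops it from the table. As the code above shows, the only failure mode is needing to split a range when the table is already full.

static void my_mark_repaired(struct gendisk *disk, sector_t s, int sectors)
{
	if (badblocks_clear(disk->bb, s, sectors))	/* -ENOSPC on a full table */
		pr_warn("badblocks: could not clear %d sectors at %llu\n",
			sectors, (unsigned long long)s);
}
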
+
+/**
+ * ack_all_badblocks() - Acknowledge all bad blocks in a list.
+ * @bb: the badblocks structure that holds all badblock information
+ *
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates
+ */
+void ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0 && bb->unacked_exist) {
+ u64 *p = bb->page;
+ int i;
+
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(ack_all_badblocks);
+
+/**
+ * badblocks_show() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @unack: whether to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of returned data
+ */
+ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_show);
+
+/**
+ * badblocks_store() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @len: length of data received from sysfs
+ * @unack: whether to store the new badblocks as unacknowledged
+ *
+ * Return:
+ * Length of the buffer processed or -ve error.
+ */
+ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
+ int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (badblocks_set(bb, sector, length, !unack))
+ return -ENOSPC;
+ else
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_store);
+
+static int __badblocks_init(struct device *dev, struct badblocks *bb,
+ int enable)
+{
+ bb->dev = dev;
+ bb->count = 0;
+ if (enable)
+ bb->shift = 0;
+ else
+ bb->shift = -1;
+ if (dev)
+ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
+ else
+ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!bb->page) {
+ bb->shift = -1;
+ return -ENOMEM;
+ }
+ seqlock_init(&bb->lock);
+
+ return 0;
+}
+
+/**
+ * badblocks_init() - initialize the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ * @enable: whether to enable badblocks accounting
+ *
+ * Return:
+ * 0: success
+ * -ve errno: on error
+ */
+int badblocks_init(struct badblocks *bb, int enable)
+{
+ return __badblocks_init(NULL, bb, enable);
+}
+EXPORT_SYMBOL_GPL(badblocks_init);
+
+int devm_init_badblocks(struct device *dev, struct badblocks *bb)
+{
+ if (!bb)
+ return -EINVAL;
+ return __badblocks_init(dev, bb, 1);
+}
+EXPORT_SYMBOL_GPL(devm_init_badblocks);
+
+/**
+ * badblocks_exit() - free the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ */
+void badblocks_exit(struct badblocks *bb)
+{
+ if (!bb)
+ return;
+ if (bb->dev)
+ devm_kfree(bb->dev, bb->page);
+ else
+ kfree(bb->page);
+ bb->page = NULL;
+}
+EXPORT_SYMBOL_GPL(badblocks_exit);
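
To tie the new API together, a lifecycle sketch under stated assumptions: a caller without a struct device pairs badblocks_init() with badblocks_exit(), while one with a device can use devm_init_badblocks() and let devres release the page. All names prefixed my_ are invented for illustration.

static struct badblocks my_bb;			/* hypothetical instance */

static int my_setup(void)
{
	int err = badblocks_init(&my_bb, 1);	/* 1 = enable accounting */

	if (err)
		return err;
	/* ... publish &my_bb, e.g. via disk->bb, so sysfs can see it ... */
	return 0;
}

static void my_teardown(void)
{
	badblocks_exit(&my_bb);			/* frees bb->page */
}
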
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5bcdfc10c23a..5a37188b559f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkcg_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
+ struct cgroup_subsys_state *dst_css;
struct io_context *ioc;
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, tset) {
+ cgroup_taskset_for_each(task, dst_css, tset) {
task_lock(task);
ioc = task->io_context;
if (ioc && atomic_read(&ioc->nr_tasks) > 1)
diff --git a/block/blk-core.c b/block/blk-core.c
index 5131993b23a1..33e2f62d5062 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -207,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs)
EXPORT_SYMBOL(blk_delay_queue);
/**
+ * blk_start_queue_async - asynchronously restart a previously stopped queue
+ * @q: The &struct request_queue in question
+ *
+ * Description:
+ * blk_start_queue_async() will clear the stop flag on the queue, and
+ * ensure that the request_fn for the queue is run from an async
+ * context.
+ **/
+void blk_start_queue_async(struct request_queue *q)
+{
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+ blk_run_queue_async(q);
+}
+EXPORT_SYMBOL(blk_start_queue_async);
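
A hedged usage sketch for the new helper (assumed driver context, not from the patch): a driver that previously stopped its queue restarts it without running the request_fn synchronously, which blk_start_queue() would do; kblockd runs the queue later. The locking mirrors blk_start_queue() and is an assumption here.

static void my_restart_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_start_queue_async(q);	/* clears QUEUE_FLAG_STOPPED, defers to kblockd */
	spin_unlock_irqrestore(q->queue_lock, flags);
}
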
+
+/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
*
@@ -1689,8 +1705,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
struct request *req;
unsigned int request_count = 0;
- blk_queue_split(q, &bio, q->bio_split);
-
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1698,6 +1712,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
*/
blk_queue_bounce(q, &bio);
+ blk_queue_split(q, &bio, q->bio_split);
+
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_error = -EIO;
bio_endio(bio);
@@ -2114,7 +2130,8 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
EXPORT_SYMBOL(submit_bio);
/**
- * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * blk_cloned_rq_check_limits - Helper function to check a cloned request
+ * for the new queue limits
* @q: the queue
* @rq: the request being checked
*
@@ -2125,20 +2142,13 @@ EXPORT_SYMBOL(submit_bio);
* after it is inserted to @q, it should be checked against @q before
* the insertion using this generic function.
*
- * This function should also be useful for request stacking drivers
- * in some cases below, so export this function.
* Request stacking drivers like request-based dm may change the queue
- * limits while requests are in the queue (e.g. dm's table swapping).
- * Such request stacking drivers should check those requests against
- * the new queue limits again when they dispatch those requests,
- * although such checkings are also done against the old queue limits
- * when submitting requests.
+ * limits when retrying requests on other queues. Those requests need
+ * to be checked against the new queue limits again during dispatch.
*/
-int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+static int blk_cloned_rq_check_limits(struct request_queue *q,
+ struct request *rq)
{
- if (!rq_mergeable(rq))
- return 0;
-
if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
printk(KERN_ERR "%s: over max size limit.\n", __func__);
return -EIO;
@@ -2158,7 +2168,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
return 0;
}
-EXPORT_SYMBOL_GPL(blk_rq_check_limits);
/**
* blk_insert_cloned_request - Helper for stacking drivers to submit a request
@@ -2170,7 +2179,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
unsigned long flags;
int where = ELEVATOR_INSERT_BACK;
- if (blk_rq_check_limits(q, rq))
+ if (blk_cloned_rq_check_limits(q, rq))
return -EIO;
if (rq->rq_disk &&
@@ -3412,6 +3421,9 @@ int blk_pre_runtime_suspend(struct request_queue *q)
{
int ret = 0;
+ if (!q->dev)
+ return ret;
+
spin_lock_irq(q->queue_lock);
if (q->nr_pending) {
ret = -EBUSY;
@@ -3439,6 +3451,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend);
*/
void blk_post_runtime_suspend(struct request_queue *q, int err)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
if (!err) {
q->rpm_status = RPM_SUSPENDED;
@@ -3463,6 +3478,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend);
*/
void blk_pre_runtime_resume(struct request_queue *q)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
q->rpm_status = RPM_RESUMING;
spin_unlock_irq(q->queue_lock);
@@ -3485,6 +3503,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
*/
void blk_post_runtime_resume(struct request_queue *q, int err)
{
+ if (!q->dev)
+ return;
+
spin_lock_irq(q->queue_lock);
if (!err) {
q->rpm_status = RPM_ACTIVE;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index de5716d8e525..e01405a3e8b3 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -76,6 +76,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned seg_size = 0, nsegs = 0, sectors = 0;
+ unsigned front_seg_size = bio->bi_seg_front_size;
+ bool do_split = true;
+ struct bio *new = NULL;
bio_for_each_segment(bv, bio, iter) {
if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
@@ -98,8 +101,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
seg_size += bv.bv_len;
bvprv = bv;
- bvprvp = &bv;
+ bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
+
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
continue;
}
new_segment:
@@ -108,16 +114,29 @@ new_segment:
nsegs++;
bvprv = bv;
- bvprvp = &bv;
+ bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
+
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
}
- *segs = nsegs;
- return NULL;
+ do_split = false;
split:
*segs = nsegs;
- return bio_split(bio, sectors, GFP_NOIO, bs);
+
+ if (do_split) {
+ new = bio_split(bio, sectors, GFP_NOIO, bs);
+ if (new)
+ bio = new;
+ }
+
+ bio->bi_seg_front_size = front_seg_size;
+ if (seg_size > bio->bi_seg_back_size)
+ bio->bi_seg_back_size = seg_size;
+
+ return do_split ? new : NULL;
}
void blk_queue_split(struct request_queue *q, struct bio **bio,
@@ -412,6 +431,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
if (sg)
sg_mark_end(sg);
+ /*
+ * Something must have gone wrong if the computed number of
+ * segments is bigger than the number of the request's physical segments
+ */
+ WARN_ON(nsegs > rq->nr_phys_segments);
+
return nsegs;
}
EXPORT_SYMBOL(blk_rq_map_sg);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ae09de62f19..6d6f8feb48c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1291,15 +1291,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio);
/*
- * we do limited pluging. If bio can be merged, do merge.
+ * We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
*/
if (plug) {
/*
* The plug list might get flushed before this. If that
- * happens, same_queue_rq is invalid and plug list is empty
- **/
+ * happens, same_queue_rq is invalid and plug list is
+ * empty
+ */
if (same_queue_rq && !list_empty(&plug->mq_list)) {
old_rq = same_queue_rq;
list_del_init(&old_rq->queuelist);
@@ -1380,12 +1381,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio);
if (!request_count)
trace_block_plug(q);
- else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+
+ blk_mq_put_ctx(data.ctx);
+
+ if (request_count >= BLK_MAX_REQUEST_COUNT) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
+
list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
return cookie;
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 7d8f129a1516..dd4973583978 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,7 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
- lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+ lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors =
+ BLK_SAFE_MAX_SECTORS;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
@@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
+ lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
- * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
- * @limits: the queue limits
+ * blk_queue_max_hw_sectors - set max sectors for a request for this queue
+ * @q: the request queue for the device
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
@@ -224,13 +226,19 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
* the device driver based upon the capabilities of the I/O
* controller.
*
+ * max_dev_sectors is a hard limit imposed by the storage device for
+ * READ/WRITE requests. It is set by the disk driver.
+ *
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
* per-device basis in /sys/block/<device>/queue/max_sectors_kb.
* The soft limit can not exceed max_hw_sectors.
**/
-void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
+ struct queue_limits *limits = &q->limits;
+ unsigned int max_sectors;
+
if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
@@ -238,22 +246,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
}
limits->max_hw_sectors = max_hw_sectors;
- limits->max_sectors = min_t(unsigned int, max_hw_sectors,
- BLK_DEF_MAX_SECTORS);
-}
-EXPORT_SYMBOL(blk_limits_max_hw_sectors);
-
-/**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q: the request queue for the device
- * @max_hw_sectors: max hardware sectors in the usual 512b unit
- *
- * Description:
- * See description for blk_limits_max_hw_sectors().
- **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
-{
- blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
+ max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+ max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
+ limits->max_sectors = max_sectors;
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
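
An illustrative fragment (hypothetical driver, not part of the patch) of what the reworked helper now does for callers: the driver still advertises only its controller limit, and the block layer derives max_sectors from max_hw_sectors, max_dev_sectors and BLK_DEF_MAX_SECTORS.

static void my_set_limits(struct request_queue *q)
{
	/* max_sectors becomes min_not_zero(max_hw_sectors, max_dev_sectors),
	 * capped at BLK_DEF_MAX_SECTORS */
	blk_queue_max_hw_sectors(q, 8 * 1024);	/* 4 MiB in 512-byte sectors */
}
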
@@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+ t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 565b8dac5782..e140cc487ce1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -205,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
if (ret < 0)
return ret;
+ max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
+ q->limits.max_dev_sectors >> 1);
+
if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
return -EINVAL;
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 246dfb16c3d9..aa40aa93381b 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -158,11 +158,13 @@ void blk_abort_request(struct request *req)
{
if (blk_mark_rq_complete(req))
return;
- blk_delete_timer(req);
- if (req->q->mq_ops)
+
+ if (req->q->mq_ops) {
blk_mq_rq_timed_out(req, false);
- else
+ } else {
+ blk_delete_timer(req);
blk_rq_timed_out(req);
+ }
}
EXPORT_SYMBOL_GPL(blk_abort_request);
diff --git a/block/blk.h b/block/blk.h
index da722eb786df..c43926d3d74d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -72,8 +72,6 @@ void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
-int blk_queue_enter(struct request_queue *q, gfp_t gfp);
-void blk_queue_exit(struct request_queue *q);
void blk_freeze_queue(struct request_queue *q);
static inline void blk_queue_enter_live(struct request_queue *q)
diff --git a/block/genhd.c b/block/genhd.c
index e5cafa51567c..5aaeb2ad0fd3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
+#include <linux/badblocks.h>
#include "blk.h"
@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
- disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
+/* sysfs access to bad-blocks list. */
+static ssize_t disk_badblocks_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return sprintf(page, "\n");
+
+ return badblocks_show(disk->bb, page, 0);
+}
+
+static ssize_t disk_badblocks_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t len)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return -ENXIO;
+
+ return badblocks_store(disk->bb, page, len, 0);
+}
+
/**
* get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for
@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
+ disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
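
The new badblocks attribute only returns data when a driver points disk->bb at an initialized instance; a hypothetical wiring sketch (names invented, not from the patch):

static int my_attach_badblocks(struct device *dev, struct gendisk *disk,
			       struct badblocks *bb)
{
	int err = devm_init_badblocks(dev, bb);

	if (err)
		return err;
	disk->bb = bb;		/* consumed by disk_badblocks_show()/store() */
	return 0;
}
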
diff --git a/block/ioctl.c b/block/ioctl.c
index 0918aed2d847..2c84683aada5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
+#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
+#ifdef CONFIG_FS_DAX
+bool blkdev_dax_capable(struct block_device *bdev)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!disk->fops->direct_access)
+ return false;
+
+ /*
+ * If the partition is not aligned on a page boundary, we can't
+ * do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ return false;
+
+ /*
+ * If the device has known bad blocks, force all I/O through the
+ * driver / page cache.
+ *
+ * TODO: support finer grained dax error handling
+ */
+ if (disk->bb && disk->bb->count)
+ return false;
+
+ return true;
+}
+
+static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
+{
+ unsigned long arg;
+ int rc = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (get_user(arg, (int __user *)(argp)))
+ return -EFAULT;
+ arg = !!arg;
+ if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
+ return 0;
+
+ if (arg)
+ arg = S_DAX;
+
+ if (arg && !blkdev_dax_capable(bdev))
+ return -ENOTTY;
+
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ if (bdev->bd_map_count == 0)
+ inode_set_flags(bdev->bd_inode, arg, S_DAX);
+ else
+ rc = -EBUSY;
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ return rc;
+}
+#else
+static int blkdev_daxset(struct block_device *bdev, int arg)
+{
+ if (arg)
+ return -ENOTTY;
+ return 0;
+}
+#endif
+
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKDAXSET:
+ return blkdev_daxset(bdev, arg);
+ case BLKDAXGET:
+ return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
+ break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
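
A hedged userspace sketch of the new DAX ioctls (assumes the matching BLKDAXSET/BLKDAXGET uapi definitions and a hypothetical /dev/pmem0 node; error handling abbreviated):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/dev/pmem0", O_RDWR);
	int enable = 1, flag = 0;

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKDAXSET, &enable))	/* ENOTTY if not DAX-capable */
		perror("BLKDAXSET");
	if (!ioctl(fd, BLKDAXGET, &flag))
		printf("S_DAX is %s\n", flag ? "set" : "clear");
	return 0;
}
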
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 3de89d4690f3..a163c487cf38 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq,
static int noop_dispatch(struct request_queue *q, int force)
{
struct noop_data *nd = q->elevator->elevator_data;
+ struct request *rq;
- if (!list_empty(&nd->queue)) {
- struct request *rq;
- rq = list_entry(nd->queue.next, struct request, queuelist);
+ rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
+ if (rq) {
list_del_init(&rq->queuelist);
elv_dispatch_sort(q, rq);
return 1;
@@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq)
if (rq->queuelist.prev == &nd->queue)
return NULL;
- return list_entry(rq->queuelist.prev, struct request, queuelist);
+ return list_prev_entry(rq, queuelist);
}
static struct request *
@@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq)
if (rq->queuelist.next == &nd->queue)
return NULL;
- return list_entry(rq->queuelist.next, struct request, queuelist);
+ return list_next_entry(rq, queuelist);
}
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3b030157ec85..746935a5973c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
struct hd_struct *part;
int res;
- if (bdev->bd_part_count)
+ if (bdev->bd_part_count || bdev->bd_super)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c2c48ec64b27..621317ac4d59 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state)
Sector sect;
unsigned char *data;
int slot, blocks_in_map;
- unsigned secsize;
+ unsigned secsize, datasize, partoffset;
#ifdef CONFIG_PPC_PMAC
int found_root = 0;
int found_root_goodness = 0;
@@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
- data = read_part_sector(state, secsize/512, &sect);
+ datasize = round_down(secsize, 512);
+ data = read_part_sector(state, datasize / 512, &sect);
if (!data)
return -1;
- part = (struct mac_partition *) (data + secsize%512);
+ partoffset = secsize % 512;
+ if (partoffset + sizeof(*part) > datasize)
+ return -1;
+ part = (struct mac_partition *) (data + partoffset);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
return 0; /* not a MacOS disk */