From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:50:16 +0200 Subject: klist: don't iterate over deleted entries A klist entry is kept on the list till all its current iterations are finished; however, a new iteration after deletion also iterates over deleted entries as long as their reference count stays above zero. This causes problems for cases where there are users which iterate over the list while synchronized against list manipulations and natuarally expect already deleted entries to not show up during iteration. This patch implements dead flag which gets set on deletion so that iteration can skip already deleted entries. The dead flag piggy backs on the lowest bit of knode->n_klist and only visible to klist implementation proper. While at it, drop klist_iter->i_head as it's redundant and doesn't offer anything in semantics or performance wise as klist_iter->i_klist is dereferenced on every iteration anyway. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Jens Axboe Signed-off-by: Jens Axboe --- lib/klist.c | 96 ++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 70 insertions(+), 26 deletions(-) (limited to 'lib') diff --git a/lib/klist.c b/lib/klist.c index cca37f96faa2..bbdd3015c2c7 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -37,6 +37,37 @@ #include #include +/* + * Use the lowest bit of n_klist to mark deleted nodes and exclude + * dead ones from iteration. + */ +#define KNODE_DEAD 1LU +#define KNODE_KLIST_MASK ~KNODE_DEAD + +static struct klist *knode_klist(struct klist_node *knode) +{ + return (struct klist *) + ((unsigned long)knode->n_klist & KNODE_KLIST_MASK); +} + +static bool knode_dead(struct klist_node *knode) +{ + return (unsigned long)knode->n_klist & KNODE_DEAD; +} + +static void knode_set_klist(struct klist_node *knode, struct klist *klist) +{ + knode->n_klist = klist; + /* no knode deserves to start its life dead */ + WARN_ON(knode_dead(knode)); +} + +static void knode_kill(struct klist_node *knode) +{ + /* and no knode should die twice ever either, see we're very humane */ + WARN_ON(knode_dead(knode)); + *(unsigned long *)&knode->n_klist |= KNODE_DEAD; +} /** * klist_init - Initialize a klist structure. @@ -79,7 +110,7 @@ static void klist_node_init(struct klist *k, struct klist_node *n) INIT_LIST_HEAD(&n->n_node); init_completion(&n->n_removed); kref_init(&n->n_ref); - n->n_klist = k; + knode_set_klist(n, k); if (k->get) k->get(n); } @@ -115,7 +146,7 @@ EXPORT_SYMBOL_GPL(klist_add_tail); */ void klist_add_after(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -131,7 +162,7 @@ EXPORT_SYMBOL_GPL(klist_add_after); */ void klist_add_before(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -144,9 +175,10 @@ static void klist_release(struct kref *kref) { struct klist_node *n = container_of(kref, struct klist_node, n_ref); + WARN_ON(!knode_dead(n)); list_del(&n->n_node); complete(&n->n_removed); - n->n_klist = NULL; + knode_set_klist(n, NULL); } static int klist_dec_and_del(struct klist_node *n) @@ -154,22 +186,29 @@ static int klist_dec_and_del(struct klist_node *n) return kref_put(&n->n_ref, klist_release); } -/** - * klist_del - Decrement the reference count of node and try to remove. - * @n: node we're deleting. - */ -void klist_del(struct klist_node *n) +static void klist_put(struct klist_node *n, bool kill) { - struct klist *k = n->n_klist; + struct klist *k = knode_klist(n); void (*put)(struct klist_node *) = k->put; spin_lock(&k->k_lock); + if (kill) + knode_kill(n); if (!klist_dec_and_del(n)) put = NULL; spin_unlock(&k->k_lock); if (put) put(n); } + +/** + * klist_del - Decrement the reference count of node and try to remove. + * @n: node we're deleting. + */ +void klist_del(struct klist_node *n) +{ + klist_put(n, true); +} EXPORT_SYMBOL_GPL(klist_del); /** @@ -206,7 +245,6 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i, struct klist_node *n) { i->i_klist = k; - i->i_head = &k->k_list; i->i_cur = n; if (n) kref_get(&n->n_ref); @@ -237,7 +275,7 @@ EXPORT_SYMBOL_GPL(klist_iter_init); void klist_iter_exit(struct klist_iter *i) { if (i->i_cur) { - klist_del(i->i_cur); + klist_put(i->i_cur, false); i->i_cur = NULL; } } @@ -258,27 +296,33 @@ static struct klist_node *to_klist_node(struct list_head *n) */ struct klist_node *klist_next(struct klist_iter *i) { - struct list_head *next; - struct klist_node *lnode = i->i_cur; - struct klist_node *knode = NULL; void (*put)(struct klist_node *) = i->i_klist->put; + struct klist_node *last = i->i_cur; + struct klist_node *next; spin_lock(&i->i_klist->k_lock); - if (lnode) { - next = lnode->n_node.next; - if (!klist_dec_and_del(lnode)) + + if (last) { + next = to_klist_node(last->n_node.next); + if (!klist_dec_and_del(last)) put = NULL; } else - next = i->i_head->next; + next = to_klist_node(i->i_klist->k_list.next); - if (next != i->i_head) { - knode = to_klist_node(next); - kref_get(&knode->n_ref); + i->i_cur = NULL; + while (next != to_klist_node(&i->i_klist->k_list)) { + if (likely(!knode_dead(next))) { + kref_get(&next->n_ref); + i->i_cur = next; + break; + } + next = to_klist_node(next->n_node.next); } - i->i_cur = knode; + spin_unlock(&i->i_klist->k_lock); - if (put && lnode) - put(lnode); - return knode; + + if (put && last) + put(last); + return i->i_cur; } EXPORT_SYMBOL_GPL(klist_next); -- cgit v1.2.3 From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:25 +0900 Subject: block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT Extended devt introduces non-contiguos device numbers. This patch implements a debug option which forces most devt allocations to be from the extended area and spreads them out. This is enabled by default if DEBUG_KERNEL is set and achieves... 1. Detects code paths in kernel or userland which expect predetermined consecutive device numbers. 2. When something goes wrong, avoid corruption as adding to the minor of earlier partition won't lead to the wrong but valid device. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 38 +++++++++++++++++++++++++++++++++++--- drivers/ide/ide-disk.c | 6 ++++++ drivers/scsi/sd.c | 6 ++++++ lib/Kconfig.debug | 16 ++++++++++++++++ 4 files changed, 63 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/block/genhd.c b/block/genhd.c index ee4b13520e59..67e5a59ced2a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -298,6 +298,38 @@ EXPORT_SYMBOL(unregister_blkdev); static struct kobj_map *bdev_map; +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + /** * blk_alloc_devt - allocate a dev_t for a partition * @part: partition to allocate dev_t for @@ -339,7 +371,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return -EBUSY; } - *devt = MKDEV(BLOCK_EXT_MAJOR, idx); + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; } @@ -361,7 +393,7 @@ void blk_free_devt(dev_t devt) if (MAJOR(devt) == BLOCK_EXT_MAJOR) { mutex_lock(&ext_devt_mutex); - idr_remove(&ext_devt_idr, MINOR(devt)); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); mutex_unlock(&ext_devt_mutex); } } @@ -473,7 +505,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) struct hd_struct *part; mutex_lock(&ext_devt_mutex); - part = idr_find(&ext_devt_idr, MINOR(devt)); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 7a88de9ada29..a072df5053ae 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -42,7 +42,13 @@ #include #define IDE_DISK_PARTS (1 << PARTN_BITS) + +#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) #define IDE_DISK_MINORS IDE_DISK_PARTS +#else +#define IDE_DISK_MINORS 1 +#endif + #define IDE_DISK_EXT_MINORS (IDE_DISK_PARTS - IDE_DISK_MINORS) struct ide_disk_obj { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d1bb0e1d2d28..280d231a86ed 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -87,7 +87,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); #define SD_PARTS 64 + +#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) #define SD_MINORS 16 +#else +#define SD_MINORS 1 +#endif + #define SD_EXT_MINORS (SD_PARTS - SD_MINORS) static int sd_revalidate_disk(struct gendisk *); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b504814e378..5a536f703a83 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -624,6 +624,22 @@ config BACKTRACE_SELF_TEST Say N if you are unsure. +config DEBUG_BLOCK_EXT_DEVT + bool "Force extended block device numbers and spread them" + depends on DEBUG_KERNEL + depends on BLOCK + default y + help + Conventionally, block device numbers are allocated from + predetermined contiguous area. However, extended block area + may introduce non-contiguous block device numbers. This + option forces most block device numbers to be allocated from + the extended space and spreads them to discover kernel or + userland code paths which assume predetermined contiguous + device number allocation. + + Say N if you are unsure. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3 From 759f8ca3048f7438aa3129268d7252552505d662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Aug 2008 09:06:29 +0200 Subject: Change default value of CONFIG_DEBUG_BLOCK_EXT_DEVT to 'n' It's a debug option that you would explicitly enable to test this feature, we should default it to 'n' to prevent accidental surprises for now. Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5a536f703a83..4378d5e923ca 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -628,7 +628,7 @@ config DEBUG_BLOCK_EXT_DEVT bool "Force extended block device numbers and spread them" depends on DEBUG_KERNEL depends on BLOCK - default y + default n help Conventionally, block device numbers are allocated from predetermined contiguous area. However, extended block area -- cgit v1.2.3 From 55dc7db70a73a3809a2334063c9b5b0d8ccebdaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Sep 2008 13:44:35 +0200 Subject: init: DEBUG_BLOCK_EXT_DEVT requires explicit root= param DEBUG_BLOCK_EXT_DEVT shuffles SCSI and IDE device numbers and root device number set using rdev become meaningless. Root devices should be explicitly specified using textual names. Warn about it if root can't be found and DEBUG_BLOCK_EXT_DEVT is enabled. Also, add warning to the help text. Signed-off-by: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- init/do_mounts.c | 4 ++++ lib/Kconfig.debug | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'lib') diff --git a/init/do_mounts.c b/init/do_mounts.c index 3715feb8446d..d055b1914c3d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -263,6 +263,10 @@ retry: printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif panic("VFS: Unable to mount root fs on %s", b); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4378d5e923ca..c556896abe57 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -638,6 +638,12 @@ config DEBUG_BLOCK_EXT_DEVT userland code paths which assume predetermined contiguous device number allocation. + Note that turning on this debug option shuffles all the + device numbers for all IDE and SCSI devices including libata + ones, so root partition specified using device number + directly (via rdev or root=MAJ:MIN) won't work anymore. + Textual device names (root=/dev/sdXn) will continue to work. + Say N if you are unsure. config LKDTM -- cgit v1.2.3 From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:56:33 -0700 Subject: block: add fault injection mechanism for faking request timeouts Only works for the generic request timer handling. Allows one to sporadically ignore request completions, thus exercising the timeout handling. Signed-off-by: Jens Axboe --- block/blk-softirq.c | 2 ++ block/blk-timeout.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 12 ++++++++++ block/genhd.c | 8 +++++++ include/linux/blkdev.h | 1 + lib/Kconfig.debug | 13 ++++++++++- 6 files changed, 94 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 7ab344afb16f..e660d26ca656 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -154,6 +154,8 @@ do_local: **/ void blk_complete_request(struct request *req) { + if (unlikely(blk_should_fake_timeout(req->q))) + return; if (!blk_mark_rq_complete(req)) __blk_complete_request(req); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 6e5c781c5af1..9b4ad138bb33 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -4,9 +4,68 @@ #include #include #include +#include #include "blk.h" +#ifdef CONFIG_FAIL_IO_TIMEOUT + +static DECLARE_FAULT_ATTR(fail_io_timeout); + +static int __init setup_fail_io_timeout(char *str) +{ + return setup_fault_attr(&fail_io_timeout, str); +} +__setup("fail_io_timeout=", setup_fail_io_timeout); + +int blk_should_fake_timeout(struct request_queue *q) +{ + if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) + return 0; + + return should_fail(&fail_io_timeout, 1); +} + +static int __init fail_io_timeout_debugfs(void) +{ + return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); +} + +late_initcall(fail_io_timeout_debugfs); + +ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); + + return sprintf(buf, "%d\n", set != 0); +} + +ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + int val; + + if (count) { + struct request_queue *q = disk->queue; + char *p = (char *) buf; + + val = simple_strtoul(p, &p, 10); + spin_lock_irq(q->queue_lock); + if (val) + queue_flag_set(QUEUE_FLAG_FAIL_IO, q); + else + queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); + spin_unlock_irq(q->queue_lock); + } + + return count; +} + +#endif /* CONFIG_FAIL_IO_TIMEOUT */ + /* * blk_delete_timer - Delete/cancel timer for a given function. * @req: request that we are canceling timer for diff --git a/block/blk.h b/block/blk.h index a4f4a50aefaa..e5c579769963 100644 --- a/block/blk.h +++ b/block/blk.h @@ -42,6 +42,18 @@ static inline void blk_clear_rq_complete(struct request *rq) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); } +#ifdef CONFIG_FAIL_IO_TIMEOUT +int blk_should_fake_timeout(struct request_queue *); +ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); +ssize_t part_timeout_store(struct device *, struct device_attribute *, + const char *, size_t); +#else +static inline int blk_should_fake_timeout(struct request_queue *q) +{ + return 0; +} +#endif + struct io_context *current_io_context(gfp_t gfp_flags, int node); int ll_back_merge_fn(struct request_queue *q, struct request *req, diff --git a/block/genhd.c b/block/genhd.c index 8acaff0154e3..4cd3433c99ac 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,6 +817,11 @@ static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); #endif +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, + part_timeout_store); +#endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, @@ -828,6 +833,9 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, #endif NULL }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b47767c72ce3..e34999d14c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -440,6 +440,7 @@ struct request_queue #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ +#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ static inline int queue_is_locked(struct request_queue *q) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c556896abe57..7d7a31d0ddeb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -683,10 +683,21 @@ config FAIL_PAGE_ALLOC config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" - depends on FAULT_INJECTION + depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability for disk IO. +config FAIL_IO_TIMEOUT + bool "Faul-injection capability for faking disk interrupts" + depends on FAULT_INJECTION && BLOCK + help + Provide fault-injection capability on end IO handling. This + will make the block layer "forget" an interrupt as configured, + thus exercising the error handling. + + Only works with drivers that use the generic timeout handling, + for others it wont do anything. + config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS -- cgit v1.2.3