From 011abdc9df559ec75779bb7c53a744c69b2a94c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 26 Apr 2018 14:46:29 +1000 Subject: md: fix two problems with setting the "re-add" device state. If "re-add" is written to the "state" file for a device which is faulty, this has an effect similar to removing and re-adding the device. It should take up the same slot in the array that it previously had, and an accelerated (e.g. bitmap-based) rebuild should happen. The slot that "it previously had" is determined by rdev->saved_raid_disk. However this is not set when a device fails (only when a device is added), and it is cleared when resync completes. This means that "re-add" will normally work once, but may not work a second time. This patch includes two fixes. 1/ when a device fails, record the ->raid_disk value in ->saved_raid_disk before clearing ->raid_disk 2/ when "re-add" is written to a device for which ->saved_raid_disk is not set, fail. I think this is suitable for stable as it can cause re-adding a device to be forced to do a full resync which takes a lot longer and so puts data at more risk. Cc: (v4.1) Fixes: 97f6cd39da22 ("md-cluster: re-add capabilities") Signed-off-by: NeilBrown Reviewed-by: Goldwyn Rodrigues Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c208c01f63a5..bac480d75d1d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2853,7 +2853,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = 0; } } else if (cmd_match(buf, "re-add")) { - if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { + if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && + rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they @@ -8641,6 +8642,7 @@ static int remove_and_add_spares(struct mddev *mddev, if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; removed++; } -- cgit v1.2.3 From c42a0e2675721e1444f56e6132a07b7b1ec169ac Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 4 May 2018 18:08:10 +0800 Subject: md: fix NULL dereference of mddev->pers in remove_and_add_spares() We met NULL pointer BUG as follow: [ 151.760358] BUG: unable to handle kernel NULL pointer dereference at 0000000000000060 [ 151.761340] PGD 80000001011eb067 P4D 80000001011eb067 PUD 1011ea067 PMD 0 [ 151.762039] Oops: 0000 [#1] SMP PTI [ 151.762406] Modules linked in: [ 151.762723] CPU: 2 PID: 3561 Comm: mdadm-test Kdump: loaded Not tainted 4.17.0-rc1+ #238 [ 151.763542] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014 [ 151.764432] RIP: 0010:remove_and_add_spares.part.56+0x13c/0x3a0 [ 151.765061] RSP: 0018:ffffc90001d7fcd8 EFLAGS: 00010246 [ 151.765590] RAX: 0000000000000000 RBX: ffff88013601d600 RCX: 0000000000000000 [ 151.766306] RDX: 0000000000000000 RSI: ffff88013601d600 RDI: ffff880136187000 [ 151.767014] RBP: ffff880136187018 R08: 0000000000000003 R09: 0000000000000051 [ 151.767728] R10: ffffc90001d7fed8 R11: 0000000000000000 R12: ffff88013601d600 [ 151.768447] R13: ffff8801298b1300 R14: ffff880136187000 R15: 0000000000000000 [ 151.769160] FS: 00007f2624276700(0000) GS:ffff88013ae80000(0000) knlGS:0000000000000000 [ 151.769971] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 151.770554] CR2: 0000000000000060 CR3: 0000000111aac000 CR4: 00000000000006e0 [ 151.771272] Call Trace: [ 151.771542] md_ioctl+0x1df2/0x1e10 [ 151.771906] ? __switch_to+0x129/0x440 [ 151.772295] ? __schedule+0x244/0x850 [ 151.772672] blkdev_ioctl+0x4bd/0x970 [ 151.773048] block_ioctl+0x39/0x40 [ 151.773402] do_vfs_ioctl+0xa4/0x610 [ 151.773770] ? dput.part.23+0x87/0x100 [ 151.774151] ksys_ioctl+0x70/0x80 [ 151.774493] __x64_sys_ioctl+0x16/0x20 [ 151.774877] do_syscall_64+0x5b/0x180 [ 151.775258] entry_SYSCALL_64_after_hwframe+0x44/0xa9 For raid6, when two disk of the array are offline, two spare disks can be added into the array. Before spare disks recovery completing, system reboot and mdadm thinks it is ok to restart the degraded array by md_ioctl(). Since disks in raid6 is not only_parity(), raid5_run() will abort, when there is no PPL feature or not setting 'start_dirty_degraded' parameter. Therefore, mddev->pers is NULL. But, mddev->raid_disks has been set and it will not be cleared when raid5_run abort. md_ioctl() can execute cmd 'HOT_REMOVE_DISK' to remove a disk by mdadm, which will cause NULL pointer dereference in remove_and_add_spares() finally. Signed-off-by: Yufen Yu Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index bac480d75d1d..df9eb1a04f26 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6525,6 +6525,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + if (!mddev->pers) + return -ENODEV; + rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; -- cgit v1.2.3 From 5a409b4f56d50b212334f338cb8465d65550cd85 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Mon, 21 May 2018 11:49:54 +0800 Subject: MD: fix lock contention for flush bios There is a lock contention when there are many processes which send flush bios to md device. eg. Create many lvs on one raid device and mkfs.xfs on each lv. Now it just can handle flush request sequentially. It needs to wait mddev->flush_bio to be NULL, otherwise get mddev->lock. This patch remove mddev->flush_bio and handle flush bio asynchronously. I did a test with command dbench -s 128 -t 300. This is the test result: =================Without the patch============================ Operation Count AvgLat MaxLat -------------------------------------------------- Flush 11165 167.595 5879.560 Close 107469 1.391 2231.094 LockX 384 0.003 0.019 Rename 5944 2.141 1856.001 ReadX 208121 0.003 0.074 WriteX 98259 1925.402 15204.895 Unlink 25198 13.264 3457.268 UnlockX 384 0.001 0.009 FIND_FIRST 47111 0.012 0.076 SET_FILE_INFORMATION 12966 0.007 0.065 QUERY_FILE_INFORMATION 27921 0.004 0.085 QUERY_PATH_INFORMATION 124650 0.005 5.766 QUERY_FS_INFORMATION 22519 0.003 0.053 NTCreateX 141086 4.291 2502.812 Throughput 3.7181 MB/sec (sync open) 128 clients 128 procs max_latency=15204.905 ms =================With the patch============================ Operation Count AvgLat MaxLat -------------------------------------------------- Flush 4500 174.134 406.398 Close 48195 0.060 467.062 LockX 256 0.003 0.029 Rename 2324 0.026 0.360 ReadX 78846 0.004 0.504 WriteX 66832 562.775 1467.037 Unlink 5516 3.665 1141.740 UnlockX 256 0.002 0.019 FIND_FIRST 16428 0.015 0.313 SET_FILE_INFORMATION 6400 0.009 0.520 QUERY_FILE_INFORMATION 17865 0.003 0.089 QUERY_PATH_INFORMATION 47060 0.078 416.299 QUERY_FS_INFORMATION 7024 0.004 0.032 NTCreateX 55921 0.854 1141.452 Throughput 11.744 MB/sec (sync open) 128 clients 128 procs max_latency=1467.041 ms The test is done on raid1 disk with two rotational disks V5: V4 is more complicated than the version with memory pool. So revert to the memory pool version V4: use address of fbio to do hash to choose free flush info. V3: Shaohua suggests mempool is overkill. In v3 it allocs memory during creating raid device and uses a simple bitmap to record which resource is free. Fix a bug from v2. It should set flush_pending to 1 at first. V2: Neil pointed out two problems. One is counting error problem and another is return value when allocat memory fails. 1. counting error problem This isn't safe. It is only safe to call rdev_dec_pending() on rdevs that you previously called atomic_inc(&rdev->nr_pending); If an rdev was added to the list between the start and end of the flush, this will do something bad. Now it doesn't use bio_chain. It uses specified call back function for each flush bio. 2. Returned on IO error when kmalloc fails is wrong. I use mempool suggested by Neil in V2 3. Fixed some places pointed by Guoqing Suggested-by: Ming Lei Signed-off-by: Xiao Ni Signed-off-by: Shaohua Li --- drivers/md/md.c | 158 +++++++++++++++++++++++++++++++++++++------------------- drivers/md/md.h | 22 +++++--- 2 files changed, 120 insertions(+), 60 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index df9eb1a04f26..6b4e2f29fe4e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static void * flush_info_alloc(gfp_t gfp_flags, void *data) +{ + return kzalloc(sizeof(struct flush_info), gfp_flags); +} +static void flush_info_free(void *flush_info, void *data) +{ + kfree(flush_info); +} + +static void * flush_bio_alloc(gfp_t gfp_flags, void *data) +{ + return kzalloc(sizeof(struct flush_bio), gfp_flags); +} +static void flush_bio_free(void *flush_bio, void *data) +{ + kfree(flush_bio); +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -412,30 +430,53 @@ static int md_congested(void *data, int bits) /* * Generic flush handling for md */ +static void submit_flushes(struct work_struct *ws) +{ + struct flush_info *fi = container_of(ws, struct flush_info, flush_work); + struct mddev *mddev = fi->mddev; + struct bio *bio = fi->bio; + + bio->bi_opf &= ~REQ_PREFLUSH; + md_handle_request(mddev, bio); + + mempool_free(fi, mddev->flush_pool); +} -static void md_end_flush(struct bio *bio) +static void md_end_flush(struct bio *fbio) { - struct md_rdev *rdev = bio->bi_private; - struct mddev *mddev = rdev->mddev; + struct flush_bio *fb = fbio->bi_private; + struct md_rdev *rdev = fb->rdev; + struct flush_info *fi = fb->fi; + struct bio *bio = fi->bio; + struct mddev *mddev = fi->mddev; rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pre-request flush has finished */ - queue_work(md_wq, &mddev->flush_work); + if (atomic_dec_and_test(&fi->flush_pending)) { + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio); + else { + INIT_WORK(&fi->flush_work, submit_flushes); + queue_work(md_wq, &fi->flush_work); + } } - bio_put(bio); -} -static void md_submit_flush_data(struct work_struct *ws); + mempool_free(fb, mddev->flush_bio_pool); + bio_put(fbio); +} -static void submit_flushes(struct work_struct *ws) +void md_flush_request(struct mddev *mddev, struct bio *bio) { - struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; + struct flush_info *fi; + + fi = mempool_alloc(mddev->flush_pool, GFP_NOIO); + + fi->bio = bio; + fi->mddev = mddev; + atomic_set(&fi->flush_pending, 1); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && @@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws) * we reclaim rcu_read_lock */ struct bio *bi; + struct flush_bio *fb; atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); + + fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO); + fb->fi = fi; + fb->rdev = rdev; + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bi->bi_end_io = md_end_flush; - bi->bi_private = rdev; bio_set_dev(bi, rdev->bdev); + bi->bi_end_io = md_end_flush; + bi->bi_private = fb; bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - atomic_inc(&mddev->flush_pending); + + atomic_inc(&fi->flush_pending); submit_bio(bi); + rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); -} - -static void md_submit_flush_data(struct work_struct *ws) -{ - struct mddev *mddev = container_of(ws, struct mddev, flush_work); - struct bio *bio = mddev->flush_bio; - /* - * must reset flush_bio before calling into md_handle_request to avoid a - * deadlock, because other bios passed md_handle_request suspend check - * could wait for this and below md_handle_request could wait for those - * bios because of suspend check - */ - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); - - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); + if (atomic_dec_and_test(&fi->flush_pending)) { + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio); + else { + INIT_WORK(&fi->flush_work, submit_flushes); + queue_work(md_wq, &fi->flush_work); + } } } - -void md_flush_request(struct mddev *mddev, struct bio *bio) -{ - spin_lock_irq(&mddev->lock); - wait_event_lock_irq(mddev->sb_wait, - !mddev->flush_bio, - mddev->lock); - mddev->flush_bio = bio; - spin_unlock_irq(&mddev->lock); - - INIT_WORK(&mddev->flush_work, submit_flushes); - queue_work(md_wq, &mddev->flush_work); -} EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) @@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); spin_lock_init(&mddev->lock); - atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; @@ -5510,6 +5530,22 @@ int md_run(struct mddev *mddev) goto abort; } } + if (mddev->flush_pool == NULL) { + mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc, + flush_info_free, mddev); + if (!mddev->flush_pool) { + err = -ENOMEM; + goto abort; + } + } + if (mddev->flush_bio_pool == NULL) { + mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc, + flush_bio_free, mddev); + if (!mddev->flush_bio_pool) { + err = -ENOMEM; + goto abort; + } + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5669,6 +5705,14 @@ int md_run(struct mddev *mddev) return 0; abort: + if (mddev->flush_bio_pool) { + mempool_destroy(mddev->flush_bio_pool); + mddev->flush_bio_pool = NULL; + } + if (mddev->flush_pool){ + mempool_destroy(mddev->flush_pool); + mddev->flush_pool = NULL; + } if (mddev->bio_set) { bioset_free(mddev->bio_set); mddev->bio_set = NULL; @@ -5889,6 +5933,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); + if (mddev->flush_bio_pool) { + mempool_destroy(mddev->flush_bio_pool); + mddev->flush_bio_pool = NULL; + } + if (mddev->flush_pool) { + mempool_destroy(mddev->flush_pool); + mddev->flush_pool = NULL; + } if (mddev->bio_set) { bioset_free(mddev->bio_set); mddev->bio_set = NULL; diff --git a/drivers/md/md.h b/drivers/md/md.h index fbc925cce810..ffcb1ae217fe 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -252,6 +252,19 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_FLUSH_INFOS 8 +#define NR_FLUSH_BIOS 64 +struct flush_info { + struct bio *bio; + struct mddev *mddev; + struct work_struct flush_work; + atomic_t flush_pending; +}; +struct flush_bio { + struct flush_info *fi; + struct md_rdev *rdev; +}; + struct mddev { void *private; struct md_personality *pers; @@ -457,13 +470,8 @@ struct mddev { * metadata and bitmap writes */ - /* Generic flush handling. - * The last to finish preflush schedules a worker to submit - * the rest of the request (without the REQ_PREFLUSH flag). - */ - struct bio *flush_bio; - atomic_t flush_pending; - struct work_struct flush_work; + mempool_t *flush_pool; + mempool_t *flush_bio_pool; struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; -- cgit v1.2.3