diff options
| author | Jens Axboe <axboe@kernel.dk> | 2026-04-08 15:53:16 +0300 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2026-04-08 15:53:16 +0300 |
| commit | d0cc5f585f8b140cbab326f0e44f966aab52f2c8 (patch) | |
| tree | d04f20cb5fc13d6a39d93eceab3795c84b2af0b5 | |
| parent | 2d148a214b24b4a2525f649cced0c3e9e57281cd (diff) | |
| parent | 7f9f7c697474268d9ef9479df3ddfe7cdcfbbffc (diff) | |
| download | linux-d0cc5f585f8b140cbab326f0e44f966aab52f2c8.tar.xz | |
Merge tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.1/block
Pull MD changes from Yu Kuai:
"Bug Fixes:
- avoid a sysfs deadlock when clearing array state (Yu Kuai)
- validate raid5 journal payloads before reading metadata (Junrui Luo)
- fall back to the correct bitmap operations after version mismatches
(Yu Kuai)
- serialize overlapping writes on writemostly raid1 disks (Xiao Ni)
- wake raid456 reshape waiters before suspend (Yu Kuai)
- prevent retry_aligned_read() from triggering soft lockups
(Chia-Ming Chang)
Improvements:
- switch raid0 strip zone and devlist allocations to kvmalloc helpers
(Gregory Price)
- track clean unwritten stripes for proactive RAID5 parity building
(Yu Kuai)
- speed up initial llbitmap sync with write_zeroes_unmap support
(Yu Kuai)
Cleanups:
- remove the unused static md workqueue definition
(Abd-Alrhman Masalkhi)"
* tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux:
md/raid5: fix soft lockup in retry_aligned_read()
md: wake raid456 reshape waiters before suspend
md/raid1: serialize overlap io for writemostly disk
md/md-llbitmap: optimize initial sync with write_zeroes_unmap support
md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building
md: add fallback to correct bitmap_ops on version mismatch
md/raid5: validate payload size before accessing journal metadata
md: remove unused static md_wq workqueue
md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations
md: fix array_state=clear sysfs deadlock
| -rw-r--r-- | drivers/md/md-llbitmap.c | 202 | ||||
| -rw-r--r-- | drivers/md/md.c | 139 | ||||
| -rw-r--r-- | drivers/md/md.h | 5 | ||||
| -rw-r--r-- | drivers/md/raid0.c | 18 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 47 | ||||
| -rw-r--r-- | drivers/md/raid5-cache.c | 48 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 8 |
7 files changed, 405 insertions(+), 62 deletions(-)
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index cdfecaca216b..9e7e6b1a6f15 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -208,6 +208,20 @@ enum llbitmap_state { BitNeedSync, /* data is synchronizing */ BitSyncing, + /* + * Proactive sync requested for unwritten region (raid456 only). + * Triggered via sysfs when user wants to pre-build XOR parity + * for regions that have never been written. + */ + BitNeedSyncUnwritten, + /* Proactive sync in progress for unwritten region */ + BitSyncingUnwritten, + /* + * XOR parity has been pre-built for a region that has never had + * user data written. When user writes to this region, it transitions + * to BitDirty. + */ + BitCleanUnwritten, BitStateCount, BitNone = 0xff, }; @@ -232,6 +246,12 @@ enum llbitmap_action { * BitNeedSync. */ BitmapActionStale, + /* + * Proactive sync trigger for raid456 - builds XOR parity for + * Unwritten regions without requiring user data write first. + */ + BitmapActionProactiveSync, + BitmapActionClearUnwritten, BitmapActionCount, /* Init state is BitUnwritten */ BitmapActionInit, @@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNeedSyncUnwritten, + [BitmapActionClearUnwritten] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, @@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, @@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = 
BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, @@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, @@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, + }, + [BitNeedSyncUnwritten] = { + [BitmapActionStartwrite] = BitNeedSync, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitSyncingUnwritten] = { + [BitmapActionStartwrite] = BitSyncing, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitCleanUnwritten, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitCleanUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = 
BitUnwritten, }, }; @@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; break; case BitClean: + case BitCleanUnwritten: pctl->state[pos] = BitDirty; break; } @@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, - int offset) + int offset, bool infect) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; @@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. */ - if (llbitmap->mddev->degraded) { + if (llbitmap->mddev->degraded || !infect) { set_bit(block, pctl->dirty); return; } @@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) - llbitmap_set_page_dirty(llbitmap, idx, bit); + llbitmap_set_page_dirty(llbitmap, idx, bit, true); + else if (state == BitNeedSyncUnwritten) + llbitmap_set_page_dirty(llbitmap, idx, bit, false); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) @@ -585,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap) return 0; } +/* + * Check if all underlying disks support write_zeroes with unmap. + */ +static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) + return false; + } + + return true; +} + +/* + * Issue write_zeroes to all underlying disks to zero their data regions. + * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). 
+ * Returns true if all disks were successfully zeroed. + */ +static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + sector_t dev_sectors = mddev->dev_sectors; + int ret; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + ret = blkdev_issue_zeroout(rdev->bdev, + rdev->data_offset, + dev_sectors, + GFP_KERNEL, 0); + if (ret) { + pr_warn("md/llbitmap: failed to zero disk %pg: %d\n", + rdev->bdev, ret); + return false; + } + } + + return true; +} + static void llbitmap_init_state(struct llbitmap *llbitmap) { + struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitUnwritten; unsigned long i; - if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) { state = BitClean; + } else if (raid_is_456(mddev) && + llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) { + /* + * All disks support write_zeroes with unmap. Zero all disks + * to ensure parity consistency, then set BitCleanUnwritten + * to skip initial sync. + */ + if (llbitmap_zero_all_disks(llbitmap)) + state = BitCleanUnwritten; + } for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); @@ -627,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, goto write_bitmap; } - if (c == BitNeedSync) + if (c == BitNeedSync || c == BitNeedSyncUnwritten) need_resync = !mddev->degraded; state = state_machine[c][action]; - write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. 
*/ @@ -658,8 +786,7 @@ write_bitmap: } llbitmap_write(llbitmap, state, start); - - if (state == BitNeedSync) + if (state == BitNeedSync || state == BitNeedSyncUnwritten) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) @@ -1229,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) unsigned long p = offset >> llbitmap->chunkshift; enum llbitmap_state c = llbitmap_read(llbitmap, p); - return c == BitClean || c == BitDirty; + return c == BitClean || c == BitDirty || c == BitCleanUnwritten; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) @@ -1243,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) if (c == BitUnwritten) return blocks; + /* Skip CleanUnwritten - no user data, will be reset after recovery */ + if (c == BitCleanUnwritten) + return blocks; + /* For degraded array, don't skip */ if (mddev->degraded) return 0; @@ -1261,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state state; + + /* + * Before recovery starts, convert CleanUnwritten to Unwritten. + * This ensures the new disk won't have stale parity data. + */ + if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionClearUnwritten); + /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. 
*/ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); - return llbitmap_state_machine(llbitmap, p, p, - BitmapActionStartsync) == BitSyncing; + state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); + return state == BitSyncing || state == BitSyncingUnwritten; } /* Something is wrong, sync_thread stop at @offset */ @@ -1474,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page) } mutex_unlock(&mddev->bitmap_info.mutex); - return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + return sprintf(page, + "unwritten %d\nclean %d\ndirty %d\n" + "need sync %d\nsyncing %d\n" + "need sync unwritten %d\nsyncing unwritten %d\n" + "clean unwritten %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], - bits[BitNeedSync], bits[BitSyncing]); + bits[BitNeedSync], bits[BitSyncing], + bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], + bits[BitCleanUnwritten]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); @@ -1549,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); +static ssize_t +proactive_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap; + + /* Only for RAID-456 */ + if (!raid_is_456(mddev)) + return -EINVAL; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return -ENODEV; + } + + /* Trigger proactive sync on all Unwritten regions */ + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionProactiveSync); + + mutex_unlock(&mddev->bitmap_info.mutex); + return len; +} + +static struct md_sysfs_entry llbitmap_proactive_sync = + __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); + static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, 
&llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, + &llbitmap_proactive_sync.attr, NULL }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 521d9b34cd9e..5fb5ae8368ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for @@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev) spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; @@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) } percpu_ref_kill(&mddev->active_io); + + /* + * RAID456 IO can sleep in wait_for_reshape while still holding an + * active_io reference. If reshape is already interrupted or frozen, + * wake those waiters so they can abort and drop the reference instead + * of deadlocking suspend. + */ + if (mddev->pers && mddev->pers->prepare_suspend && + reshape_interrupted(mddev)) + mddev->pers->prepare_suspend(mddev); + if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); @@ -6130,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); - mddev_put(mddev); + /* + * For "array_state=clear", dropping the extra kobject reference from + * sysfs_break_active_protection() can trigger md kobject deletion. + * Restore active protection before mddev_put() so deletion happens + * after the sysfs write path fully unwinds. 
+ */ if (kn) sysfs_unbreak_active_protection(kn); + mddev_put(mddev); return rv; } @@ -6449,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +/* + * Read bitmap superblock and return the bitmap_id based on disk version. + * This is used as fallback when default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set bitmap_type. + */ +static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct page *sb_page; + bitmap_super_t *sb; + enum md_submodule_id id = ID_BITMAP_NONE; + sector_t sector; + u32 version; + + if (!mddev->bitmap_info.offset) + return ID_BITMAP_NONE; + + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) { + pr_warn("md: %s: failed to allocate memory for bitmap\n", + mdname(mddev)); + return ID_BITMAP_NONE; + } + + sector = mddev->bitmap_info.offset; + + rdev_for_each(rdev, mddev) { + u32 iosize; + + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; + + iosize = roundup(sizeof(bitmap_super_t), + bdev_logical_block_size(rdev->bdev)); + if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, + true)) + goto read_ok; + } + pr_warn("md: %s: failed to read bitmap from any device\n", + mdname(mddev)); + goto out; + +read_ok: + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_warn("md: %s: invalid bitmap magic 0x%x\n", + mdname(mddev), le32_to_cpu(sb->magic)); + goto out_unmap; + } + + version = le32_to_cpu(sb->version); + switch (version) { + case BITMAP_MAJOR_LO: + case BITMAP_MAJOR_HI: + case BITMAP_MAJOR_CLUSTERED: + id = ID_BITMAP; + break; + case BITMAP_MAJOR_LOCKLESS: + id = ID_LLBITMAP; + break; + default: + pr_warn("md: %s: unknown bitmap version %u\n", + mdname(mddev), version); + break; + } + +out_unmap: + kunmap_local(sb); +out: + __free_page(sb_page); + return id; +} + static int 
md_bitmap_create(struct mddev *mddev) { + enum md_submodule_id orig_id = mddev->bitmap_id; + enum md_submodule_id sb_id; + int err; + if (mddev->bitmap_id == ID_BITMAP_NONE) return -EINVAL; if (!mddev_set_bitmap_ops(mddev)) return -ENOENT; - return mddev->bitmap_ops->create(mddev); + err = mddev->bitmap_ops->create(mddev); + if (!err) + return 0; + + /* + * Create failed, if default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set + * bitmap_type, set bitmap_ops based on the disk version. + */ + mddev_clear_bitmap_ops(mddev); + + sb_id = md_bitmap_get_id_from_sb(mddev); + if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) + return err; + + pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", + mdname(mddev), orig_id, sb_id); + + mddev->bitmap_id = sb_id; + if (!mddev_set_bitmap_ops(mddev)) { + mddev->bitmap_id = orig_id; + return -ENOENT; + } + + err = mddev->bitmap_ops->create(mddev); + if (err) { + mddev_clear_bitmap_ops(mddev); + mddev->bitmap_id = orig_id; + } + + return err; } static void md_bitmap_destroy(struct mddev *mddev) @@ -10505,10 +10629,6 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); - if (!md_wq) - goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; @@ -10533,8 +10653,6 @@ err_mdp: err_md: destroy_workqueue(md_misc_wq); err_misc_wq: - destroy_workqueue(md_wq); -err_wq: md_llbitmap_exit(); err_bitmap: md_bitmap_exit(); @@ -10843,7 +10961,6 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_wq); md_bitmap_exit(); } diff --git a/drivers/md/md.h b/drivers/md/md.h index ac84289664cd..d6f5482e2479 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,6 @@ enum sync_action { struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; - wait_queue_head_t 
serial_io_wait; }; /* @@ -381,7 +380,11 @@ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ + sector_t wnode_start; /* address of waiting nodes on the same list */ sector_t _subtree_last; /* highest sector in subtree of rb node */ + struct list_head list_node; + struct list_head waiters; + struct completion ready; }; /* diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef0045db409f..5e38a51e349a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } err = -ENOMEM; - conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones); + conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones); if (!conf->strip_zone) goto abort; - conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), - conf->nr_strip_zones, - mddev->raid_disks), - GFP_KERNEL); + conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); if (!conf->devlist) goto abort; @@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) return 0; abort: - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); *private_conf = ERR_PTR(err); return err; @@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 16f671ab12c0..ba91f7e61920 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, - struct 
serial_info *si, int idx) + struct serial_info *si) { unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; sector_t hi = lo + r1_bio->sectors - 1; + int idx = sector_to_idx(r1_bio->sector); struct serial_in_rdev *serial = &rdev->serial[idx]; + struct serial_info *head_si; spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ - if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + if (head_si && head_si != si) { + si->start = lo; + si->last = hi; + si->wnode_start = head_si->wnode_start; + list_add_tail(&si->list_node, &head_si->waiters); ret = -EBUSY; - else { + } else if (!head_si) { si->start = lo; si->last = hi; + si->wnode_start = si->start; raid1_rb_insert(si, &serial->serial_rb); } spin_unlock_irqrestore(&serial->serial_lock, flags); @@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) { struct mddev *mddev = rdev->mddev; struct serial_info *si; - int idx = sector_to_idx(r1_bio->sector); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (WARN_ON(!mddev->serial_info_pool)) return; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, r1_bio, si, idx) == 0); + INIT_LIST_HEAD(&si->waiters); + INIT_LIST_HEAD(&si->list_node); + init_completion(&si->ready); + while (check_and_add_serial(rdev, r1_bio, si)) { + wait_for_completion(&si->ready); + reinit_completion(&si->ready); + } } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *si; + struct serial_info *si, *iter_si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; @@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); si; si = raid1_rb_iter_next(si, lo, hi)) { if (si->start == lo && si->last == hi) { - raid1_rb_remove(si, 
&serial->serial_rb); - mempool_free(si, mddev->serial_info_pool); found = 1; break; } } - if (!found) + if (found) { + raid1_rb_remove(si, &serial->serial_rb); + if (!list_empty(&si->waiters)) { + list_for_each_entry(iter_si, &si->waiters, list_node) { + if (iter_si->wnode_start == si->wnode_start) { + list_del_init(&iter_si->list_node); + list_splice_init(&si->waiters, &iter_si->waiters); + raid1_rb_insert(iter_si, &serial->serial_rb); + complete(&iter_si->ready); + break; + } + } + } + mempool_free(si, mddev->serial_info_pool); + } else { WARN(1, "The write IO is not recorded for serialization\n"); + } spin_unlock_irqrestore(&serial->serial_lock, flags); - wake_up(&serial->serial_io_wait); } /* diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66b10cbda96d..7b7546bfa21f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, return -ENOMEM; while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; + payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload->checksum[1]) 
< 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); - } else { - /* DATA or PARITY payload */ + if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) { log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } - + mb_offset += payload_len; } put_page(page); @@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; int dd; payload = (void *)mb + mb_offset; @@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > + le32_to_cpu(mb->meta_size)) + return -EINVAL; + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); for (i = 0; i < count; ++i) { stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); @@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } } - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); + mb_offset += payload_len; continue; } /* DATA or PARITY payload */ + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if 
(mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + return -EINVAL; + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, @@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + mb_offset += payload_len; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f8360d4cdb7..6e79829c5acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - raid5_release_stripe(sh); + int hash; + + spin_lock_irq(&conf->device_lock); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, + &conf->temp_inactive_list[hash]); + spin_unlock_irq(&conf->device_lock); conf->retry_read_aligned = raid_bio; conf->retry_read_offset = scnt; return handled; |
