summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jens Axboe <axboe@kernel.dk> 2026-04-08 15:53:16 +0300
committer Jens Axboe <axboe@kernel.dk> 2026-04-08 15:53:16 +0300
commit d0cc5f585f8b140cbab326f0e44f966aab52f2c8 (patch)
tree d04f20cb5fc13d6a39d93eceab3795c84b2af0b5
parent 2d148a214b24b4a2525f649cced0c3e9e57281cd (diff)
parent 7f9f7c697474268d9ef9479df3ddfe7cdcfbbffc (diff)
download linux-d0cc5f585f8b140cbab326f0e44f966aab52f2c8.tar.xz
Merge tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux into for-7.1/block
Pull MD changes from Yu Kuai:

"Bug Fixes:
 - avoid a sysfs deadlock when clearing array state (Yu Kuai)
 - validate raid5 journal payloads before reading metadata (Junrui Luo)
 - fall back to the correct bitmap operations after version mismatches (Yu Kuai)
 - serialize overlapping writes on writemostly raid1 disks (Xiao Ni)
 - wake raid456 reshape waiters before suspend (Yu Kuai)
 - prevent retry_aligned_read() from triggering soft lockups (Chia-Ming Chang)

 Improvements:
 - switch raid0 strip zone and devlist allocations to kvmalloc helpers (Gregory Price)
 - track clean unwritten stripes for proactive RAID5 parity building (Yu Kuai)
 - speed up initial llbitmap sync with write_zeroes_unmap support (Yu Kuai)

 Cleanups:
 - remove the unused static md workqueue definition (Abd-Alrhman Masalkhi)"

* tag 'md-7.1-20260407' of git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux:
  md/raid5: fix soft lockup in retry_aligned_read()
  md: wake raid456 reshape waiters before suspend
  md/raid1: serialize overlap io for writemostly disk
  md/md-llbitmap: optimize initial sync with write_zeroes_unmap support
  md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building
  md: add fallback to correct bitmap_ops on version mismatch
  md/raid5: validate payload size before accessing journal metadata
  md: remove unused static md_wq workqueue
  md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations
  md: fix array_state=clear sysfs deadlock
-rw-r--r-- drivers/md/md-llbitmap.c 202
-rw-r--r-- drivers/md/md.c 139
-rw-r--r-- drivers/md/md.h 5
-rw-r--r-- drivers/md/raid0.c 18
-rw-r--r-- drivers/md/raid1.c 47
-rw-r--r-- drivers/md/raid5-cache.c 48
-rw-r--r-- drivers/md/raid5.c 8
7 files changed, 405 insertions, 62 deletions
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
index cdfecaca216b..9e7e6b1a6f15 100644
--- a/drivers/md/md-llbitmap.c
+++ b/drivers/md/md-llbitmap.c
@@ -208,6 +208,20 @@ enum llbitmap_state {
BitNeedSync,
/* data is synchronizing */
BitSyncing,
+ /*
+ * Proactive sync requested for unwritten region (raid456 only).
+ * Triggered via sysfs when user wants to pre-build XOR parity
+ * for regions that have never been written.
+ */
+ BitNeedSyncUnwritten,
+ /* Proactive sync in progress for unwritten region */
+ BitSyncingUnwritten,
+ /*
+ * XOR parity has been pre-built for a region that has never had
+ * user data written. When user writes to this region, it transitions
+ * to BitDirty.
+ */
+ BitCleanUnwritten,
BitStateCount,
BitNone = 0xff,
};
@@ -232,6 +246,12 @@ enum llbitmap_action {
* BitNeedSync.
*/
BitmapActionStale,
+ /*
+ * Proactive sync trigger for raid456 - builds XOR parity for
+ * Unwritten regions without requiring user data write first.
+ */
+ BitmapActionProactiveSync,
+ BitmapActionClearUnwritten,
BitmapActionCount,
/* Init state is BitUnwritten */
BitmapActionInit,
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitNone,
[BitmapActionStale] = BitNone,
+ [BitmapActionProactiveSync] = BitNeedSyncUnwritten,
+ [BitmapActionClearUnwritten] = BitNone,
},
[BitClean] = {
[BitmapActionStartwrite] = BitDirty,
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitNone,
},
[BitDirty] = {
[BitmapActionStartwrite] = BitNone,
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitClean,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSync] = {
[BitmapActionStartwrite] = BitNone,
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNone,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitNone,
},
[BitSyncing] = {
[BitmapActionStartwrite] = BitNone,
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitNone,
+ },
+ [BitNeedSyncUnwritten] = {
+ [BitmapActionStartwrite] = BitNeedSync,
+ [BitmapActionStartsync] = BitSyncingUnwritten,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitUnwritten,
+ [BitmapActionReload] = BitUnwritten,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitUnwritten,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitUnwritten,
+ },
+ [BitSyncingUnwritten] = {
+ [BitmapActionStartwrite] = BitSyncing,
+ [BitmapActionStartsync] = BitSyncingUnwritten,
+ [BitmapActionEndsync] = BitCleanUnwritten,
+ [BitmapActionAbortsync] = BitUnwritten,
+ [BitmapActionReload] = BitUnwritten,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitUnwritten,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitUnwritten,
+ },
+ [BitCleanUnwritten] = {
+ [BitmapActionStartwrite] = BitDirty,
+ [BitmapActionStartsync] = BitNone,
+ [BitmapActionEndsync] = BitNone,
+ [BitmapActionAbortsync] = BitNone,
+ [BitmapActionReload] = BitNone,
+ [BitmapActionDaemon] = BitNone,
+ [BitmapActionDiscard] = BitUnwritten,
+ [BitmapActionStale] = BitUnwritten,
+ [BitmapActionProactiveSync] = BitNone,
+ [BitmapActionClearUnwritten] = BitUnwritten,
},
};
@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
break;
case BitClean:
+ case BitCleanUnwritten:
pctl->state[pos] = BitDirty;
break;
}
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
}
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
- int offset)
+ int offset, bool infect)
{
struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
unsigned int io_size = llbitmap->io_size;
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
* resync all the dirty bits, hence skip infect new dirty bits to
* prevent resync unnecessary data.
*/
- if (llbitmap->mddev->degraded) {
+ if (llbitmap->mddev->degraded || !infect) {
set_bit(block, pctl->dirty);
return;
}
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
llbitmap->pctl[idx]->state[bit] = state;
if (state == BitDirty || state == BitNeedSync)
- llbitmap_set_page_dirty(llbitmap, idx, bit);
+ llbitmap_set_page_dirty(llbitmap, idx, bit, true);
+ else if (state == BitNeedSyncUnwritten)
+ llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
@@ -585,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
return 0;
}
+/*
+ * Check if all underlying disks support write_zeroes with unmap.
+ */
+static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Issue write_zeroes to all underlying disks to zero their data regions.
+ * This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
+ * Returns true if all disks were successfully zeroed.
+ */
+static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
+{
+ struct mddev *mddev = llbitmap->mddev;
+ struct md_rdev *rdev;
+ sector_t dev_sectors = mddev->dev_sectors;
+ int ret;
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ ret = blkdev_issue_zeroout(rdev->bdev,
+ rdev->data_offset,
+ dev_sectors,
+ GFP_KERNEL, 0);
+ if (ret) {
+ pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
+ rdev->bdev, ret);
+ return false;
+ }
+ }
+
+ return true;
+}
+
static void llbitmap_init_state(struct llbitmap *llbitmap)
{
+ struct mddev *mddev = llbitmap->mddev;
enum llbitmap_state state = BitUnwritten;
unsigned long i;
- if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
+ if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
state = BitClean;
+ } else if (raid_is_456(mddev) &&
+ llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
+ /*
+ * All disks support write_zeroes with unmap. Zero all disks
+ * to ensure parity consistency, then set BitCleanUnwritten
+ * to skip initial sync.
+ */
+ if (llbitmap_zero_all_disks(llbitmap))
+ state = BitCleanUnwritten;
+ }
for (i = 0; i < llbitmap->chunks; i++)
llbitmap_write(llbitmap, state, i);
@@ -627,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
goto write_bitmap;
}
- if (c == BitNeedSync)
+ if (c == BitNeedSync || c == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
state = state_machine[c][action];
-
write_bitmap:
if (unlikely(mddev->degraded)) {
/* For degraded array, mark new data as need sync. */
@@ -658,8 +786,7 @@ write_bitmap:
}
llbitmap_write(llbitmap, state, start);
-
- if (state == BitNeedSync)
+ if (state == BitNeedSync || state == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
else if (state == BitDirty &&
!timer_pending(&llbitmap->pending_timer))
@@ -1229,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state c = llbitmap_read(llbitmap, p);
- return c == BitClean || c == BitDirty;
+ return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
}
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
@@ -1243,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
if (c == BitUnwritten)
return blocks;
+ /* Skip CleanUnwritten - no user data, will be reset after recovery */
+ if (c == BitCleanUnwritten)
+ return blocks;
+
/* For degraded array, don't skip */
if (mddev->degraded)
return 0;
@@ -1261,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
{
struct llbitmap *llbitmap = mddev->bitmap;
unsigned long p = offset >> llbitmap->chunkshift;
+ enum llbitmap_state state;
+
+ /*
+ * Before recovery starts, convert CleanUnwritten to Unwritten.
+ * This ensures the new disk won't have stale parity data.
+ */
+ if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionClearUnwritten);
+
/*
* Handle one bit at a time, this is much simpler. And it doesn't matter
* if md_do_sync() loop more times.
*/
*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
- return llbitmap_state_machine(llbitmap, p, p,
- BitmapActionStartsync) == BitSyncing;
+ state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
+ return state == BitSyncing || state == BitSyncingUnwritten;
}
/* Something is wrong, sync_thread stop at @offset */
@@ -1474,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
}
mutex_unlock(&mddev->bitmap_info.mutex);
- return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
+ return sprintf(page,
+ "unwritten %d\nclean %d\ndirty %d\n"
+ "need sync %d\nsyncing %d\n"
+ "need sync unwritten %d\nsyncing unwritten %d\n"
+ "clean unwritten %d\n",
bits[BitUnwritten], bits[BitClean], bits[BitDirty],
- bits[BitNeedSync], bits[BitSyncing]);
+ bits[BitNeedSync], bits[BitSyncing],
+ bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
+ bits[BitCleanUnwritten]);
}
static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
@@ -1549,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
+static ssize_t
+proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct llbitmap *llbitmap;
+
+ /* Only for RAID-456 */
+ if (!raid_is_456(mddev))
+ return -EINVAL;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ llbitmap = mddev->bitmap;
+ if (!llbitmap || !llbitmap->pctl) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return -ENODEV;
+ }
+
+ /* Trigger proactive sync on all Unwritten regions */
+ llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
+ BitmapActionProactiveSync);
+
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return len;
+}
+
+static struct md_sysfs_entry llbitmap_proactive_sync =
+ __ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
+
static struct attribute *md_llbitmap_attrs[] = {
&llbitmap_bits.attr,
&llbitmap_metadata.attr,
&llbitmap_daemon_sleep.attr,
&llbitmap_barrier_idle.attr,
+ &llbitmap_proactive_sync.attr,
NULL
};
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 521d9b34cd9e..5fb5ae8368ba 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
-static struct workqueue_struct *md_wq;
/*
* This workqueue is used for sync_work to register new sync_thread, and for
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
spin_lock_init(&serial_tmp->serial_lock);
serial_tmp->serial_rb = RB_ROOT_CACHED;
- init_waitqueue_head(&serial_tmp->serial_io_wait);
}
rdev->serial = serial;
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
}
percpu_ref_kill(&mddev->active_io);
+
+ /*
+ * RAID456 IO can sleep in wait_for_reshape while still holding an
+ * active_io reference. If reshape is already interrupted or frozen,
+ * wake those waiters so they can abort and drop the reference instead
+ * of deadlocking suspend.
+ */
+ if (mddev->pers && mddev->pers->prepare_suspend &&
+ reshape_interrupted(mddev))
+ mddev->pers->prepare_suspend(mddev);
+
if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io));
@@ -6130,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
- mddev_put(mddev);
+ /*
+ * For "array_state=clear", dropping the extra kobject reference from
+ * sysfs_break_active_protection() can trigger md kobject deletion.
+ * Restore active protection before mddev_put() so deletion happens
+ * after the sysfs write path fully unwinds.
+ */
if (kn)
sysfs_unbreak_active_protection(kn);
+ mddev_put(mddev);
return rv;
}
@@ -6449,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
+/*
+ * Read bitmap superblock and return the bitmap_id based on disk version.
+ * This is used as fallback when default bitmap version and on-disk version
+ * don't match, and mdadm is not the latest version to set bitmap_type.
+ */
+static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct page *sb_page;
+ bitmap_super_t *sb;
+ enum md_submodule_id id = ID_BITMAP_NONE;
+ sector_t sector;
+ u32 version;
+
+ if (!mddev->bitmap_info.offset)
+ return ID_BITMAP_NONE;
+
+ sb_page = alloc_page(GFP_KERNEL);
+ if (!sb_page) {
+ pr_warn("md: %s: failed to allocate memory for bitmap\n",
+ mdname(mddev));
+ return ID_BITMAP_NONE;
+ }
+
+ sector = mddev->bitmap_info.offset;
+
+ rdev_for_each(rdev, mddev) {
+ u32 iosize;
+
+ if (!test_bit(In_sync, &rdev->flags) ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Bitmap_sync, &rdev->flags))
+ continue;
+
+ iosize = roundup(sizeof(bitmap_super_t),
+ bdev_logical_block_size(rdev->bdev));
+ if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
+ true))
+ goto read_ok;
+ }
+ pr_warn("md: %s: failed to read bitmap from any device\n",
+ mdname(mddev));
+ goto out;
+
+read_ok:
+ sb = kmap_local_page(sb_page);
+ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
+ pr_warn("md: %s: invalid bitmap magic 0x%x\n",
+ mdname(mddev), le32_to_cpu(sb->magic));
+ goto out_unmap;
+ }
+
+ version = le32_to_cpu(sb->version);
+ switch (version) {
+ case BITMAP_MAJOR_LO:
+ case BITMAP_MAJOR_HI:
+ case BITMAP_MAJOR_CLUSTERED:
+ id = ID_BITMAP;
+ break;
+ case BITMAP_MAJOR_LOCKLESS:
+ id = ID_LLBITMAP;
+ break;
+ default:
+ pr_warn("md: %s: unknown bitmap version %u\n",
+ mdname(mddev), version);
+ break;
+ }
+
+out_unmap:
+ kunmap_local(sb);
+out:
+ __free_page(sb_page);
+ return id;
+}
+
static int md_bitmap_create(struct mddev *mddev)
{
+ enum md_submodule_id orig_id = mddev->bitmap_id;
+ enum md_submodule_id sb_id;
+ int err;
+
if (mddev->bitmap_id == ID_BITMAP_NONE)
return -EINVAL;
if (!mddev_set_bitmap_ops(mddev))
return -ENOENT;
- return mddev->bitmap_ops->create(mddev);
+ err = mddev->bitmap_ops->create(mddev);
+ if (!err)
+ return 0;
+
+ /*
+ * Create failed, if default bitmap version and on-disk version
+ * don't match, and mdadm is not the latest version to set
+ * bitmap_type, set bitmap_ops based on the disk version.
+ */
+ mddev_clear_bitmap_ops(mddev);
+
+ sb_id = md_bitmap_get_id_from_sb(mddev);
+ if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
+ return err;
+
+ pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
+ mdname(mddev), orig_id, sb_id);
+
+ mddev->bitmap_id = sb_id;
+ if (!mddev_set_bitmap_ops(mddev)) {
+ mddev->bitmap_id = orig_id;
+ return -ENOENT;
+ }
+
+ err = mddev->bitmap_ops->create(mddev);
+ if (err) {
+ mddev_clear_bitmap_ops(mddev);
+ mddev->bitmap_id = orig_id;
+ }
+
+ return err;
}
static void md_bitmap_destroy(struct mddev *mddev)
@@ -10505,10 +10629,6 @@ static int __init md_init(void)
goto err_bitmap;
ret = -ENOMEM;
- md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
- if (!md_wq)
- goto err_wq;
-
md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
if (!md_misc_wq)
goto err_misc_wq;
@@ -10533,8 +10653,6 @@ err_mdp:
err_md:
destroy_workqueue(md_misc_wq);
err_misc_wq:
- destroy_workqueue(md_wq);
-err_wq:
md_llbitmap_exit();
err_bitmap:
md_bitmap_exit();
@@ -10843,7 +10961,6 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
- destroy_workqueue(md_wq);
md_bitmap_exit();
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ac84289664cd..d6f5482e2479 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -126,7 +126,6 @@ enum sync_action {
struct serial_in_rdev {
struct rb_root_cached serial_rb;
spinlock_t serial_lock;
- wait_queue_head_t serial_io_wait;
};
/*
@@ -381,7 +380,11 @@ struct serial_info {
struct rb_node node;
sector_t start; /* start sector of rb node */
sector_t last; /* end sector of rb node */
+ sector_t wnode_start; /* address of waiting nodes on the same list */
sector_t _subtree_last; /* highest sector in subtree of rb node */
+ struct list_head list_node;
+ struct list_head waiters;
+ struct completion ready;
};
/*
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ef0045db409f..5e38a51e349a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
- conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
+ conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
if (!conf->strip_zone)
goto abort;
- conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
- conf->nr_strip_zones,
- mddev->raid_disks),
- GFP_KERNEL);
+ conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
+ conf->nr_strip_zones,
+ mddev->raid_disks),
+ GFP_KERNEL);
if (!conf->devlist)
goto abort;
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
return 0;
abort:
- kfree(conf->strip_zone);
- kfree(conf->devlist);
+ kvfree(conf->strip_zone);
+ kvfree(conf->devlist);
kfree(conf);
*private_conf = ERR_PTR(err);
return err;
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
{
struct r0conf *conf = priv;
- kfree(conf->strip_zone);
- kfree(conf->devlist);
+ kvfree(conf->strip_zone);
+ kvfree(conf->devlist);
kfree(conf);
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 16f671ab12c0..ba91f7e61920 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
- struct serial_info *si, int idx)
+ struct serial_info *si)
{
unsigned long flags;
int ret = 0;
sector_t lo = r1_bio->sector;
sector_t hi = lo + r1_bio->sectors - 1;
+ int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
+ struct serial_info *head_si;
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
- if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+ head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+ if (head_si && head_si != si) {
+ si->start = lo;
+ si->last = hi;
+ si->wnode_start = head_si->wnode_start;
+ list_add_tail(&si->list_node, &head_si->waiters);
ret = -EBUSY;
- else {
+ } else if (!head_si) {
si->start = lo;
si->last = hi;
+ si->wnode_start = si->start;
raid1_rb_insert(si, &serial->serial_rb);
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
struct mddev *mddev = rdev->mddev;
struct serial_info *si;
- int idx = sector_to_idx(r1_bio->sector);
- struct serial_in_rdev *serial = &rdev->serial[idx];
if (WARN_ON(!mddev->serial_info_pool))
return;
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
- wait_event(serial->serial_io_wait,
- check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+ INIT_LIST_HEAD(&si->waiters);
+ INIT_LIST_HEAD(&si->list_node);
+ init_completion(&si->ready);
+ while (check_and_add_serial(rdev, r1_bio, si)) {
+ wait_for_completion(&si->ready);
+ reinit_completion(&si->ready);
+ }
}
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
- struct serial_info *si;
+ struct serial_info *si, *iter_si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
si; si = raid1_rb_iter_next(si, lo, hi)) {
if (si->start == lo && si->last == hi) {
- raid1_rb_remove(si, &serial->serial_rb);
- mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
}
- if (!found)
+ if (found) {
+ raid1_rb_remove(si, &serial->serial_rb);
+ if (!list_empty(&si->waiters)) {
+ list_for_each_entry(iter_si, &si->waiters, list_node) {
+ if (iter_si->wnode_start == si->wnode_start) {
+ list_del_init(&iter_si->list_node);
+ list_splice_init(&si->waiters, &iter_si->waiters);
+ raid1_rb_insert(iter_si, &serial->serial_rb);
+ complete(&iter_si->ready);
+ break;
+ }
+ }
+ }
+ mempool_free(si, mddev->serial_info_pool);
+ } else {
WARN(1, "The write IO is not recorded for serialization\n");
+ }
spin_unlock_irqrestore(&serial->serial_lock, flags);
- wake_up(&serial->serial_io_wait);
}
/*
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 66b10cbda96d..7b7546bfa21f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
return -ENOMEM;
while (mb_offset < le32_to_cpu(mb->meta_size)) {
+ sector_t payload_len;
+
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+ payload_len = sizeof(struct r5l_payload_data_parity) +
+ (sector_t)sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+ goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
+ payload_len = sizeof(struct r5l_payload_data_parity) +
+ (sector_t)sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+ goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
@@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
payload->checksum[1]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
- /* nothing to do for R5LOG_PAYLOAD_FLUSH here */
+ payload_len = sizeof(struct r5l_payload_flush) +
+ (sector_t)le32_to_cpu(payload_flush->size);
+ if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+ goto mismatch;
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
- mb_offset += sizeof(struct r5l_payload_flush) +
- le32_to_cpu(payload_flush->size);
- } else {
- /* DATA or PARITY payload */
+ if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) {
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
- mb_offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
-
+ mb_offset += payload_len;
}
put_page(page);
@@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (mb_offset < le32_to_cpu(mb->meta_size)) {
+ sector_t payload_len;
int dd;
payload = (void *)mb + mb_offset;
@@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
int i, count;
+ payload_len = sizeof(struct r5l_payload_flush) +
+ (sector_t)le32_to_cpu(payload_flush->size);
+ if (mb_offset + payload_len >
+ le32_to_cpu(mb->meta_size))
+ return -EINVAL;
+
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
for (i = 0; i < count; ++i) {
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
@@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
}
- mb_offset += sizeof(struct r5l_payload_flush) +
- le32_to_cpu(payload_flush->size);
+ mb_offset += payload_len;
continue;
}
/* DATA or PARITY payload */
+ payload_len = sizeof(struct r5l_payload_data_parity) +
+ (sector_t)sizeof(__le32) *
+ (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
+ return -EINVAL;
+
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
@@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
- mb_offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+ mb_offset += payload_len;
}
return 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1f8360d4cdb7..6e79829c5acb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
- raid5_release_stripe(sh);
+ int hash;
+
+ spin_lock_irq(&conf->device_lock);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh,
+ &conf->temp_inactive_list[hash]);
+ spin_unlock_irq(&conf->device_lock);
conf->retry_read_aligned = raid_bio;
conf->retry_read_offset = scnt;
return handled;