md/raid5: Improve performance for sequential IO

commit fc05e06e6098ca2c28f7a10da0e00aeea20fa59e upstream. Commit 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()") changed the order in which requests for underlying disks are created. Since for large sequential IO adding of requests frequently races with md_raid5 thread submitting bios to underlying disks, this results in a change in IO pattern because intermediate states of new order of request creation result in more smaller discontiguous requests. For RAID5 on top of three rotational disks our performance testing revealed this results in regression in write throughput: iozone -a -s 131072000 -y 4 -q 8 -i 0 -i 1 -R before 7e55c60acfbb: KB reclen write rewrite read reread 131072000 4 493670 525964 524575 513384 131072000 8 540467 532880 512028 513703 after 7e55c60acfbb: KB reclen write rewrite read reread 131072000 4 421785 456184 531278 509248 131072000 8 459283 456354 528449 543834 To reduce the amount of discontiguous requests we can start generating requests with the stripe with the lowest chunk offset as that has the best chance of being adjacent to IO queued previously. This improves the performance to: KB reclen write rewrite read reread 131072000 4 497682 506317 518043 514559 131072000 8 514048 501886 506453 504319 restoring big part of the regression. Fixes: 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Jan Kara <jack@suse.cz> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20230417171537.17899-1-jack@suse.cz Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
author: Jan Kara <jack@suse.cz> 2023-04-17 20:15:37 +0300
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2023-05-11 17:03:39 +0300
commit: 507fbfa79acb8cf92353ef14ad4cc74fb206bdfc (patch)
tree: b741e97bcd251fefe58f3449405afd9bd3838338
parent: b50fd1c3d9d0175aa29ff2706ef36cc178bc356a (diff)
download: linux-507fbfa79acb8cf92353ef14ad4cc74fb206bdfc.tar.xz
1 files changed, 44 insertions, 1 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b820b81d8c2..f787c9e5b10e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6079,6 +6079,38 @@ out_release:
 	return ret;
 }
 
+/*
+ * If the bio covers multiple data disks, find sector within the bio that has
+ * the lowest chunk offset in the first chunk.
+ */
+static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
+					      struct bio *bi)
+{
+	int sectors_per_chunk = conf->chunk_sectors;
+	int raid_disks = conf->raid_disks;
+	int dd_idx;
+	struct stripe_head sh;
+	unsigned int chunk_offset;
+	sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
+	sector_t sector;
+
+	/* We pass in fake stripe_head to get back parity disk numbers */
+	sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh);
+	chunk_offset = sector_div(sector, sectors_per_chunk);
+	if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
+		return r_sector;
+	/*
+	 * Bio crosses to the next data disk. Check whether it's in the same
+	 * chunk.
+	 */
+	dd_idx++;
+	while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx)
+		dd_idx++;
+	if (dd_idx >= raid_disks)
+		return r_sector;
+	return r_sector + sectors_per_chunk - chunk_offset;
+}
+
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -6150,6 +6182,17 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	md_account_bio(mddev, &bi);
 
+	/*
+	 * Lets start with the stripe with the lowest chunk offset in the first
+	 * chunk. That has the best chances of creating IOs adjacent to
+	 * previous IOs in case of sequential IO and thus creates the most
+	 * sequential IO pattern. We don't bother with the optimization when
+	 * reshaping as the performance benefit is not worth the complexity.
+	 */
+	if (likely(conf->reshape_progress == MaxSector))
+		logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
+	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
+
 	add_wait_queue(&conf->wait_for_overlap, &wait);
 	while (1) {
 		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
@@ -6178,7 +6221,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 			continue;
 		}
 
-		s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
+		s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
 		if (s == stripe_cnt)
 			break;
author	Jan Kara <jack@suse.cz>	2023-04-17 20:15:37 +0300
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2023-05-11 17:03:39 +0300
commit	507fbfa79acb8cf92353ef14ad4cc74fb206bdfc (patch)
tree	b741e97bcd251fefe58f3449405afd9bd3838338
parent	b50fd1c3d9d0175aa29ff2706ef36cc178bc356a (diff)
download	linux-507fbfa79acb8cf92353ef14ad4cc74fb206bdfc.tar.xz