Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--   drivers/md/raid10.c   434
1 file changed, 377 insertions, 57 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a9ae7d113492..13f5e6b2a73d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	int size = offsetof(struct r10bio, devs[conf->copies]);
+	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
 
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
@@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int i;
 
-	for (i = 0; i < conf->copies; i++) {
+	for (i = 0; i < conf->geo.raid_disks; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
@@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 	int slot;
 	int repl = 0;
 
-	for (slot = 0; slot < conf->copies; slot++) {
+	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
 		if (r10_bio->devs[slot].repl_bio == bio) {
@@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 		}
 	}
 
-	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
@@ -1274,12 +1273,77 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	}
 }
 
+static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
+{
+	int i;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *blocked_rdev;
+
+retry_wait:
+	blocked_rdev = NULL;
+	rcu_read_lock();
+	for (i = 0; i < conf->copies; i++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[i].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
+		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+			atomic_inc(&rdev->nr_pending);
+			blocked_rdev = rdev;
+			break;
+		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
+
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			sector_t dev_sector = r10_bio->devs[i].addr;
+			int bad_sectors;
+			int is_bad;
+
+			/*
+			 * A discard request doesn't care about the write
+			 * result, so it doesn't need to wait for a blocked disk here.
+			 */
+			if (!r10_bio->sectors)
+				continue;
+
+			is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/*
+				 * Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	if (unlikely(blocked_rdev)) {
+		/* Have to wait for this device to get unblocked, then retry */
+		allow_barrier(conf);
+		raid10_log(conf->mddev, "%s wait rdev %d blocked",
+				__func__, blocked_rdev->raid_disk);
+		md_wait_for_blocked_rdev(blocked_rdev, mddev);
+		wait_barrier(conf);
+		goto retry_wait;
+	}
+}
+
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				 struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int i;
-	struct md_rdev *blocked_rdev;
 	sector_t sectors;
 	int max_sectors;
 
@@ -1337,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
-retry_write:
-	blocked_rdev = NULL;
+
+	wait_blocked_dev(mddev, r10_bio);
+
 	rcu_read_lock();
 	max_sectors = r10_bio->sectors;
 
@@ -1349,16 +1414,6 @@ retry_write:
 			conf->mirrors[d].replacement);
 		if (rdev == rrdev)
 			rrdev = NULL;
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			atomic_inc(&rdev->nr_pending);
-			blocked_rdev = rdev;
-			break;
-		}
-		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
-			atomic_inc(&rrdev->nr_pending);
-			blocked_rdev = rrdev;
-			break;
-		}
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1379,15 +1434,6 @@ retry_write:
 
 			is_bad = is_badblock(rdev, dev_sector, max_sectors,
 					     &first_bad, &bad_sectors);
-			if (is_bad < 0) {
-				/* Mustn't write here until the bad block
-				 * is acknowledged
-				 */
-				atomic_inc(&rdev->nr_pending);
-				set_bit(BlockedBadBlocks, &rdev->flags);
-				blocked_rdev = rdev;
-				break;
-			}
 			if (is_bad && first_bad <= dev_sector) {
 				/* Cannot write here at all */
 				bad_sectors -= (dev_sector - first_bad);
@@ -1423,35 +1469,6 @@ retry_write:
 	}
 	rcu_read_unlock();
 
-	if (unlikely(blocked_rdev)) {
-		/* Have to wait for this device to get unblocked, then retry */
-		int j;
-		int d;
-
-		for (j = 0; j < i; j++) {
-			if (r10_bio->devs[j].bio) {
-				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
-			}
-			if (r10_bio->devs[j].repl_bio) {
-				struct md_rdev *rdev;
-				d = r10_bio->devs[j].devnum;
-				rdev = conf->mirrors[d].replacement;
-				if (!rdev) {
-					/* Race with remove_disk */
-					smp_mb();
-					rdev = conf->mirrors[d].rdev;
-				}
-				rdev_dec_pending(rdev, mddev);
-			}
-		}
-		allow_barrier(conf);
-		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
-		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
-		goto retry_write;
-	}
-
 	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
 
@@ -1492,7 +1509,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 	r10_bio->read_slot = -1;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
+			conf->geo.raid_disks);
 
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
@@ -1500,6 +1518,304 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		raid10_write_request(mddev, bio, r10_bio);
 }
 
+static void raid_end_discard_bio(struct r10bio *r10bio)
+{
+	struct r10conf *conf = r10bio->mddev->private;
+	struct r10bio *first_r10bio;
+
+	while (atomic_dec_and_test(&r10bio->remaining)) {
+
+		allow_barrier(conf);
+
+		if (!test_bit(R10BIO_Discard, &r10bio->state)) {
+			first_r10bio = (struct r10bio *)r10bio->master_bio;
+			free_r10bio(r10bio);
+			r10bio = first_r10bio;
+		} else {
+			md_write_end(r10bio->mddev);
+			bio_endio(r10bio->master_bio);
+			free_r10bio(r10bio);
+			break;
+		}
+	}
+}
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
+	struct md_rdev *rdev = NULL;
+	int dev;
+	int slot, repl;
+
+	/*
+	 * We don't care about the return value of a discard bio
+	 */
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		/*
+		 * raid10_remove_disk uses smp_mb to make sure rdev is set to
+		 * replacement before setting replacement to NULL. We can read
+		 * rdev here without full barrier protection even if replacement is NULL.
+		 */
+		smp_rmb();
+		rdev = conf->mirrors[dev].rdev;
+	}
+
+	raid_end_discard_bio(r10_bio);
+	rdev_dec_pending(rdev, conf->mddev);
+}
+
+/*
+ * There are some limitations on handling discard bios:
+ * 1st, the discard size must be bigger than stripe_size*2.
+ * 2nd, if the discard bio spans reshape progress, we use the old way to
+ * handle the discard bio.
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	struct geom *geo = &conf->geo;
+	int far_copies = geo->far_copies;
+	bool first_copy = true;
+	struct r10bio *r10_bio, *first_r10bio;
+	struct bio *split;
+	int disk;
+	sector_t chunk;
+	unsigned int stripe_size;
+	unsigned int stripe_data_disks;
+	sector_t split_size;
+	sector_t bio_start, bio_end;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int remainder;
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return -EAGAIN;
+
+	wait_barrier(conf);
+
+	/*
+	 * Check reshape again to avoid a reshape happening after checking
+	 * MD_RECOVERY_RESHAPE and before wait_barrier.
+	 */
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		goto out;
+
+	if (geo->near_copies)
+		stripe_data_disks = geo->raid_disks / geo->near_copies +
+					geo->raid_disks % geo->near_copies;
+	else
+		stripe_data_disks = geo->raid_disks;
+
+	stripe_size = stripe_data_disks << geo->chunk_shift;
+
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/*
+	 * A discard bio may be smaller than the stripe size, or cross one
+	 * stripe while its discard region is larger than one stripe. For the
+	 * far offset layout, if the discard region is not aligned with the
+	 * stripe size, there are holes when we submit the discard bios to the
+	 * member disks. For simplicity, we only handle discard bios whose
+	 * discard region is bigger than stripe_size * 2.
+	 */
+	if (bio_sectors(bio) < stripe_size*2)
+		goto out;
+
+	/*
+	 * Keep the bio aligned with the stripe size.
+	 */
+	div_u64_rem(bio_start, stripe_size, &remainder);
+	if (remainder) {
+		split_size = stripe_size - remainder;
+		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+		bio_chain(split, bio);
+		allow_barrier(conf);
+		/* Resend the first split part */
+		submit_bio_noacct(split);
+		wait_barrier(conf);
+	}
+	div_u64_rem(bio_end, stripe_size, &remainder);
+	if (remainder) {
+		split_size = bio_sectors(bio) - remainder;
+		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+		bio_chain(split, bio);
+		allow_barrier(conf);
+		/* Resend the second split part */
+		submit_bio_noacct(bio);
+		bio = split;
+		wait_barrier(conf);
+	}
+
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/*
+	 * RAID10 uses the chunk as the unit to store data, similar to RAID0.
+	 * One stripe contains the chunks from all member disks (one chunk from
+	 * one disk at the same HBA address). For layout details, see 'man md 4'.
+	 */
+	chunk = bio_start >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	first_stripe_index = chunk;
+	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		first_stripe_index *= geo->far_copies;
+	start_disk_offset = (bio_start & geo->chunk_mask) +
+				(first_stripe_index << geo->chunk_shift);
+
+	chunk = bio_end >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	last_stripe_index = chunk;
+	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		last_stripe_index *= geo->far_copies;
+	end_disk_offset = (bio_end & geo->chunk_mask) +
+				(last_stripe_index << geo->chunk_shift);
+
+retry_discard:
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+	r10_bio->mddev = mddev;
+	r10_bio->state = 0;
+	r10_bio->sectors = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+	wait_blocked_dev(mddev, r10_bio);
+
+	/*
+	 * The far layout needs more than one r10bio to cover all regions.
+	 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
+	 * to record the discard bio. The other r10bios' master_bio records the
+	 * first r10bio. The first r10bio is only released after all the other
+	 * r10bios finish. The discard bio returns only when the first r10bio finishes.
+	 */
+	if (first_copy) {
+		r10_bio->master_bio = bio;
+		set_bit(R10BIO_Discard, &r10_bio->state);
+		first_copy = false;
+		first_r10bio = r10_bio;
+	} else
+		r10_bio->master_bio = (struct bio *)first_r10bio;
+
+	rcu_read_lock();
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		r10_bio->devs[disk].bio = NULL;
+		r10_bio->devs[disk].repl_bio = NULL;
+
+		if (rdev && (test_bit(Faulty, &rdev->flags)))
+			rdev = NULL;
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+			rrdev = NULL;
+		if (!rdev && !rrdev)
+			continue;
+
+		if (rdev) {
+			r10_bio->devs[disk].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
+		if (rrdev) {
+			r10_bio->devs[disk].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
+	}
+	rcu_read_unlock();
+
+	atomic_set(&r10_bio->remaining, 1);
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *mbio, *rbio = NULL;
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		/*
+		 * Now calculate the start and end addresses for each disk.
+		 * The space between dev_start and dev_end is the discard region.
+		 *
+		 * For dev_start, three conditions need to be considered:
+		 * 1st, the disk is before start_disk: think of the disk as being
+		 * in the next stripe, so dev_start is the start address of the
+		 * next stripe.
+		 * 2nd, the disk is after start_disk: the disk is in the same
+		 * stripe as the first disk.
+		 * 3rd, the first disk itself: start_disk_offset can be used directly.
+		 */
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		/*
+		 * Only discard bios whose size is >= the stripe size are
+		 * handled, so dev_end > dev_start all the time.
+		 */
+		if (r10_bio->devs[disk].bio) {
+			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			mbio->bi_end_io = raid10_end_discard_request;
+			mbio->bi_private = r10_bio;
+			r10_bio->devs[disk].bio = mbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rdev, mbio,
+					dev_start + choose_data_offset(r10_bio, rdev),
+					dev_end - dev_start);
+			bio_endio(mbio);
+		}
+		if (r10_bio->devs[disk].repl_bio) {
+			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			rbio->bi_end_io = raid10_end_discard_request;
+			rbio->bi_private = r10_bio;
+			r10_bio->devs[disk].repl_bio = rbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rrdev, rbio,
+					dev_start + choose_data_offset(r10_bio, rrdev),
+					dev_end - dev_start);
+			bio_endio(rbio);
+		}
+	}
+
+	if (!geo->far_offset && --far_copies) {
+		first_stripe_index += geo->stride >> geo->chunk_shift;
+		start_disk_offset += geo->stride;
+		last_stripe_index += geo->stride >> geo->chunk_shift;
+		end_disk_offset += geo->stride;
+		atomic_inc(&first_r10bio->remaining);
+		raid_end_discard_bio(r10_bio);
+		wait_barrier(conf);
+		goto retry_discard;
+	}
+
+	raid_end_discard_bio(r10_bio);
+
+	return 0;
+out:
+	allow_barrier(conf);
+	return -EAGAIN;
+}
+
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
@@ -1514,6 +1830,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	if (!md_write_start(mddev, bio))
 		return false;
 
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+		if (!raid10_handle_discard(mddev, bio))
+			return true;
+
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
@@ -3753,7 +4073,7 @@ static int raid10_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
-					      mddev->chunk_sectors);
+					      UINT_MAX);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
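As a reference for the address arithmetic in raid10_handle_discard above, the following standalone sketch reproduces the mapping from the logical start sector of a discard range to a (disk index, offset on that disk) pair. It is illustrative only and is not part of the patch: the geo_example struct, the map_start() helper, and the use of plain division and modulo in place of the kernel's sector_div() are assumptions made for the example.

/*
 * Illustrative sketch (not kernel code): how raid10_handle_discard maps the
 * logical start sector of a discard range to a (disk index, disk offset)
 * pair.  geo_example is a simplified stand-in for the driver's struct geom.
 */
#include <stdint.h>
#include <stdio.h>

struct geo_example {
	unsigned int raid_disks;   /* member disks in the array              */
	unsigned int near_copies;  /* copies laid out on adjacent disks      */
	unsigned int far_copies;   /* copies laid out far apart on each disk */
	int far_offset;            /* far copies follow each chunk if set    */
	unsigned int chunk_shift;  /* log2(chunk size in sectors)            */
	uint64_t chunk_mask;       /* (1 << chunk_shift) - 1                 */
};

static void map_start(const struct geo_example *geo, uint64_t bio_start,
		      unsigned int *disk_index, uint64_t *disk_offset)
{
	/* Same arithmetic as the patch, with '/' and '%' replacing sector_div() */
	uint64_t chunk = bio_start >> geo->chunk_shift;
	uint64_t stripe_index;

	chunk *= geo->near_copies;
	stripe_index = chunk / geo->raid_disks;
	*disk_index = chunk % geo->raid_disks;
	if (geo->far_offset)
		stripe_index *= geo->far_copies;
	*disk_offset = (bio_start & geo->chunk_mask) +
		       (stripe_index << geo->chunk_shift);
}

int main(void)
{
	/* 4 disks, near_copies=2 (the default "n2" layout), 1024-sector chunks */
	struct geo_example geo = {
		.raid_disks = 4, .near_copies = 2, .far_copies = 1,
		.far_offset = 0, .chunk_shift = 10, .chunk_mask = (1 << 10) - 1,
	};
	unsigned int disk;
	uint64_t offset;

	map_start(&geo, 5000, &disk, &offset);
	printf("sector 5000 -> disk %u, offset %llu\n",
	       disk, (unsigned long long)offset);
	return 0;
}

With this geometry, sector 5000 lands on disk 0 at offset 2952: it lies in chunk 4, near_copies=2 doubles that to logical chunk 8, which falls in stripe 2 on disk 0, and the in-chunk remainder 904 plus 2 << 10 gives 2952.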
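The stripe-size calculation and the alignment splitting can likewise be illustrated outside the kernel. The sketch below is a simplified model, not driver code: stripe_sectors() and the head/tail arithmetic are assumptions for the example, standing in for the div_u64_rem()/bio_split() sequence in the patch, and the 2 * stripe check mirrors the bio_sectors(bio) < stripe_size * 2 fallback.

/*
 * Simplified model (not kernel code) of how raid10_handle_discard trims a
 * discard range to stripe boundaries.  In the driver the head and tail are
 * split off with bio_split() and resubmitted through the normal write path;
 * here we only compute the boundaries.
 */
#include <stdint.h>
#include <stdio.h>

/* stripe size in sectors = data chunks per stripe * chunk size in sectors */
static uint64_t stripe_sectors(unsigned int raid_disks,
			       unsigned int near_copies,
			       unsigned int chunk_sectors)
{
	unsigned int data_disks = raid_disks / near_copies +
				  raid_disks % near_copies;
	return (uint64_t)data_disks * chunk_sectors;
}

int main(void)
{
	/* 4 disks, near_copies=2, 1024-sector (512 KiB) chunks */
	uint64_t stripe = stripe_sectors(4, 2, 1024);	/* 2048 sectors */
	uint64_t start = 1000, end = 10000;		/* discard range [start, end) */
	uint64_t head, tail;

	if (end - start < 2 * stripe) {
		/* Too small: the driver falls back to the regular write path */
		printf("range too small for the discard fast path\n");
		return 0;
	}

	/* Sectors split off the front and back to reach stripe boundaries */
	head = (stripe - start % stripe) % stripe;
	tail = end % stripe;

	printf("head %llu sectors, aligned [%llu, %llu), tail %llu sectors\n",
	       (unsigned long long)head,
	       (unsigned long long)(start + head),
	       (unsigned long long)(end - tail),
	       (unsigned long long)tail);
	return 0;
}

Splitting the unaligned head and tail off first means the remaining region starts and ends on a stripe boundary, so each member disk can receive one contiguous discard per far copy instead of a series of chunk-sized pieces.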
