 Documentation/device-mapper/cache.txt               |  11
 Documentation/device-mapper/thin-provisioning.txt   |  34
 drivers/md/Kconfig                                  |  10
 drivers/md/dm-cache-policy-mq.c                     |   4
 drivers/md/dm-snap-persistent.c                     |   3
 drivers/md/dm-thin-metadata.c                       |  37
 drivers/md/dm-thin-metadata.h                       |  11
 drivers/md/dm-thin.c                                | 304
 drivers/md/persistent-data/Kconfig                  |  10
 drivers/md/persistent-data/dm-space-map-metadata.c  | 113
 10 files changed, 425 insertions(+), 112 deletions(-)
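The dm-thin.c hunks below replace the old requeue-on-postsuspend behaviour with a synchronous hand-off: thin_presuspend() queues a stack-allocated noflush_work on the pool's workqueue and then sleeps until the worker thread has set requeue_mode and requeued the in-flight bios. The sketch below is a minimal userspace model of that pattern, with a pthread mutex/condvar pair standing in for the kernel's workqueue, atomic_t and wait-queue APIs; the names mirror the patch, but this is an illustration, not the kernel implementation.

/*
 * Userspace model of the noflush_work() hand-off added to dm-thin.c
 * below.  Build with: cc sketch.c -lpthread
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct noflush_work {
	void (*fn)(struct noflush_work *w);
	bool complete;			/* atomic_t in the kernel version */
	pthread_mutex_t lock;
	pthread_cond_t wait;		/* wait_queue_head_t stand-in */
};

static void complete_noflush_work(struct noflush_work *w)
{
	pthread_mutex_lock(&w->lock);
	w->complete = true;		/* atomic_set(&w->complete, 1) */
	pthread_cond_signal(&w->wait);	/* wake_up(&w->wait) */
	pthread_mutex_unlock(&w->lock);
}

static void do_noflush_start(struct noflush_work *w)
{
	/* Kernel version: w->tc->requeue_mode = true; requeue_io(w->tc); */
	puts("worker: requeue_mode = true; requeue_io()");
	complete_noflush_work(w);
}

/* The "workqueue": runs the function from a different thread. */
static void *worker_thread(void *arg)
{
	struct noflush_work *w = arg;
	w->fn(w);
	return NULL;
}

/* Kernel analogue: INIT_WORK + queue_work + wait_event. */
static void noflush_work(void (*fn)(struct noflush_work *))
{
	pthread_t worker;
	struct noflush_work w = {
		.fn = fn,
		.complete = false,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};

	pthread_create(&worker, NULL, worker_thread, &w);

	/* wait_event(w.wait, atomic_read(&w.complete)) */
	pthread_mutex_lock(&w.lock);
	while (!w.complete)
		pthread_cond_wait(&w.wait, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(worker, NULL);
	/* Safe to let 'w' go out of scope: the worker has finished. */
}

int main(void)
{
	noflush_work(do_noflush_start);
	puts("presuspend: all in-flight bios requeued");
	return 0;
}

The stack allocation works because the caller blocks until complete is set, so the work item cannot outlive the frame it lives in and no allocation/free pairing is needed.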
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index e6b72d355151..68c0f517c60e 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -124,12 +124,11 @@ the default being 204800 sectors (or 100MB).
 
 Updating on-disk metadata
 -------------------------
-On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
-written.  If no such requests are made then commits will occur every
-second.  This means the cache behaves like a physical disk that has a
-write cache (the same is true of the thin-provisioning target).  If
-power is lost you may lose some recent writes.  The metadata should
-always be consistent in spite of any crash.
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the cache behaves like a physical disk that has a volatile write
+cache.  If power is lost you may lose some recent writes.  The metadata
+should always be consistent in spite of any crash.
 
 The 'dirty' state for a cache block changes far too frequently for us
 to keep updating it on the fly.  So we treat it as a hint.  In normal
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 8a7a3d46e0da..05a27e9442bd 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -116,6 +116,35 @@ Resuming a device with a new table itself triggers an event so the
 userspace daemon can use this to detect a situation where a new table
 already exceeds the threshold.
 
+A low water mark for the metadata device is maintained in the kernel and
+will trigger a dm event if free space on the metadata device drops below
+it.
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the thin-provisioning target behaves like a physical disk that has
+a volatile write cache.  If power is lost you may lose some recent
+writes.  The metadata should always be consistent in spite of any crash.
+
+If data space is exhausted the pool will either error or queue IO
+according to the configuration (see: error_if_no_space).  If metadata
+space is exhausted or a metadata operation fails: the pool will error IO
+until the pool is taken offline and repair is performed to 1) fix any
+potential inconsistencies and 2) clear the flag that imposes repair.
+Once the pool's metadata device is repaired it may be resized, which
+will allow the pool to return to normal operation.  Note that if a pool
+is flagged as needing repair, the pool's data and metadata devices
+cannot be resized until repair is performed.  It should also be noted
+that when the pool's metadata space is exhausted the current metadata
+transaction is aborted.  Given that the pool will cache IO whose
+completion may have already been acknowledged to upper IO layers
+(e.g. filesystem) it is strongly suggested that consistency checks
+(e.g. fsck) be performed on those layers when repair of the pool is
+required.
+
 Thin provisioning
 -----------------
@@ -258,10 +287,9 @@ ii) Status
 	should register for the event and then check the target's status.
 
     held metadata root:
-	The location, in sectors, of the metadata root that has been
+	The location, in blocks, of the metadata root that has been
 	'held' for userspace read access.  '-' indicates there is no
-	held root.  This feature is not yet implemented so '-' is
-	always returned.
+	held root.
 
     discard_passdown|no_discard_passdown
 	Whether or not discards are actually being passed down to the
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
        ---help---
          Provides thin provisioning and snapshots that share a data store.
 
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of persistent data block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning and caching.
-
-	  If unsure, say N.
-
 config DM_CACHE
        tristate "Cache target (EXPERIMENTAL)"
        depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 
-	kfree(mq->table);
+	vfree(mq->table);
 	epool_exit(&mq->cache_pool);
 	epool_exit(&mq->pre_cache_pool);
 	kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
 	mq->hash_bits = ffs(mq->nr_buckets) - 1;
-	mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
 	if (!mq->table)
 		goto bad_alloc_table;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
 		r = insert_exceptions(ps, area, callback, callback_context,
 				      &full);
+		if (!full)
+			memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
 		dm_bufio_release(bp);
 
 		dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index baa87ff12816..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 
 	return r;
 }
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	down_write(&pmd->root_lock);
+	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+	r = superblock_lock(pmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(pmd->flags);
+
+	dm_bm_unlock(sblock);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+	bool needs_check;
+
+	down_read(&pmd->root_lock);
+	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+	up_read(&pmd->root_lock);
+
+	return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 82ea384d36ff..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -25,6 +25,11 @@
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
 struct dm_pool_metadata;
 struct dm_thin_device;
 
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_sm_threshold_fn fn,
 					void *context);
 
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7e84baccf0ad..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
+	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 	PM_READ_ONLY,		/* metadata may not be changed */
 	PM_FAIL,		/* all I/O fails */
 };
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -226,6 +226,7 @@ struct thin_c {
 	struct pool *pool;
 	struct dm_thin_device *td;
+	bool requeue_mode:1;
 };
 
 /*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
 	struct dm_thin_new_mapping *overwrite_mapping;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	unsigned long flags;
 
 	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	bio_list_merge(&bios, master);
 	bio_list_init(master);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 
 	while ((bio = bio_list_pop(&bios))) {
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
 	struct pool *pool = tc->pool;
+
+	requeue_bio_list(tc, &pool->deferred_bios);
+	requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+	struct bio *bio;
 	unsigned long flags;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	__requeue_bio_list(tc, &pool->deferred_bios);
-	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	bio_list_merge(&bios, &pool->retry_on_resume_list);
+	bio_list_init(&pool->retry_on_resume_list);
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
 }
 
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
 	dm_block_t free_blocks;
 	struct pool *pool = tc->pool;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
 		return -EINVAL;
 
 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 
 		if (!free_blocks) {
-			out_of_data_space(pool);
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
 			return -ENOSPC;
 		}
 	}
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-	/*
-	 * When pool is read-only, no cell locking is needed because
-	 * nothing is changing.
-	 */
-	WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+	enum pool_mode m = get_pool_mode(pool);
+
+	switch (m) {
+	case PM_WRITE:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+		return true;
+
+	case PM_OUT_OF_DATA_SPACE:
+		return pool->pf.error_if_no_space;
 
-	if (pool->pf.error_if_no_space)
+	case PM_READ_ONLY:
+	case PM_FAIL:
+		return true;
+	default:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+		return true;
+	}
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+	if (should_error_unserviceable_bio(pool))
 		bio_io_error(bio);
 	else
 		retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 	struct bio *bio;
 	struct bio_list bios;
 
+	if (should_error_unserviceable_bio(pool)) {
+		cell_error(pool, cell);
+		return;
+	}
+
 	bio_list_init(&bios);
 	cell_release(pool, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		handle_unserviceable_bio(pool, bio);
+	if (should_error_unserviceable_bio(pool))
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+	else
+		while ((bio = bio_list_pop(&bios)))
+			retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+	bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
 	bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 		struct thin_c *tc = h->tc;
 
+		if (tc->requeue_mode) {
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+			continue;
+		}
+
 		/*
 		 * If we've got no free new_mapping structs, and processing
 		 * this bio might require one, we pause until there are some
@@ -1394,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+struct noflush_work {
+	struct work_struct worker;
+	struct thin_c *tc;
+
+	atomic_t complete;
+	wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+	atomic_set(&w->complete, 1);
+	wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = true;
+	requeue_io(w->tc);
+	complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = false;
+	complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+	struct noflush_work w;
+
+	INIT_WORK(&w.worker, fn);
+	w.tc = tc;
+	atomic_set(&w.complete, 0);
+	init_waitqueue_head(&w.wait);
+
+	queue_work(tc->pool->wq, &w.worker);
+
+	wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
 	return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+	dm_table_event(pool->ti->table);
+	DMINFO("%s: switching pool to %s mode",
+	       dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-	int r;
-	enum pool_mode old_mode = pool->pf.mode;
+	struct pool_c *pt = pool->ti->private;
+	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+	enum pool_mode old_mode = get_pool_mode(pool);
+
+	/*
+	 * Never allow the pool to transition to PM_WRITE mode if user
+	 * intervention is required to verify metadata and data consistency.
+	 */
+	if (new_mode == PM_WRITE && needs_check) {
+		DMERR("%s: unable to switch pool to write mode until repaired.",
+		      dm_device_name(pool->pool_md));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = PM_READ_ONLY;
+	}
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.	So we never let the
+	 * pool move out of the old mode.
+	 */
+	if (old_mode == PM_FAIL)
+		new_mode = old_mode;
 
 	switch (new_mode) {
 	case PM_FAIL:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to failure mode",
-			      dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "failure");
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
+
+		error_retry_list(pool);
 		break;
 
 	case PM_READ_ONLY:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to read-only mode",
-			      dm_device_name(pool->pool_md));
-		r = dm_pool_abort_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: aborting transaction failed",
-			      dm_device_name(pool->pool_md));
-			new_mode = PM_FAIL;
-			set_pool_mode(pool, new_mode);
-		} else {
-			dm_pool_metadata_read_only(pool->pmd);
-			pool->process_bio = process_bio_read_only;
-			pool->process_discard = process_discard;
-			pool->process_prepared_mapping = process_prepared_mapping_fail;
-			pool->process_prepared_discard = process_prepared_discard_passdown;
-		}
+			notify_of_pool_mode_change(pool, "read-only");
+		dm_pool_metadata_read_only(pool->pmd);
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_bio_success;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		error_retry_list(pool);
+		break;
+
+	case PM_OUT_OF_DATA_SPACE:
+		/*
+		 * Ideally we'd never hit this state; the low water mark
+		 * would trigger userland to extend the pool before we
+		 * completely run out of data space.  However, many small
+		 * IOs to unprovisioned space can consume data space at an
+		 * alarming rate.  Adjust your low water mark if you're
+		 * frequently seeing this mode.
+		 */
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "out-of-data-space");
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
 		break;
 
 	case PM_WRITE:
 		if (old_mode != new_mode)
-			DMINFO("%s: switching pool to write mode",
-			       dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
@@ -1448,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	}
 
 	pool->pf.mode = new_mode;
+	/*
+	 * The pool mode may have changed, sync it so bind_control_target()
+	 * doesn't cause an unexpected mode transition on resume.
+	 */
+	pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
 {
-	DMERR_LIMIT("%s: no free data space available.",
-		    dm_device_name(pool->pool_md));
-	set_pool_mode(pool, PM_READ_ONLY);
+	const char *dev_name = dm_device_name(pool->pool_md);
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_pool_abort_metadata(pool->pmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+
+	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
 }
 
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-	dm_block_t free_blocks;
-
 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 		    dm_device_name(pool->pool_md), op, r);
 
-	if (r == -ENOSPC &&
-	    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-	    !free_blocks)
-		DMERR_LIMIT("%s: no free metadata space available.",
-			    dm_device_name(pool->pool_md));
-
+	abort_transaction(pool);
 	set_pool_mode(pool, PM_READ_ONLY);
 }
 
@@ -1524,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
 	thin_hook_bio(tc, bio);
 
+	if (tc->requeue_mode) {
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1687,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	/*
 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
-	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode old_mode = get_pool_mode(pool);
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	/*
@@ -1701,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	pool->pf = pt->adjusted_pf;
 	pool->low_water_blocks = pt->low_water_blocks;
 
-	/*
-	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-	 * not going to recover without a thin_repair.  So we never let the
-	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-	 * may have been due to a lack of metadata or data space, and may
-	 * now work (ie. if the underlying devices have been resized).
-	 */
-	if (old_mode == PM_FAIL)
-		new_mode = old_mode;
-
 	set_pool_mode(pool, new_mode);
 
 	return 0;
@@ -2253,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (data_size > sb_data_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the data device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		if (sb_data_size)
 			DMINFO("%s: growing the data device from %llu to %llu blocks",
 			       dm_device_name(pool->pool_md),
@@ -2294,6 +2438,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (metadata_dev_size > sb_metadata_dev_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the metadata device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		warn_if_metadata_device_too_big(pool->md_dev);
 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 		       dm_device_name(pool->pool_md),
@@ -2681,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("- ");
 
-		if (pool->pf.mode == PM_READ_ONLY)
+		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+			DMEMIT("out_of_data_space ");
+		else if (pool->pf.mode == PM_READ_ONLY)
 			DMEMIT("ro ");
 		else
 			DMEMIT("rw ");
@@ -2795,7 +2947,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2997,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 	return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+	struct thin_c *tc = ti->private;
+
 	if (dm_noflush_suspending(ti))
-		requeue_io((struct thin_c *)ti->private);
+		noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * The dm_noflush_suspending flag has been cleared by now, so
+	 * unfortunately we must always run this.
+	 */
+	noflush_work(tc, do_noflush_stop);
 }
 
 /*
@@ -3085,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+	 Enable this for messages that may help debug problems with the
+	 block manager locking used by thin provisioning and caching.
+
+	 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e9bdd462f4f5..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -101,25 +164,20 @@ struct sm_metadata {
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
 
 	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
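The bop_ring_buffer introduced in dm-space-map-metadata.c above replaces the fixed uncommitted[] array and its counter with a ring that deliberately leaves one slot unused: begin == end always means empty, and a push whose advance of end would land on begin reports full. Below is a self-contained sketch of those invariants; NR_SLOTS and the int payload are illustrative stand-ins for MAX_RECURSIVE_ALLOCATIONS + 1 and struct block_op, not the kernel types.

/*
 * Userspace sketch of the one-slot-open ring buffer pattern used by
 * struct bop_ring_buffer above.
 */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>

#define NR_SLOTS 5	/* stand-in for MAX_RECURSIVE_ALLOCATIONS + 1 */

struct ring {
	unsigned begin;		/* index of the oldest entry */
	unsigned end;		/* index one past the newest entry */
	int slots[NR_SLOTS];
};

static unsigned ring_next(unsigned i)
{
	return (i + 1 == NR_SLOTS) ? 0 : i + 1;
}

static bool ring_empty(const struct ring *r)
{
	return r->begin == r->end;
}

/*
 * One slot is always left unused: if filling it would make 'end'
 * catch up with 'begin', the buffer reports full.  This is how
 * begin == end can unambiguously mean "empty".
 */
static int ring_push(struct ring *r, int v)
{
	unsigned next = ring_next(r->end);

	if (next == r->begin)
		return -ENOMEM;	/* full */

	r->slots[r->end] = v;
	r->end = next;
	return 0;
}

static int ring_pop(struct ring *r, int *v)
{
	if (ring_empty(r))
		return -ENODATA;

	*v = r->slots[r->begin];
	r->begin = ring_next(r->begin);
	return 0;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	int v;

	/* Only NR_SLOTS - 1 entries fit; the next push must fail. */
	for (v = 0; v < NR_SLOTS - 1; v++)
		assert(ring_push(&r, v) == 0);
	assert(ring_push(&r, 99) == -ENOMEM);

	/* Entries come back out in FIFO order. */
	assert(ring_pop(&r, &v) == 0 && v == 0);
	assert(ring_pop(&r, &v) == 0 && v == 1);
	return 0;
}

Keeping one slot open lets the structure distinguish full from empty without a separate element count, which is what allows the patched sm_metadata_get_count() loops to walk the live region using nothing more than begin, end and brb_next().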