Diffstat (limited to 'drivers/md')
35 files changed, 1201 insertions, 585 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4fd9d6aeff6a..5a2c75499824 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -846,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else - test_and_set_bit_le(bit, kaddr); + set_bit_le(bit, kaddr); kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ @@ -868,7 +868,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else - test_and_clear_bit_le(bit, paddr); + clear_bit_le(bit, paddr); kunmap_atomic(paddr); if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 3c955e10a618..0387e05cdb98 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -319,6 +319,9 @@ static void __cache_size_refresh(void) static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, enum data_mode *data_mode) { + unsigned noio_flag; + void *ptr; + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { *data_mode = DATA_MODE_SLAB; return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); @@ -332,7 +335,26 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, } *data_mode = DATA_MODE_VMALLOC; - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + + /* + * __vmalloc allocates the data pages and auxiliary structures with + * gfp_flags that were specified, but pagetables are always allocated + * with GFP_KERNEL, no matter what was specified as gfp_mask. + * + * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that + * all allocations done by this process (including pagetables) are done + * as if GFP_NOIO was specified. 
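As an aside on the comment above (an editorial illustration, not code from the patch): the point of memalloc_noio_save()/memalloc_noio_restore() is that the caller stashes the previous per-task state and puts it back afterwards, so nested no-I/O sections compose correctly. A minimal user-space analogue of that save/restore idiom, with a plain thread-local flag standing in for the task flag (noio_active, noio_save_demo and noio_restore_demo are made-up names):

#include <stdio.h>

static _Thread_local int noio_active;	/* stand-in for the per-task PF_MEMALLOC_NOIO bit */

static int noio_save_demo(void)
{
	int old = noio_active;	/* remember the previous state ... */
	noio_active = 1;	/* ... then force "no I/O" on */
	return old;
}

static void noio_restore_demo(int old)
{
	noio_active = old;	/* put back whatever the caller found */
}

static void nested_section(void)
{
	int flag = noio_save_demo();
	printf("nested section: noio=%d\n", noio_active);
	noio_restore_demo(flag);	/* outer section stays protected */
}

int main(void)
{
	int flag = noio_save_demo();
	nested_section();
	printf("after nested section: noio=%d\n", noio_active);
	noio_restore_demo(flag);
	printf("after outer section:  noio=%d\n", noio_active);
	return 0;
}

Because each section restores the value it saved rather than simply clearing the flag, the outer section remains in no-I/O mode across the nested call, which is exactly why the hunk below saves noio_flag before __vmalloc() and restores it afterwards.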
+ */ + + if (gfp_mask & __GFP_NORETRY) + noio_flag = memalloc_noio_save(); + + ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + + if (gfp_mask & __GFP_NORETRY) + memalloc_noio_restore(noio_flag); + + return ptr; } /* @@ -1025,6 +1047,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + BUG_ON(dm_bufio_in_request()); + blk_start_plug(&plug); dm_bufio_lock(c); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index fbd3625f2748..1af7255bbffb 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -83,6 +83,8 @@ struct cache_disk_superblock { __le32 read_misses; __le32 write_hits; __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; } __packed; struct dm_cache_metadata { @@ -109,6 +111,7 @@ struct dm_cache_metadata { bool clean_when_opened:1; char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; struct dm_cache_statistics stats; }; @@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(CACHE_VERSION); - memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, @@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); - memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); disk_super->read_hits = cpu_to_le32(0); disk_super->read_misses = cpu_to_le32(0); @@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); @@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -854,18 +863,43 @@ struct thunk { bool hints_valid; }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + 
const unsigned *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + static bool hints_array_initialized(struct dm_cache_metadata *cmd) { return cmd->hint_root && cmd->policy_hint_size; } static bool hints_array_available(struct dm_cache_metadata *cmd, - const char *policy_name) + struct dm_cache_policy *policy) { - bool policy_names_match = !strncmp(cmd->policy_name, policy_name, - sizeof(cmd->policy_name)); - - return cmd->clean_when_opened && policy_names_match && + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && hints_array_initialized(cmd); } @@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf) return r; } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { struct thunk thunk; @@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam thunk.cmd = cmd; thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy_name); + thunk.hints_valid = hints_array_available(cmd, policy); return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { int r; down_read(&cmd->root_lock); - r = __load_mappings(cmd, policy_name, fn, context); + r = __load_mappings(cmd, policy, fn, context); up_read(&cmd->root_lock); return r; @@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty /* nothing to be done */ return 0; - value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? 
M_DIRTY : 0)); __dm_bless_for_disk(&value); r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), @@ -1008,7 +1044,7 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { down_read(&cmd->root_lock); - memcpy(stats, &cmd->stats, sizeof(*stats)); + *stats = cmd->stats; up_read(&cmd->root_lock); } @@ -1016,7 +1052,7 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { down_write(&cmd->root_lock); - memcpy(&cmd->stats, stats, sizeof(*stats)); + cmd->stats = *stats; up_write(&cmd->root_lock); } @@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po __le32 value; size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); if (!policy_name[0] || (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (strcmp(cmd->policy_name, policy_name)) { + if (!policy_unchanged(cmd, policy)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); hint_size = dm_cache_policy_get_hint_size(policy); if (!hint_size) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0eee..f45cef21f3d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, - const char *policy_name, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context); diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index cc05d70b3cb8..b04d1f904d07 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -17,7 +17,6 @@ /*----------------------------------------------------------------*/ #define DM_MSG_PREFIX "cache cleaner" -#define CLEANER_VERSION "1.0.0" /* Cache entry struct. 
*/ struct wb_cache_entry { @@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, static struct dm_cache_policy_type wb_policy_type = { .name = "cleaner", + .version = {1, 0, 0}, .hint_size = 0, .owner = THIS_MODULE, .create = wb_create @@ -446,7 +446,10 @@ static int __init wb_init(void) if (r < 0) DMERR("register failed %d", r); else - DMINFO("version " CLEANER_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + wb_policy_type.version[0], + wb_policy_type.version[1], + wb_policy_type.version[2]); return r; } diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 52a75beeced5..0928abdc49f0 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p); */ const char *dm_cache_policy_get_name(struct dm_cache_policy *p); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 964153255076..dc112a7137fe 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -14,7 +14,6 @@ #include <linux/vmalloc.h> #define DM_MSG_PREFIX "cache-policy-mq" -#define MQ_VERSION "1.0.0" static struct kmem_cache *mq_entry_cache; @@ -1133,6 +1132,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1164,7 +1165,10 @@ static int __init mq_init(void) r = dm_cache_policy_register(&default_policy_type); if (!r) { - DMINFO("version " MQ_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + mq_policy_type.version[0], + mq_policy_type.version[1], + mq_policy_type.version[2]); return 0; } diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 2cbf5fdaac52..21c03c570c06 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) } EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) { struct dm_cache_policy_type *t = p->private; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index f0f51b260544..33369ca9614f 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -130,8 +130,8 @@ struct dm_cache_policy { * * Must not block. * - * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK - * would be typical). + * Returns 0 if in cache, -ENOENT if not, < 0 for other errors + * (-EWOULDBLOCK would be typical). */ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); @@ -196,6 +196,7 @@ struct dm_cache_policy { * We maintain a little register of the different policy types. 
*/ #define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 struct dm_cache_policy_type { /* For use by the register code only. */ @@ -206,6 +207,7 @@ struct dm_cache_policy_type { * what gets passed on the target line to select your policy. */ char name[CACHE_POLICY_NAME_SIZE]; + unsigned version[CACHE_POLICY_VERSION_SIZE]; /* * Policies may store a hint for each each cache block. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c30..df44b60e66f2 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-prison.h" +#include "dm-bio-record.h" #include "dm-cache-metadata.h" #include <linux/dm-io.h> @@ -142,6 +143,7 @@ struct cache { spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; struct list_head quiesced_migrations; struct list_head completed_migrations; struct list_head need_commit_migrations; @@ -158,7 +160,7 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - sector_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* a power of 2 times sectors per block */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; @@ -199,6 +201,16 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + + /* + * writethrough fields. These MUST remain at the end of this + * structure and the 'cache' member must be the first as it + * is used to determine the offset of the writethrough fields. + */ + struct cache *cache; + dm_cblock_t cblock; + bio_end_io_t *saved_bi_end_io; + struct dm_bio_details bio_details; }; struct dm_cache_migration { @@ -381,7 +393,7 @@ static int get_cell(struct cache *cache, return r; } - /*----------------------------------------------------------------*/ +/*----------------------------------------------------------------*/ static bool is_dirty(struct cache *cache, dm_cblock_t b) { @@ -407,22 +419,30 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl } /*----------------------------------------------------------------*/ + static bool block_size_is_power_of_two(struct cache *cache) { return cache->sectors_per_block_shift >= 0; } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) { - sector_t discard_blocks = cache->discard_block_size; + uint32_t discard_blocks = cache->discard_block_size; dm_block_t b = from_oblock(oblock); if (!block_size_is_power_of_two(cache)) - (void) sector_div(discard_blocks, cache->sectors_per_block); + discard_blocks = discard_blocks / cache->sectors_per_block; else discard_blocks >>= cache->sectors_per_block_shift; - (void) sector_div(b, discard_blocks); + b = block_div(b, discard_blocks); return to_dblock(b); } @@ -500,16 +520,28 @@ static void save_stats(struct cache *cache) /*---------------------------------------------------------------- * Per bio data *--------------------------------------------------------------*/ -static struct per_bio_data *get_per_bio_data(struct bio *bio) + +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. 
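An aside on the sizing trick this comment introduces (a standalone user-space sketch, not the kernel code; struct pb_demo and its fields are illustrative only): because the writethrough-only members sit at the end of struct per_bio_data, offsetof() of the first such member gives a valid, smaller size for writeback mode, while sizeof() gives the full writethrough size, which is what the PB_DATA_SIZE_WB/PB_DATA_SIZE_WT macros that follow compute.

#include <stddef.h>
#include <stdio.h>

struct pb_demo {
	int tick;
	unsigned req_nr;
	/* writethrough-only members: must stay at the end, 'cache' first */
	void *cache;
	unsigned cblock;
};

#define PB_SIZE_WB offsetof(struct pb_demo, cache)	/* writeback: trimmed allocation */
#define PB_SIZE_WT sizeof(struct pb_demo)		/* writethrough: full allocation */

int main(void)
{
	printf("writeback needs %zu bytes, writethrough needs %zu bytes\n",
	       PB_SIZE_WB, PB_SIZE_WT);
	return 0;
}

A mode that never touches the trailing members can therefore set a smaller ti->per_bio_data_size and avoid paying for state it will never use.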
+ */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) + +static size_t get_per_bio_data_size(struct cache *cache) { - struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; +} + +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); BUG_ON(!pb); return pb; } -static struct per_bio_data *init_per_bio_data(struct bio *bio) +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) { - struct per_bio_data *pb = get_per_bio_data(bio); + struct per_bio_data *pb = get_per_bio_data(bio, data_size); pb->tick = false; pb->req_nr = dm_bio_get_target_bio_nr(bio); @@ -543,7 +575,8 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { unsigned long flags; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && @@ -609,6 +642,58 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + bio->bi_end_io = pb->saved_bi_end_io; + + if (err) { + bio_endio(bio, err); + return; + } + + dm_bio_restore(&pb->bio_details, bio); + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it gets put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. 
+ */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + + pb->cache = cache; + pb->cblock = cblock; + pb->saved_bi_end_io = bio->bi_end_io; + dm_bio_record(&pb->bio_details, bio); + bio->bi_end_io = writethrough_endio; + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + /*---------------------------------------------------------------- * Migration processing * @@ -972,7 +1057,8 @@ static void defer_bio(struct cache *cache, struct bio *bio) static void process_flush_bio(struct cache *cache, struct bio *bio) { - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); BUG_ON(bio->bi_size); if (!pb->req_nr) @@ -1002,7 +1088,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) dm_block_t end_block = bio->bi_sector + bio_sectors(bio); dm_block_t b; - (void) sector_div(end_block, cache->discard_block_size); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) set_discard(cache, to_dblock(b)); @@ -1044,7 +1130,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); bool can_migrate = discarded_block || spare_migration_bandwidth(cache); @@ -1070,14 +1157,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - } else + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); issue(cache, bio); @@ -1086,17 +1168,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, case POLICY_MISS: inc_miss_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio, 0); - } else { - remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); - } + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); break; case POLICY_NEW: @@ -1217,6 +1290,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) submit_bios ? 
generic_make_request(bio) : bio_io_error(bio); } +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + static void writeback_some_dirty_blocks(struct cache *cache) { int r = 0; @@ -1313,6 +1403,7 @@ static int more_work(struct cache *cache) else return !bio_list_empty(&cache->deferred_bios) || !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || !list_empty(&cache->need_commit_migrations); @@ -1331,6 +1422,8 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1353,6 +1446,7 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy); wake_worker(cache); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } @@ -1717,7 +1811,37 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv, static struct kmem_cache *migration_cache; -static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv) +#define NOT_CORE_OPTION 1 + +static int process_config_option(struct cache *cache, const char *key, const char *value) +{ + unsigned long tmp; + + if (!strcasecmp(key, "migration_threshold")) { + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + cache->migration_threshold = tmp; + return 0; + } + + return NOT_CORE_OPTION; +} + +static int set_config_value(struct cache *cache, const char *key, const char *value) +{ + int r = process_config_option(cache, key, value); + + if (r == NOT_CORE_OPTION) + r = policy_set_config_value(cache->policy, key, value); + + if (r) + DMWARN("bad config value for %s: %s", key, value); + + return r; +} + +static int set_config_values(struct cache *cache, int argc, const char **argv) { int r = 0; @@ -1727,12 +1851,9 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a } while (argc) { - r = policy_set_config_value(p, argv[0], argv[1]); - if (r) { - DMWARN("policy_set_config_value failed: key = '%s', value = '%s'", - argv[0], argv[1]); - return r; - } + r = set_config_value(cache, argv[0], argv[1]); + if (r) + break; argc -= 2; argv += 2; @@ -1744,8 +1865,6 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a static int create_cache_policy(struct cache *cache, struct cache_args *ca, char **error) { - int r; - cache->policy = dm_cache_policy_create(ca->policy_name, cache->cache_size, cache->origin_sectors, @@ -1755,11 +1874,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return -ENOMEM; } - r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) - dm_cache_policy_destroy(cache->policy); - - return r; + return 0; } /* @@ -1791,9 +1906,7 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, return discard_block_size; } -#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) - -static 
unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); +#define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) { @@ -1811,7 +1924,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->ti = ca->ti; ti->private = cache; - ti->per_bio_data_size = sizeof(struct per_bio_data); ti->num_flush_bios = 2; ti->flush_supported = true; @@ -1819,10 +1931,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; - memcpy(&cache->features, &ca->features, sizeof(cache->features)); - - if (cache->features.write_through) - ti->num_write_bios = cache_num_write_bios; + cache->features = ca->features; + ti->per_bio_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1835,7 +1945,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; - (void) sector_div(origin_blocks, ca->block_size); + origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); cache->sectors_per_block = ca->block_size; @@ -1848,7 +1958,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) dm_block_t cache_size = ca->cache_sectors; cache->sectors_per_block_shift = -1; - (void) sector_div(cache_size, ca->block_size); + cache_size = block_div(cache_size, ca->block_size); cache->cache_size = to_cblock(cache_size); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); @@ -1858,7 +1968,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = create_cache_policy(cache, ca, error); if (r) goto bad; + cache->policy_nr_args = ca->policy_argc; + cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; + + r = set_config_values(cache, ca->policy_argc, ca->policy_argv); + if (r) { + *error = "Error setting cache policy's config values"; + goto bad; + } cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, ca->block_size, may_format, @@ -1873,13 +1991,14 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); - cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; atomic_set(&cache->nr_migrations, 0); init_waitqueue_head(&cache->migration_wait); + r = -ENOMEM; cache->nr_dirty = 0; cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); if (!cache->dirty_bitset) { @@ -2002,6 +2121,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; r = cache_create(ca, &cache); + if (r) + goto out; r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { @@ -2016,26 +2137,13 @@ out: return r; } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ - int r; - struct cache *cache = ti->private; - dm_oblock_t block = get_bio_block(cache, bio); - dm_cblock_t cblock; - - r = policy_lookup(cache->policy, block, &cblock); - if (r < 0) - return 2; /* assume the worst */ - - return (!r && !is_dirty(cache, cblock)) ? 
2 : 1; -} - static int cache_map(struct dm_target *ti, struct bio *bio) { struct cache *cache = ti->private; int r; dm_oblock_t block = get_bio_block(cache, bio); + size_t pb_data_size = get_per_bio_data_size(cache); bool can_migrate = false; bool discarded_block; struct dm_bio_prison_cell *cell; @@ -2052,7 +2160,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - pb = init_per_bio_data(bio); + pb = init_per_bio_data(bio, pb_data_size); if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { defer_bio(cache, bio); @@ -2097,18 +2205,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } else { + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); - } + + cell_defer(cache, cell, false); break; case POLICY_MISS: @@ -2143,7 +2245,8 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) { struct cache *cache = ti->private; unsigned long flags; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); if (pb->tick) { policy_tick(cache->policy); @@ -2319,8 +2422,7 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { - r = dm_cache_load_mappings(cache->cmd, - dm_cache_policy_get_name(cache->policy), + r = dm_cache_load_mappings(cache->cmd, cache->policy, load_mapping, cache); if (r) { DMERR("could not load cache mappings"); @@ -2443,23 +2545,6 @@ err: DMEMIT("Error"); } -#define NOT_CORE_OPTION 1 - -static int process_config_option(struct cache *cache, char **argv) -{ - unsigned long tmp; - - if (!strcasecmp(argv[0], "migration_threshold")) { - if (kstrtoul(argv[1], 10, &tmp)) - return -EINVAL; - - cache->migration_threshold = tmp; - return 0; - } - - return NOT_CORE_OPTION; -} - /* * Supports <key> <value>. 
* @@ -2467,17 +2552,12 @@ static int process_config_option(struct cache *cache, char **argv) */ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) { - int r; struct cache *cache = ti->private; if (argc != 2) return -EINVAL; - r = process_config_option(cache, argv); - if (r == NOT_CORE_OPTION) - return policy_set_config_value(cache->policy, argv[0], argv[1]); - - return r; + return set_config_value(cache, argv[0], argv[1]); } static int cache_iterate_devices(struct dm_target *ti, @@ -2535,7 +2615,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 0, 0}, + .version = {1, 1, 1}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 13c15480d940..6d2d41ae9e32 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) unsigned int i; struct bio_vec *bv; - for (i = 0; i < clone->bi_vcnt; i++) { - bv = bio_iovec_idx(clone, i); + bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 51bb81676be3..bdf26f5bd326 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -907,6 +907,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; return 0; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 311e3d35b272..1d3fe1a40a9b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1279,6 +1279,31 @@ static int raid_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static const char *decipher_sync_action(struct mddev *mddev) +{ + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return "frozen"; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return "reshape"; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + return "check"; + return "repair"; + } + + if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + return "recover"; + } + + return "idle"; +} + static void raid_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -1298,8 +1323,18 @@ static void raid_status(struct dm_target *ti, status_type_t type, sync = rs->md.recovery_cp; if (sync >= rs->md.resync_max_sectors) { + /* + * Sync complete. + */ array_in_sync = 1; sync = rs->md.resync_max_sectors; + } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { + /* + * If "check" or "repair" is occurring, the array has + * undergone and initial sync and the health characters + * should not be 'a' anymore. 
+ */ + array_in_sync = 1; } else { /* * The array may be doing an initial sync, or it may @@ -1311,6 +1346,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) array_in_sync = 1; } + /* * Status characters: * 'D' = Dead/Failed device @@ -1339,6 +1375,21 @@ static void raid_status(struct dm_target *ti, status_type_t type, (unsigned long long) sync, (unsigned long long) rs->md.resync_max_sectors); + /* + * Sync action: + * See Documentation/device-mapper/dm-raid.c for + * information on each of these states. + */ + DMEMIT(" %s", decipher_sync_action(&rs->md)); + + /* + * resync_mismatches/mismatch_cnt + * This field shows the number of discrepancies found when + * performing a "check" of the array. + */ + DMEMIT(" %llu", + (unsigned long long) + atomic64_read(&rs->md.resync_mismatches)); break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ @@ -1425,7 +1476,62 @@ static void raid_status(struct dm_target *ti, status_type_t type, } } -static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +static int raid_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (!strcasecmp(argv[0], "reshape")) { + DMERR("Reshape not supported."); + return -EINVAL; + } + + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (!strcasecmp(argv[0], "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return -EBUSY; + else if (!strcasecmp(argv[0], "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (!strcasecmp(argv[0], "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else { + if (!strcasecmp(argv[0], "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!!strcasecmp(argv[0], "repair")) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + if (!mddev->suspended) + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!mddev->suspended) + md_wakeup_thread(mddev->thread); + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) { struct raid_set *rs = ti->private; unsigned i; @@ -1482,12 +1588,13 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 4, 2}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, .map = raid_map, .status = raid_status, + .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, .presuspend = raid_presuspend, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d053098c6a91..699b5be68d31 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m, { io->bdev = m->dev->bdev; 
io->sector = map_sector(m, bio); - io->count = bio->bi_size >> 9; + io->count = bio_sectors(bio); } static void hold_bio(struct mirror_set *ms, struct bio *bio) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c0e07026a8d1..c434e5aab2df 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1121,6 +1121,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); if (!s->pending_pool) { ti->error = "Could not allocate mempool for pending exceptions"; + r = -ENOMEM; goto bad_pending_pool; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d8837d313f54..d907ca6227ce 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -94,7 +94,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct stripe_c *sc; - sector_t width; + sector_t width, tmp_len; uint32_t stripes; uint32_t chunk_size; int r; @@ -116,15 +116,16 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) } width = ti->len; - if (sector_div(width, chunk_size)) { + if (sector_div(width, stripes)) { ti->error = "Target length not divisible by " - "chunk size"; + "number of stripes"; return -EINVAL; } - if (sector_div(width, stripes)) { + tmp_len = width; + if (sector_div(tmp_len, chunk_size)) { ti->error = "Target length not divisible by " - "number of stripes"; + "chunk size"; return -EINVAL; } @@ -258,7 +259,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, sector_t begin, end; stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); - stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e50dad0c65f4..1ff252ab7d46 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1442,7 +1442,7 @@ static bool dm_table_supports_write_same(struct dm_table *t) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) + ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) return false; } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 00cee02f8fc9..60bce435f4fa 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1645,12 +1645,12 @@ int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, return r; } -static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count) { int r; dm_block_t old_count; - r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); + r = dm_sm_get_nr_blocks(sm, &old_count); if (r) return r; @@ -1658,11 +1658,11 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) return 0; if (new_count < old_count) { - DMERR("cannot reduce size of data device"); + DMERR("cannot reduce size of space map"); return -EINVAL; } - return dm_sm_extend(pmd->data_sm, new_count - old_count); + return dm_sm_extend(sm, new_count - old_count); } int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) @@ -1671,7 +1671,19 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) down_write(&pmd->root_lock); if (!pmd->fail_io) - r = 
__resize_data_dev(pmd, new_count); + r = __resize_space_map(pmd->data_sm, new_count); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __resize_space_map(pmd->metadata_sm, new_count); up_write(&pmd->root_lock); return r; @@ -1684,3 +1696,17 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) dm_bm_set_read_only(pmd->bm); up_write(&pmd->root_lock); } + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + int r; + + down_write(&pmd->root_lock); + r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); + up_write(&pmd->root_lock); + + return r; +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 0cecc3702885..845ebbe589a9 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -8,6 +8,7 @@ #define DM_THIN_METADATA_H #include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map.h" #define THIN_METADATA_BLOCK_SIZE 4096 @@ -185,6 +186,7 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); * blocks would be lost. */ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); /* * Flicks the underlying block manager into read only mode, so you know @@ -192,6 +194,11 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); */ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 009339d62828..759cffc45cab 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -922,7 +922,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) return r; if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { - DMWARN("%s: reached low water mark, sending event.", + DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = 1; @@ -1281,6 +1281,10 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +/* + * FIXME: should we also commit due to size of transaction, measured in + * metadata blocks? + */ static int need_commit_due_to_time(struct pool *pool) { return jiffies < pool->last_commit_jiffies || @@ -1577,6 +1581,11 @@ static bool data_dev_supports_discard(struct pool_c *pt) return q && blk_queue_discard(q); } +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + /* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not. 
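One more aside, on the is_factor() helper just added (the sketch below is illustrative; old_check/new_check and the numbers are made up, and plain % stands in for the kernel's sector_div()): the open-coded test it replaces in the next hunk, block_size & (discard_granularity - 1), only detects a non-multiple when the granularity is a power of two, whereas a remainder test is correct for any granularity.

#include <stdint.h>
#include <stdio.h>

/* the old, power-of-two-only test */
static int old_check(uint64_t block_size, uint32_t granularity)
{
	return (block_size & (granularity - 1)) != 0;
}

/* what !is_factor() computes: does granularity fail to divide block_size? */
static int new_check(uint64_t block_size, uint32_t granularity)
{
	return (block_size % granularity) != 0;
}

int main(void)
{
	uint64_t block_size = 384;	/* hypothetical values: 192 divides 384 exactly */
	uint32_t granularity = 192;	/* not a power of two */

	printf("old check: %s, new check: %s\n",
	       old_check(block_size, granularity) ? "not a factor" : "factor",
	       new_check(block_size, granularity) ? "not a factor" : "factor");
	return 0;
}

Here 384 & 191 is non-zero, so the old test would wrongly report that the granularity is not a factor and disable discard passdown; the remainder-based test agrees with the arithmetic.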
@@ -1602,7 +1611,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) else if (data_limits->discard_granularity > block_size) reason = "discard granularity larger than a block"; - else if (block_size & (data_limits->discard_granularity - 1)) + else if (!is_factor(block_size, data_limits->discard_granularity)) reason = "discard granularity not a factor of block size"; if (reason) { @@ -1904,6 +1913,56 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, return r; } +static void metadata_low_callback(void *context) +{ + struct pool *pool = context; + + DMWARN("%s: reached low water mark for metadata device: sending event.", + dm_device_name(pool->pool_md)); + + dm_table_event(pool->ti->table); +} + +static sector_t get_metadata_dev_size(struct block_device *bdev) +{ + sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + char buffer[BDEVNAME_SIZE]; + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); + metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; + } + + return metadata_dev_size; +} + +static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_metadata_dev_size(bdev); + + sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + + return metadata_dev_size; +} + +/* + * When a metadata threshold is crossed a dm event is triggered, and + * userland should respond by growing the metadata device. We could let + * userland set the threshold, like we do with the data threshold, but I'm + * not sure they know enough to do this well. + */ +static dm_block_t calc_metadata_threshold(struct pool_c *pt) +{ + /* + * 4M is ample for all ops with the possible exception of thin + * device deletion which is harmless if it fails (just retry the + * delete after you've grown the device). + */ + dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; + return min((dm_block_t)1024ULL /* 4M */, quarter); +} + /* * thin-pool <metadata dev> <data dev> * <data block size (sectors)> @@ -1926,8 +1985,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned long block_size; dm_block_t low_water_blocks; struct dm_dev *metadata_dev; - sector_t metadata_dev_size; - char b[BDEVNAME_SIZE]; + fmode_t metadata_mode; /* * FIXME Remove validation from scope of lock. @@ -1939,19 +1997,32 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto out_unlock; } + as.argc = argc; as.argv = argv; - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); + /* + * Set default pool features. + */ + pool_features_init(&pf); + + dm_consume_args(&as, 4); + r = parse_pool_features(&as, &pf, ti); + if (r) + goto out_unlock; + + metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); if (r) { ti->error = "Error opening metadata block device"; goto out_unlock; } - metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; - if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", - bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); + /* + * Run for the side-effect of possibly issuing a warning if the + * device is too big. 
+ */ + (void) get_metadata_dev_size(metadata_dev->bdev); r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); if (r) { @@ -1974,16 +2045,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; } - /* - * Set default pool features. - */ - pool_features_init(&pf); - - dm_consume_args(&as, 4); - r = parse_pool_features(&as, &pf, ti); - if (r) - goto out; - pt = kzalloc(sizeof(*pt), GFP_KERNEL); if (!pt) { r = -ENOMEM; @@ -2035,6 +2096,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = pt; + r = dm_pool_register_metadata_threshold(pt->pool->pmd, + calc_metadata_threshold(pt), + metadata_low_callback, + pool); + if (r) + goto out_free_pt; + pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -2074,18 +2142,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio) return r; } -/* - * Retrieves the number of blocks of the data device from - * the superblock and compares it to the actual device size, - * thus resizing the data device in case it has grown. - * - * This both copes with opening preallocated data devices in the ctr - * being followed by a resume - * -and- - * calling the resume method individually after userspace has - * grown the data device in reaction to a table event. - */ -static int pool_preresume(struct dm_target *ti) +static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) { int r; struct pool_c *pt = ti->private; @@ -2093,12 +2150,7 @@ static int pool_preresume(struct dm_target *ti) sector_t data_size = ti->len; dm_block_t sb_data_size; - /* - * Take control of the pool object. - */ - r = bind_control_target(pool, ti); - if (r) - return r; + *need_commit = false; (void) sector_div(data_size, pool->sectors_per_block); @@ -2109,7 +2161,7 @@ static int pool_preresume(struct dm_target *ti) } if (data_size < sb_data_size) { - DMERR("pool target too small, is %llu blocks (expected %llu)", + DMERR("pool target (%llu blocks) too small: expected %llu", (unsigned long long)data_size, sb_data_size); return -EINVAL; @@ -2117,17 +2169,90 @@ static int pool_preresume(struct dm_target *ti) r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { DMERR("failed to resize data device"); - /* FIXME Stricter than necessary: Rollback transaction instead here */ set_pool_mode(pool, PM_READ_ONLY); return r; } - (void) commit_or_fallback(pool); + *need_commit = true; } return 0; } +static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + dm_block_t metadata_dev_size, sb_metadata_dev_size; + + *need_commit = false; + + metadata_dev_size = get_metadata_dev_size(pool->md_dev); + + r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); + if (r) { + DMERR("failed to retrieve data device size"); + return r; + } + + if (metadata_dev_size < sb_metadata_dev_size) { + DMERR("metadata device (%llu sectors) too small: expected %llu", + metadata_dev_size, sb_metadata_dev_size); + return -EINVAL; + + } else if (metadata_dev_size > sb_metadata_dev_size) { + r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); + if (r) { + DMERR("failed to resize metadata device"); + return r; + } + + *need_commit = true; + } + + return 0; +} + +/* + * Retrieves the number of blocks of the data device from + * the superblock and compares it to the actual device size, + * thus resizing the data device in case it has grown. 
+ * + * This both copes with opening preallocated data devices in the ctr + * being followed by a resume + * -and- + * calling the resume method individually after userspace has + * grown the data device in reaction to a table event. + */ +static int pool_preresume(struct dm_target *ti) +{ + int r; + bool need_commit1, need_commit2; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * Take control of the pool object. + */ + r = bind_control_target(pool, ti); + if (r) + return r; + + r = maybe_resize_data_dev(ti, &need_commit1); + if (r) + return r; + + r = maybe_resize_metadata_dev(ti, &need_commit2); + if (r) + return r; + + if (need_commit1 || need_commit2) + (void) commit_or_fallback(pool); + + return 0; +} + static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -2544,7 +2669,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 6, 1}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2831,7 +2956,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 7, 1}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..b948fd864d45 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -93,6 +93,13 @@ struct dm_verity_io { */ }; +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned n_blocks; +}; + static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) { return (struct shash_desc *)(io + 1); @@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error) * The root buffer is not prefetched, it is assumed that it will be cached * all the time. 
*/ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +static void verity_prefetch_io(struct work_struct *work) { + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; int i; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; sector_t hash_block_end; - verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); - verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); @@ -452,6 +462,25 @@ no_prefetch_cluster: dm_bufio_prefetch(v->bufio, hash_block_start, hash_block_end - hash_block_start + 1); } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + struct dm_verity_prefetch_work *pw; + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = io->block; + pw->n_blocks = io->n_blocks; + queue_work(v->verify_wq, &pw->work); } /* @@ -472,7 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return -EIO; } - if ((bio->bi_sector + bio_sectors(bio)) >> + if (bio_end_sector(bio) >> (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { DMERR_LIMIT("io out of range"); return -EIO; @@ -490,7 +519,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_end_io = verity_end_io; bio->bi_private = io; - io->io_vec_size = bio->bi_vcnt - bio->bi_idx; + io->io_vec_size = bio_segments(bio); if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) io->io_vec = io->io_vec_inline; else @@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) memcpy(io->io_vec, bio_iovec(bio), io->io_vec_size * sizeof(struct bio_vec)); - verity_prefetch_io(v, io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -858,7 +887,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5e..d5370a94b2c1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -339,7 +339,7 @@ out: return md ? 
0 : -ENXIO; } -static int dm_blk_close(struct gendisk *disk, fmode_t mode) +static void dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; @@ -349,8 +349,6 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) dm_put(md); spin_unlock(&_minor_lock); - - return 0; } int dm_open_count(struct mapped_device *md) @@ -611,6 +609,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 5e7dc772f5de..3193aefe982b 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) return; } - if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), - WRITE)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) failit = 1; if (check_mode(conf, WritePersistent)) { add_sector(conf, bio->bi_sector, WritePersistent); @@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) failit = 1; } else { /* read request */ - if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), - READ)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) failit = 1; if (check_mode(conf, ReadTransient)) failit = 1; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 21014836bdbf..f03fabd2b37b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio_io_error(bio); return; } - if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->end_sector)) { + if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) { /* This bio crosses a device boundary, so we have to * split it. */ diff --git a/drivers/md/md.c b/drivers/md/md.c index fcb878f88796..681d1099a2d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); + #define MD_BUG(x...) 
{ printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -194,21 +197,12 @@ void md_trim_bio(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_size) return; - bio->bi_sector += offset; - bio->bi_size = size; - offset <<= 9; clear_bit(BIO_SEG_VALID, &bio->bi_flags); - while (bio->bi_idx < bio->bi_vcnt && - bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { - /* remove this whole bio_vec */ - offset -= bio->bi_io_vec[bio->bi_idx].bv_len; - bio->bi_idx++; - } - if (bio->bi_idx < bio->bi_vcnt) { - bio->bi_io_vec[bio->bi_idx].bv_offset += offset; - bio->bi_io_vec[bio->bi_idx].bv_len -= offset; - } + bio_advance(bio, offset << 9); + + bio->bi_size = size; + /* avoid any complications with bi_idx being non-zero*/ if (bio->bi_idx) { memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, @@ -1564,8 +1558,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector, count, 1) == 0) return -EINVAL; } - } else if (sb->bblog_offset == 0) - rdev->badblocks.shift = -1; + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; if (!refdev) { ret = 1; @@ -2411,6 +2405,11 @@ static void md_update_sb(struct mddev * mddev, int force_change) int nospares = 0; int any_badblocks_changed = 0; + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return; + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { @@ -2800,12 +2799,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev); - if (err) - return err; - sysfs_unlink_rdev(rdev->mddev, rdev); - rdev->raid_disk = -1; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { @@ -3221,7 +3218,7 @@ int md_rdev_init(struct md_rdev *rdev) * be used - I wonder if that matters */ rdev->badblocks.count = 0; - rdev->badblocks.shift = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); seqlock_init(&rdev->badblocks.lock); if (rdev->badblocks.page == NULL) @@ -3293,9 +3290,6 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe goto abort_free; } } - if (super_format == -1) - /* hot-add for 0.90, or non-persistent: so no badblocks */ - rdev->badblocks.shift = -1; return rdev; @@ -4225,8 +4219,6 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } -static void reap_sync_thread(struct mddev *mddev); - static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4241,7 +4233,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -5279,7 +5271,7 @@ static void __md_stop_writes(struct mddev *mddev) if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } 
del_timer_sync(&mddev->safemode_timer); @@ -5287,7 +5279,8 @@ static void __md_stop_writes(struct mddev *mddev) bitmap_flush(mddev); md_super_wait(mddev); - if (!mddev->in_sync || mddev->flags) { + if (mddev->ro == 0 && + (!mddev->in_sync || mddev->flags)) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev, 1); @@ -5810,7 +5803,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) else sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5877,6 +5870,9 @@ static int hot_remove_disk(struct mddev * mddev, dev_t dev) if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -6490,6 +6486,28 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = md_set_readonly(mddev, bdev); goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<<MD_DISK_SYNC))) + /* Need to clear read-only for this */ + break; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + break; + case BLKROSET: if (get_user(ro, (int __user *)(arg))) { err = -EFAULT; @@ -6560,10 +6578,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; @@ -6651,15 +6665,13 @@ static int md_open(struct block_device *bdev, fmode_t mode) return err; } -static int md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk, fmode_t mode) { struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); - - return 0; } static int md_media_changed(struct gendisk *disk) @@ -7644,14 +7656,16 @@ void md_do_sync(struct md_thread *thread) } EXPORT_SYMBOL_GPL(md_do_sync); -static int remove_and_add_spares(struct mddev *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { struct md_rdev *rdev; int spares = 0; int removed = 0; rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! 
test_bit(In_sync, &rdev->flags)) && @@ -7663,79 +7677,55 @@ static int remove_and_add_spares(struct mddev *mddev) removed++; } } - if (removed) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + if (this) + goto no_add; rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + rdev->saved_raid_disk < 0) + continue; + + rdev->recovery_offset = 0; + if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) + /* OK, this device, which is in_sync, + * will definitely be noticed before + * the next write, so recovery isn't + * needed. + */ + rdev->recovery_offset = mddev->recovery_cp; + spin_unlock_irq(&mddev->write_lock); + } + if (mddev->ro && rdev->recovery_offset != MaxSector) + /* not safe to add this disk now */ + continue; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } +no_add: if (removed) set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; } -static void reap_sync_thread(struct mddev *mddev) -{ - struct md_rdev *rdev; - - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - - /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. - */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = -1; - - md_update_sb(mddev, 1); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); -} - /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7791,22 +7781,16 @@ void md_check_recovery(struct mddev *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. 
+ /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } - } clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + remove_and_add_spares(mddev, NULL); + mddev->pers->spare_active(mddev); goto unlock; } @@ -7838,7 +7822,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7869,7 +7853,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -7919,6 +7903,51 @@ void md_check_recovery(struct mddev *mddev) } } +void md_reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. Also if any device is now + * In_sync we must scrape the saved_raid_disk for that device + * do the superblock for an incrementally recovered device + * written out. 
+ */ + rdev_for_each(rdev, mddev) + if (!mddev->degraded || + test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -8644,6 +8673,7 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); diff --git a/drivers/md/md.h b/drivers/md/md.h index eca59c3074ef..653f992b687a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else @@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } @@ -567,6 +567,7 @@ extern struct md_thread *md_register_thread( extern void md_unregister_thread(struct md_thread **threadp); extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index c4f28133ef82..b88757cd0d1d 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -139,15 +139,8 @@ struct child { struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct btree_node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); @@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, } static int 
rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent; @@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent = dm_block_data(shadow_current(s)); @@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; @@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s, has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index f6d29e614ab7..e735a6d5a793 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -248,7 +248,8 @@ static struct dm_space_map ops = { .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, - .copy_root = sm_disk_copy_root + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL }; struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c 
b/drivers/md/persistent-data/dm-space-map-metadata.c index 906cf3df71af..1c959684caef 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -17,6 +17,55 @@ /*----------------------------------------------------------------*/ /* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* * Space map interface. * * The low level disk format is written using the standard btree and @@ -54,6 +103,8 @@ struct sm_metadata { unsigned allocated_this_transaction; unsigned nr_uncommitted; struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; + + struct threshold threshold; }; static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) @@ -144,12 +195,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm) kfree(smm); } -static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - DMERR("doesn't support extend"); - return -EINVAL; -} - static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); @@ -335,9 +380,19 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) { + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + int r = sm_metadata_new_block_(sm, b); if (r) DMERR("unable to allocate new metadata block"); + + r = sm_metadata_get_nr_free(sm, &count); + if (r) + DMERR("couldn't get free block count"); + + check_threshold(&smm->threshold, count); + return r; } @@ -357,6 +412,18 @@ static int sm_metadata_commit(struct dm_space_map *sm) return 0; } +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) { *result = sizeof(struct disk_sm_root); @@ -382,6 +449,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t return 0; } +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + static struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, @@ -395,7 +464,8 @@ static struct dm_space_map ops = { .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size 
= sm_metadata_root_size, - .copy_root = sm_metadata_copy_root + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback }; /*----------------------------------------------------------------*/ @@ -410,7 +480,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm) static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - DMERR("boostrap doesn't support extend"); + DMERR("bootstrap doesn't support extend"); return -EINVAL; } @@ -450,7 +520,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { - DMERR("boostrap doesn't support set_count"); + DMERR("bootstrap doesn't support set_count"); return -EINVAL; } @@ -491,7 +561,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm) static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) { - DMERR("boostrap doesn't support root_size"); + DMERR("bootstrap doesn't support root_size"); return -EINVAL; } @@ -499,7 +569,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, size_t max) { - DMERR("boostrap doesn't support copy_root"); + DMERR("bootstrap doesn't support copy_root"); return -EINVAL; } @@ -517,11 +587,42 @@ static struct dm_space_map bootstrap_ops = { .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, - .copy_root = sm_bootstrap_copy_root + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL }; /*----------------------------------------------------------------*/ +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r, i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + + /* + * Switch back to normal behaviour. + */ + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + for (i = old_len; !r && i < smm->begin; i++) + r = sm_ll_inc(&smm->ll, i, &ev); + + return r; +} + +/*----------------------------------------------------------------*/ + struct dm_space_map *dm_sm_metadata_init(void) { struct sm_metadata *smm; @@ -549,6 +650,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, smm->recursion_count = 0; smm->allocated_this_transaction = 0; smm->nr_uncommitted = 0; + threshold_init(&smm->threshold); memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); @@ -590,6 +692,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, smm->recursion_count = 0; smm->allocated_this_transaction = 0; smm->nr_uncommitted = 0; + threshold_init(&smm->threshold); memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); return 0; diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 1cbfc6b1638a..3e6d1153b7c4 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -9,6 +9,8 @@ #include "dm-block-manager.h" +typedef void (*dm_sm_threshold_fn)(void *context); + /* * struct dm_space_map keeps a record of how many times each block in a device * is referenced. It needs to be fixed on disk as part of the transaction. 
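The threshold helpers added to dm-space-map-metadata.c above are deliberately edge-triggered: check_threshold() only invokes the callback when the free-block count first drops to or below the registered threshold, not again on every allocation while it stays there. A minimal user-space sketch of the same state machine; the helper names mirror the patch, the main() driver is invented for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned long long block_t;

	struct threshold {
		bool threshold_set;
		bool value_set;
		block_t threshold;
		block_t current_value;
		void (*fn)(void *context);
		void *context;
	};

	static void threshold_init(struct threshold *t)
	{
		t->threshold_set = false;
		t->value_set = false;
	}

	static void set_threshold(struct threshold *t, block_t value,
				  void (*fn)(void *), void *context)
	{
		t->threshold_set = true;
		t->threshold = value;
		t->fn = fn;
		t->context = context;
	}

	static bool below_threshold(struct threshold *t, block_t value)
	{
		return t->threshold_set && value <= t->threshold;
	}

	static bool already_triggered(struct threshold *t)
	{
		return t->value_set && below_threshold(t, t->current_value);
	}

	static void check_threshold(struct threshold *t, block_t value)
	{
		if (below_threshold(t, value) && !already_triggered(t))
			t->fn(t->context);	/* fires on the crossing only */

		t->value_set = true;
		t->current_value = value;
	}

	static void low_space(void *context)
	{
		printf("low space callback, free=%llu\n", *(block_t *)context);
	}

	int main(void)
	{
		block_t free_blocks;
		struct threshold t;

		threshold_init(&t);
		set_threshold(&t, 10, low_space, &free_blocks);

		/* falling free-block count: callback fires exactly once, at 10 */
		for (free_blocks = 12; free_blocks > 5; free_blocks--)
			check_threshold(&t, free_blocks);
		return 0;
	}

dm_sm_register_threshold_callback(), declared a little further down in dm-space-map.h, is the public hook that feeds set_threshold(); space maps that do not implement it (the disk and bootstrap variants above leave the member NULL) simply return -EINVAL.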
@@ -59,6 +61,15 @@ struct dm_space_map { */ int (*root_size)(struct dm_space_map *sm, size_t *result); int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. + */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); }; /*----------------------------------------------------------------*/ @@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le return sm->copy_root(sm, copy_to_here_le, len); } +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + #endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 0505452de8d6..fcf65e512cf5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, { if (likely(is_power_of_2(chunk_sects))) { return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) - + (bio->bi_size >> 9)); + + bio_sectors(bio)); } else{ sector_t sector = bio->bi_sector; return chunk_sects >= (sector_div(sector, chunk_sects) - + (bio->bi_size >> 9)); + + bio_sectors(bio)); } } @@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || - bio->bi_idx != 0) + if (bio_segments(bio) > 1) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
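For the power-of-two case in is_io_in_chunk_boundary() above, the test chunk_sects >= (bio->bi_sector & (chunk_sects - 1)) + bio_sectors(bio) simply asks whether the request still fits inside the chunk it starts in. A small worked example; the numbers are chosen purely for illustration:

	#include <stdio.h>

	int main(void)
	{
		unsigned chunk_sects = 128;		/* 64 KiB chunks, power of two */
		unsigned long long bi_sector = 2040;	/* request start, in sectors */
		unsigned sectors = 16;			/* bio_sectors(bio) */

		unsigned offset_in_chunk = bi_sector & (chunk_sects - 1);	/* 120 */
		int fits = chunk_sects >= offset_in_chunk + sectors;		/* 128 >= 136 ? */

		/* prints: offset 120, end 136, fits=0 -> the bio must be split */
		printf("offset %u, end %u, fits=%d\n",
		       offset_in_chunk, offset_in_chunk + sectors, fits);
		return 0;
	}

The non-power-of-two branch performs the same comparison but derives the in-chunk offset with sector_div() instead of the mask.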
@@ -567,7 +566,7 @@ bad_map: printk("md/raid0:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); bio_io_error(bio); return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fd86b372692d..55951182af73 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - struct page *page; struct r1bio *r1_bio; struct bio *bio; int i, j; @@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) j = 1; while(j--) { bio = r1_bio->bios[j]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; + bio->bi_vcnt = RESYNC_PAGES; - bio->bi_io_vec[i].bv_page = page; - bio->bi_vcnt = i+1; - } + if (bio_alloc_pages(bio, gfp_flags)) + goto out_free_bio; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; -out_free_pages: - for (j=0 ; j < pi->raid_disks; j++) - for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) - put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); - j = -1; out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); @@ -267,7 +257,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio) (bio_data_dir(bio) == WRITE) ? "write" : "read", (unsigned long long) bio->bi_sector, (unsigned long long) bio->bi_sector + - (bio->bi_size >> 9) - 1); + bio_sectors(bio) - 1); call_bio_endio(r1_bio); } @@ -458,7 +448,7 @@ static void raid1_end_write_request(struct bio *bio, int error) " %llu-%llu\n", (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); + bio_sectors(mbio) - 1); call_bio_endio(r1_bio); } } @@ -925,7 +915,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) if (unlikely(!bvecs)) return; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { bvecs[i] = *bvec; bvecs[i].bv_page = alloc_page(GFP_NOIO); if (unlikely(!bvecs[i].bv_page)) @@ -981,7 +971,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); @@ -1018,7 +1013,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ if (bio_data_dir(bio) == WRITE && - bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && + bio_end_sector(bio) > mddev->suspend_lo && bio->bi_sector < mddev->suspend_hi) { /* As the suspend_* range is controlled by * userspace, we want an interruptible @@ -1029,7 +1024,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) flush_signals(current); prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || + if (bio_end_sector(bio) <= mddev->suspend_lo || bio->bi_sector >= mddev->suspend_hi) break; schedule(); @@ -1049,7 
+1044,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = bio->bi_size >> 9; + r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; @@ -1127,7 +1122,7 @@ read_again: r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector + sectors_handled; @@ -1284,14 +1279,10 @@ read_again: struct bio_vec *bvec; int j; - /* Yes, I really want the '__' version so that - * we clear any unused pointer in the io_vec, rather - * than leave them unchanged. This is important - * because when we come to free the pages, we won't - * know the original bi_idx, so we just free - * them all + /* + * We trimmed the bio, so _all is legit */ - __bio_for_each_segment(bvec, mbio, j, 0) + bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); @@ -1329,14 +1320,14 @@ read_again: /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. */ - if (sectors_handled < (bio->bi_size >> 9)) { + if (sectors_handled < bio_sectors(bio)) { r1_bio_write_done(r1_bio); /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments */ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector + sectors_handled; @@ -1862,7 +1853,7 @@ static int process_checks(struct r1bio *r1_bio) struct bio *sbio = r1_bio->bios[i]; int size; - if (r1_bio->bios[i]->bi_end_io != end_sync_read) + if (sbio->bi_end_io != end_sync_read) continue; if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { @@ -1887,16 +1878,15 @@ static int process_checks(struct r1bio *r1_bio) continue; } /* fixup the bio for reuse */ + bio_reset(sbio); sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + sbio->bi_end_io = end_sync_read; + sbio->bi_private = r1_bio; + size = sbio->bi_size; for (j = 0; j < vcnt ; j++) { struct bio_vec *bi; @@ -1907,10 +1897,9 @@ static int process_checks(struct r1bio *r1_bio) else bi->bv_len = size; size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); } + + bio_copy_data(sbio, pbio); } return 0; } @@ -1947,7 +1936,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) wbio->bi_rw = WRITE; wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } @@ -2059,32 +2048,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk, } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int 
submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; struct md_rdev *rdev = conf->mirrors[i].rdev; - int vcnt, idx; - struct bio_vec *vec; /* bio has the data to be written to device 'i' where * we just recently had a write error. @@ -2112,30 +2080,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) & ~(sector_t)(block_sectors - 1)) - sector; - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - vcnt = r1_bio->behind_page_count; - vec = r1_bio->behind_bvecs; - idx = 0; - while (vec[idx].bv_page == NULL) - idx++; - } else { - vcnt = r1_bio->master_bio->bi_vcnt; - vec = r1_bio->master_bio->bi_io_vec; - idx = r1_bio->master_bio->bi_idx; - } while (sect_to_write) { struct bio *wbio; if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors'*/ - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); - wbio->bi_sector = r1_bio->sector; + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + unsigned vcnt = r1_bio->behind_page_count; + struct bio_vec *vec = r1_bio->behind_bvecs; + + while (!vec->bv_page) { + vec++; + vcnt--; + } + + wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); + + wbio->bi_vcnt = vcnt; + } else { + wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + } + wbio->bi_rw = WRITE; - wbio->bi_vcnt = vcnt; + wbio->bi_sector = r1_bio->sector; wbio->bi_size = r1_bio->sectors << 9; - wbio->bi_idx = idx; md_trim_bio(wbio, sector - r1_bio->sector, sectors); wbio->bi_sector += rdev->data_offset; @@ -2284,8 +2254,7 @@ read_more: r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = mbio; - r1_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; + r1_bio->sectors = bio_sectors(mbio) - sectors_handled; r1_bio->state = 0; set_bit(R1BIO_ReadError, &r1_bio->state); r1_bio->mddev = mddev; @@ -2459,18 +2428,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; bio = r1_bio->bios[i]; - - /* take from bio_init */ - bio->bi_next = NULL; - bio->bi_flags &= ~(BIO_POOL_MASK-1); - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = READ; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - bio->bi_end_io = NULL; - bio->bi_private = NULL; + bio_reset(bio); rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || @@ -2901,6 +2859,7 @@ static int stop(struct mddev *mddev) if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); + safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 77b562d18a90..59d4daa5f4c7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1133,7 +1133,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + 
bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); @@ -1169,14 +1174,13 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. */ - if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) + if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) > chunk_sects && (conf->geo.near_copies < conf->geo.raid_disks || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || - bio->bi_idx != 0) + if (bio_segments(bio) > 1) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. @@ -1209,7 +1213,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) bad_map: printk("md/raid10:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); bio_io_error(bio); return; @@ -1224,7 +1228,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) */ wait_barrier(conf); - sectors = bio->bi_size >> 9; + sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_sector < conf->reshape_progress && bio->bi_sector + sectors > conf->reshape_progress) { @@ -1326,8 +1330,7 @@ read_again: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = ((bio->bi_size >> 9) - - sectors_handled); + r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->state = 0; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector + sectors_handled; @@ -1569,7 +1572,7 @@ retry_write: * after checking if we need to go around again. */ - if (sectors_handled < (bio->bi_size >> 9)) { + if (sectors_handled < bio_sectors(bio)) { one_write_done(r10_bio); /* We need another r10_bio. It has already been counted * in bio->bi_phys_segments. 
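Both the raid1 and raid10 make_request() paths above use the same sectors_handled bookkeeping when a single r1_bio/r10_bio cannot cover the whole master bio: each retry allocates a fresh request covering bio_sectors(bio) - sectors_handled and starting at bi_sector + sectors_handled. A standalone sketch of that arithmetic; the per-round limits are invented for the example:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long bi_sector = 1000;	/* master bio start */
		unsigned total = 48;			/* bio_sectors(master bio) */
		unsigned sectors_handled = 0;
		unsigned limit[] = { 16, 24, 48 };	/* hypothetical per-round limits */
		unsigned round = 0;

		while (sectors_handled < total) {
			unsigned long long start = bi_sector + sectors_handled;
			unsigned len = total - sectors_handled;

			if (len > limit[round])
				len = limit[round];
			/* prints rounds (1000,16) (1016,24) (1040,8) */
			printf("round %u: sector %llu, %u sectors\n",
			       round, start, len);
			sectors_handled += len;
			round++;
		}
		return 0;
	}
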
@@ -1577,7 +1580,7 @@ retry_write: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector + sectors_handled; @@ -2079,13 +2082,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + bio_reset(tbio); + tbio->bi_vcnt = vcnt; tbio->bi_size = r10_bio->sectors << 9; - tbio->bi_idx = 0; - tbio->bi_phys_segments = 0; - tbio->bi_flags &= ~(BIO_POOL_MASK - 1); - tbio->bi_flags |= 1 << BIO_UPTODATE; - tbio->bi_next = NULL; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_sector = r10_bio->devs[i].addr; @@ -2103,7 +2103,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); tbio->bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; @@ -2128,7 +2128,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, - tbio->bi_size >> 9); + bio_sectors(tbio)); generic_make_request(tbio); } @@ -2254,13 +2254,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) wbio2 = r10_bio->devs[1].repl_bio; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } if (wbio2 && wbio2->bi_end_io) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); md_sync_acct(conf->mirrors[d].replacement->bdev, - wbio2->bi_size >> 9); + bio_sectors(wbio2)); generic_make_request(wbio2); } } @@ -2531,25 +2531,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; @@ -2690,8 +2671,7 @@ read_more: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = mbio; - r10_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; + r10_bio->sectors = bio_sectors(mbio) - sectors_handled; r10_bio->state = 0; set_bit(R10BIO_ReadError, &r10_bio->state); @@ -2913,6 +2893,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; + /* + * Allow skipping a full rebuild for incremental assembly + * of a clean array, like RAID1 does. 
+ */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + return max_sector - sector_nr; + } + skipped: max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -3112,6 +3108,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, } } bio = r10_bio->devs[0].bio; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3136,6 +3133,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, rdev = mirror->rdev; if (!test_bit(In_sync, &rdev->flags)) { bio = r10_bio->devs[1].bio; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3164,6 +3162,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || bio == NULL || test_bit(Faulty, &rdev->flags)) break; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3262,7 +3261,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio->bi_end_io = NULL; + bio_reset(bio); clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) @@ -3299,6 +3298,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; + bio_reset(bio); clear_bit(BIO_UPTODATE, &bio->bi_flags); sector = r10_bio->devs[i].addr; @@ -3332,17 +3332,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, } } - for (bio = biolist; bio ; bio=bio->bi_next) { - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - if (bio->bi_end_io) - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - } - nr_sectors = 0; if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; @@ -3810,6 +3799,7 @@ static int stop(struct mddev *mddev) if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; @@ -4389,7 +4379,6 @@ read_more: read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); read_bio->bi_flags |= 1 << BIO_UPTODATE; read_bio->bi_vcnt = 0; - read_bio->bi_idx = 0; read_bio->bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; @@ -4413,17 +4402,14 @@ read_more: } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; + + bio_reset(b); b->bi_bdev = rdev2->bdev; b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; b->bi_rw = WRITE; - b->bi_flags &= ~(BIO_POOL_MASK - 1); - b->bi_flags |= 1 << BIO_UPTODATE; b->bi_next = blist; - b->bi_vcnt = 0; - b->bi_idx = 0; - b->bi_size = 0; blist = b; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ee2912889e7..9359828ffe26 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) */ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { - int sectors = bio->bi_size >> 9; + int sectors = 
bio_sectors(bio); if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) return bio->bi_next; else @@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -567,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ - bi->bi_rw = rw; - rbi->bi_rw = rw; - if (rw & WRITE) { - bi->bi_end_io = raid5_end_write_request; - rbi->bi_end_io = raid5_end_write_request; - } else - bi->bi_end_io = raid5_end_read_request; - rcu_read_lock(); rrdev = rcu_dereference(conf->disks[i].replacement); smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ @@ -649,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); + bio_reset(bi); bi->bi_bdev = rdev->bdev; + bi->bi_rw = rw; + bi->bi_end_io = (rw & WRITE) + ? raid5_end_write_request + : raid5_end_read_request; + bi->bi_private = sh; + pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, bi->bi_rw, i); @@ -663,17 +664,16 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_FLUSH; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); } if (rrdev) { @@ -683,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); + bio_reset(rbi); rbi->bi_bdev = rrdev->bdev; + rbi->bi_rw = rw; + BUG_ON(!(rw & WRITE)); + rbi->bi_end_io = raid5_end_write_request; + rbi->bi_private = sh; + pr_debug("%s: for %llu schedule op %ld on " "replacement disc %d\n", __func__, (unsigned long long)sh->sector, @@ -695,15 +701,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_sector = (sh->sector + rrdev->data_offset); - rbi->bi_flags = 1 << BIO_UPTODATE; - rbi->bi_idx = 0; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; - rbi->bi_next = NULL; - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(rbi); } if (!rdev && !rrdev) { @@ -1882,8 +1886,15 @@ static void raid5_end_write_request(struct bio *bi, int error) &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) + &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) + /* That was a successful write so make + * sure it looks like we already did + * a re-write. 
+ */ + set_bit(R5_ReWrite, &sh->dev[i].flags); + } } rdev_dec_pending(rdev, conf->mddev); @@ -2280,17 +2291,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int level = conf->level; if (rcw) { - /* if we are not expanding this is a proper write request, and - * there will be bios with new data to be drained into the - * stripe cache - */ - if (!expand) { - sh->reconstruct_state = reconstruct_state_drain_run; - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - } else - sh->reconstruct_state = reconstruct_state_run; - - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2303,6 +2303,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + if (!s->locked) + /* False alarm, nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); @@ -2311,11 +2326,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_prexor_drain_run; - set_bit(STRIPE_OP_PREXOR, &s->ops_request); - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); - for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) @@ -2330,6 +2340,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + if (!s->locked) + /* False alarm - nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); } /* keep the parity disk(s) locked while asynchronous operations @@ -2384,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) + if (bio_end_sector(*bip) > bi->bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) + if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2404,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && bi && bi->bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); + if (bio_end_sector(bi) >= sector) + sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); @@ -2564,6 +2581,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int i; clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + 
+		wake_up(&conf->wait_for_overlap);
 	s->syncing = 0;
 	s->replacing = 0;
 	/* There is nothing more to do for sync/check/repair.
@@ -2737,6 +2756,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 {
 	int i;
 	struct r5dev *dev;
+	int discard_pending = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -2765,9 +2785,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
-			}
-		} else if (test_bit(R5_Discard, &sh->dev[i].flags))
-			clear_bit(R5_Discard, &sh->dev[i].flags);
+			} else if (test_bit(R5_Discard, &dev->flags))
+				discard_pending = 1;
+		}
+	if (!discard_pending &&
+	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+		if (sh->qd_idx >= 0) {
+			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+		}
+		/* now that discard is done we can proceed with any sync */
+		clear_bit(STRIPE_DISCARD, &sh->state);
+		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+			set_bit(STRIPE_HANDLE, &sh->state);
+
+	}
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2826,8 +2860,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	set_bit(STRIPE_HANDLE, &sh->state);
 	if (rmw < rcw && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
-		blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
-			  (unsigned long long)sh->sector, rmw);
+		if (conf->mddev->queue)
+			blk_add_trace_msg(conf->mddev->queue,
+					  "raid5 rmw %llu %d",
+					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
@@ -2877,7 +2913,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			}
 		}
 	}
-	if (rcw)
+	if (rcw && conf->mddev->queue)
 		blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
 				  (unsigned long long)sh->sector,
 				  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3417,9 +3453,15 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
-		set_bit(STRIPE_SYNCING, &sh->state);
-		clear_bit(STRIPE_INSYNC, &sh->state);
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+		spin_lock(&sh->stripe_lock);
+		/* Cannot process 'sync' concurrently with 'discard' */
+		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+			set_bit(STRIPE_SYNCING, &sh->state);
+			clear_bit(STRIPE_INSYNC, &sh->state);
+		}
+		spin_unlock(&sh->stripe_lock);
 	}
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
@@ -3579,6 +3621,8 @@ static void handle_stripe(struct stripe_head *sh)
 	    test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+			wake_up(&conf->wait_for_overlap);
 	}
 
 	/* If the failed drives are just a ReadError, then we might need
@@ -3804,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
 	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
 	unsigned int chunk_sectors = mddev->chunk_sectors;
-	unsigned int bio_sectors = bio->bi_size >> 9;
+	unsigned int bio_sectors = bio_sectors(bio);
 
 	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
 		chunk_sectors = mddev->new_chunk_sectors;
@@ -3878,6 +3922,8 @@ static void raid5_align_endio(struct bio *bi, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error && uptodate) {
+		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+					 raid_bi, 0);
 		bio_endio(raid_bi, 0);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_stripe);
@@ -3894,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
 {
 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-	if ((bi->bi_size>>9) > queue_max_sectors(q))
+	if (bio_sectors(bi) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
 	if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3941,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 						    0,
 						    &dd_idx, NULL);
 
-	end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+	end_sector = bio_end_sector(align_bi);
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
 	if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -3964,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 		if (!bio_fits_rdev(align_bi) ||
-		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+		    is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
 			    &first_bad, &bad_sectors)) {
 			/* too big in some way, or has a known bad block */
 			bio_put(align_bi);
@@ -3982,9 +4028,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		atomic_inc(&conf->active_aligned_reads);
 		spin_unlock_irq(&conf->device_lock);
 
-		trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
-				      align_bi, disk_devt(mddev->gendisk),
-				      raid_bio->bi_sector);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+					      align_bi, disk_devt(mddev->gendisk),
+					      raid_bio->bi_sector);
 		generic_make_request(align_bi);
 		return 1;
 	} else {
@@ -4078,7 +4125,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 		}
 		spin_unlock_irq(&conf->device_lock);
 	}
-	trace_block_unplug(mddev->queue, cnt, !from_schedule);
+	if (mddev->queue)
+		trace_block_unplug(mddev->queue, cnt, !from_schedule);
 	kfree(cb);
 }
 
@@ -4141,6 +4189,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
+		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+		if (test_bit(STRIPE_SYNCING, &sh->state)) {
+			release_stripe(sh);
+			schedule();
+			goto again;
+		}
+		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
 		spin_lock_irq(&sh->stripe_lock);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4153,6 +4208,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				goto again;
 			}
 		}
+		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4216,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	}
 
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	last_sector = bi->bi_sector + (bi->bi_size>>9);
+	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 
@@ -4336,6 +4392,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
+		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+					 bi, 0);
 		bio_endio(bi, 0);
 	}
 }
@@ -4620,9 +4678,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
 		*skipped = 1;
 		return rv;
 	}
-	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
-	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
-	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    !conf->fullsync &&
+	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    sync_blocks >= STRIPE_SECTORS) {
 		/* we can skip this block, and probably more */
 		sync_blocks /= STRIPE_SECTORS;
 		*skipped = 1;
@@ -4679,7 +4738,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	sector = raid5_compute_sector(conf, logical_sector,
 				      0, &dd_idx, NULL);
-	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+	last_sector = bio_end_sector(raid_bio);
 
 	for (; logical_sector < last_sector;
 	     logical_sector += STRIPE_SECTORS,
@@ -4712,8 +4771,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		handled++;
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0)
+	if (remaining == 0) {
+		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+					 raid_bio, 0);
 		bio_endio(raid_bio, 0);
+	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..b0b663b119a8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,10 +221,6 @@ struct stripe_head {
 	struct stripe_operations {
 		int 		     target, target2;
 		enum sum_check_flags zero_sum_result;
-		#ifdef CONFIG_MULTICORE_RAID456
-		unsigned long	     request;
-		wait_queue_head_t    wait_for_ops;
-		#endif
 	} ops;
 	struct r5dev {
 		/* rreq and rvec are used for the replacement device when
@@ -323,6 +319,7 @@ enum {
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
 	STRIPE_ON_UNPLUG_LIST,
+	STRIPE_DISCARD,
 };
 
 /*
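
Note on the raid5.c hunks above: they serialize DISCARD handling against resync on a stripe. make_discard_request() backs off and retries while STRIPE_SYNCING is set, and handle_stripe() only claims STRIPE_SYNCING when no STRIPE_DISCARD is pending, both checks made under sh->stripe_lock. The following is a minimal userspace sketch of that mutual-exclusion rule, not kernel code: it uses pthreads, treats the flags as bit masks, and the names stripe_model, try_start_sync and try_start_discard are invented for illustration; only the flag names mirror the stripe state bits.

/*
 * Userspace model of the 'sync' vs 'discard' exclusion on a stripe.
 * Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { STRIPE_SYNCING = 1 << 0, STRIPE_DISCARD = 1 << 1 };

struct stripe_model {
	pthread_mutex_t lock;	/* stands in for sh->stripe_lock */
	unsigned long	state;	/* stands in for sh->state */
};

/* Mirrors handle_stripe(): only claim SYNCING if no discard is pending. */
static bool try_start_sync(struct stripe_model *sh)
{
	bool started = false;

	pthread_mutex_lock(&sh->lock);
	if (!(sh->state & STRIPE_DISCARD)) {
		sh->state |= STRIPE_SYNCING;
		started = true;
	}
	pthread_mutex_unlock(&sh->lock);
	return started;
}

/* Mirrors make_discard_request(): back off (caller retries) while syncing. */
static bool try_start_discard(struct stripe_model *sh)
{
	bool started = false;

	pthread_mutex_lock(&sh->lock);
	if (!(sh->state & STRIPE_SYNCING)) {
		sh->state |= STRIPE_DISCARD;
		started = true;
	}
	pthread_mutex_unlock(&sh->lock);
	return started;
}

int main(void)
{
	struct stripe_model sh = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("discard starts: %d\n", try_start_discard(&sh)); /* 1 */
	printf("sync starts:    %d\n", try_start_sync(&sh));    /* 0: blocked */

	pthread_mutex_lock(&sh.lock);
	sh.state &= ~STRIPE_DISCARD;	/* discard completed, flag cleared */
	pthread_mutex_unlock(&sh.lock);

	printf("sync retries:   %d\n", try_start_sync(&sh));    /* 1 */
	return 0;
}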
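
Several hunks also replace open-coded "bi_size >> 9" and "bi_sector + (bi_size >> 9)" with the bio_sectors() and bio_end_sector() accessors. The sketch below shows the arithmetic those helpers wrap; the struct and macros here are local stand-ins assuming the 512-byte sector convention, not the kernel's own definitions.

/* Standalone model of the sector-count helpers (not kernel code). */
#include <stdio.h>

typedef unsigned long long sector_t;

struct bio {
	sector_t     bi_sector;	/* first sector of the request */
	unsigned int bi_size;	/* remaining length in bytes */
};

/* request length in 512-byte sectors */
#define bio_sectors(bio)    ((bio)->bi_size >> 9)
/* first sector past the end of the request */
#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors(bio))

int main(void)
{
	struct bio bi = { .bi_sector = 2048, .bi_size = 64 * 1024 };

	/* open-coded form the diff removes ... */
	sector_t last_old = bi.bi_sector + (bi.bi_size >> 9);
	/* ... and the helper form it adds */
	sector_t last_new = bio_end_sector(&bi);

	printf("sectors=%u last=%llu/%llu\n",
	       bio_sectors(&bi), last_old, last_new); /* 128 2176/2176 */
	return 0;
}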