diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/super.c | 2 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 109 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 3 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 64 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 351 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 20 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 165 | ||||
-rw-r--r-- | drivers/md/dm.c | 12 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 96 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 95 | ||||
-rw-r--r-- | drivers/md/raid0.c | 2 | ||||
-rw-r--r-- | drivers/md/raid1.c | 6 | ||||
-rw-r--r-- | drivers/md/raid10.c | 20 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 7 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 |
18 files changed, 592 insertions, 382 deletions
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a296425a7270..f5dbb4e884d8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); - blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + blk_queue_write_cache(q, true, true); return 0; } diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7df6b4f1548a..d8129ec93ebd 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -46,7 +46,7 @@ static inline char *bmname(struct bitmap *bitmap) * allocated while we're using it */ static int bitmap_checkpage(struct bitmap_counts *bitmap, - unsigned long page, int create) + unsigned long page, int create, int no_hijack) __releases(bitmap->lock) __acquires(bitmap->lock) { @@ -90,6 +90,9 @@ __acquires(bitmap->lock) if (mappage == NULL) { pr_debug("md/bitmap: map page allocation failed, hijacking\n"); + /* We don't support hijack for cluster raid */ + if (no_hijack) + return -ENOMEM; /* failed - set the hijacked flag so that we can use the * pointer as a counter */ if (!bitmap->bp[page].map) @@ -322,7 +325,7 @@ __clear_page_buffers(struct page *page) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } static void free_buffers(struct page *page) { @@ -756,7 +759,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, bytes += sizeof(bitmap_super_t); num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); - offset = slot_number * (num_pages - 1); + offset = slot_number * num_pages; store->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); @@ -900,6 +903,11 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) struct page *page; void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) @@ -915,7 +923,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); } static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -924,6 +932,11 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) struct page *page; void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; page = filemap_get_page(&bitmap->storage, chunk); if (!page) @@ -935,8 +948,8 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1321,7 +1334,7 @@ __acquires(bitmap->lock) sector_t csize; int err; - err = bitmap_checkpage(bitmap, page, create); + err = bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) @@ -1594,6 +1607,27 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) } EXPORT_SYMBOL(bitmap_cond_end_sync); +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) +{ + struct bitmap *bitmap = mddev->bitmap; + sector_t sector, blocks = 0; + + for (sector = old_lo; sector < new_lo; ) { + bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); + + for (sector = old_hi; sector < new_hi; ) { + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); +} +EXPORT_SYMBOL(bitmap_sync_with_cluster); + static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the @@ -1673,6 +1707,9 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); @@ -1712,15 +1749,13 @@ void bitmap_destroy(struct mddev *mddev) if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - if (bitmap->sysfs_can_clear) - sysfs_put(bitmap->sysfs_can_clear); - bitmap_free(bitmap); } /* * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up + * once mddev->bitmap is set */ struct bitmap *bitmap_create(struct mddev *mddev, int slot) { @@ -1813,6 +1848,9 @@ int bitmap_load(struct mddev *mddev) if (!bitmap) goto out; + if (mddev_is_clustered(mddev)) + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, * so we should forget old cached info. @@ -1865,8 +1903,10 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, struct bitmap_counts *counts; struct bitmap *bitmap = bitmap_create(mddev, slot); - if (IS_ERR(bitmap)) + if (IS_ERR(bitmap)) { + bitmap_free(bitmap); return PTR_ERR(bitmap); + } rv = bitmap_init_from_disk(bitmap, 0); if (rv) @@ -1887,14 +1927,14 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, if (clear_bits) { bitmap_update_sb(bitmap); - /* Setting this for the ev_page should be enough. - * And we do not require both write_all and PAGE_DIRT either - */ + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); - bitmap_write_all(bitmap); + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); bitmap_unplug(bitmap); } + bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; err: @@ -2029,6 +2069,35 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, chunks << chunkshift); spin_lock_irq(&bitmap->counts.lock); + /* For cluster raid, need to pre-allocate bitmap */ + if (mddev_is_clustered(bitmap->mddev)) { + unsigned long page; + for (page = 0; page < pages; page++) { + ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); + if (ret) { + unsigned long k; + + /* deallocate the page memory */ + for (k = 0; k < page; k++) { + kfree(new_bp[k].map); + } + + /* restore some fields from old_counts */ + bitmap->counts.bp = old_counts.bp; + bitmap->counts.pages = old_counts.pages; + bitmap->counts.missing_pages = old_counts.pages; + bitmap->counts.chunkshift = old_counts.chunkshift; + bitmap->counts.chunks = old_counts.chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + + BITMAP_BLOCK_SHIFT); + blocks = old_counts.chunks << old_counts.chunkshift; + pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); + break; + } else + bitmap->counts.bp[page].count += 1; + } + } + for (block = 0; block < blocks; ) { bitmap_counter_t *bmc_old, *bmc_new; int set; @@ -2170,14 +2239,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len) else { mddev->bitmap = bitmap; rv = bitmap_load(mddev); - if (rv) { - bitmap_destroy(mddev); + if (rv) mddev->bitmap_info.offset = 0; - } } mddev->pers->quiesce(mddev, 0); - if (rv) + if (rv) { + bitmap_destroy(mddev); return rv; + } } } } diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 5e3fcd6ecf77..5b6dd63dda91 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -258,6 +258,9 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 27f2ef300f8b..3970cda10080 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -867,39 +867,55 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } -#define WRITE_LOCK(cmd) \ - down_write(&cmd->root_lock); \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ - up_write(&cmd->root_lock); \ - return -EINVAL; \ +static bool cmd_write_lock(struct dm_cache_metadata *cmd) +{ + down_write(&cmd->root_lock); + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { + up_write(&cmd->root_lock); + return false; } + return true; +} -#define WRITE_LOCK_VOID(cmd) \ - down_write(&cmd->root_lock); \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ - up_write(&cmd->root_lock); \ - return; \ - } +#define WRITE_LOCK(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return -EINVAL; \ + } while(0) + +#define WRITE_LOCK_VOID(cmd) \ + do { \ + if (!cmd_write_lock((cmd))) \ + return; \ + } while(0) #define WRITE_UNLOCK(cmd) \ - up_write(&cmd->root_lock) + up_write(&(cmd)->root_lock) -#define READ_LOCK(cmd) \ - down_read(&cmd->root_lock); \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ - up_read(&cmd->root_lock); \ - return -EINVAL; \ +static bool cmd_read_lock(struct dm_cache_metadata *cmd) +{ + down_read(&cmd->root_lock); + if (cmd->fail_io) { + up_read(&cmd->root_lock); + return false; } + return true; +} -#define READ_LOCK_VOID(cmd) \ - down_read(&cmd->root_lock); \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ - up_read(&cmd->root_lock); \ - return; \ - } +#define READ_LOCK(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return -EINVAL; \ + } while(0) + +#define READ_LOCK_VOID(cmd) \ + do { \ + if (!cmd_read_lock((cmd))) \ + return; \ + } while(0) #define READ_UNLOCK(cmd) \ - up_read(&cmd->root_lock) + up_read(&(cmd)->root_lock) int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2adf81d81fca..2c7ca258c4e4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1723,7 +1723,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern if (!dmi) { unsigned noio_flag; noio_flag = memalloc_noio_save(); - dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); memalloc_noio_restore(noio_flag); if (dmi) *param_flags |= DM_PARAMS_VMALLOC; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 677ba223e2ae..52baf8a5b0f4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -76,26 +76,18 @@ struct multipath { wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - - unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - bool queue_io:1; /* Must we queue all I/O? */ - bool queue_if_no_path:1; /* Queue I/O if last path fails? */ - bool saved_queue_if_no_path:1; /* Saved state during suspension */ - bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ - bool pg_init_disabled:1; /* pg_init is not currently allowed */ - bool pg_init_required:1; /* pg_init needs calling? */ - bool pg_init_delay_retry:1; /* Delay pg_init retry? */ + unsigned long flags; /* Multipath state flags */ unsigned pg_init_retries; /* Number of times to retry pg_init */ - unsigned pg_init_count; /* Number of times pg_init called */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ - struct work_struct trigger_event; + atomic_t nr_valid_paths; /* Total number of usable paths */ + atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ + atomic_t pg_init_count; /* Number of times pg_init called */ /* * We must use a mempool of dm_mpath_io structs so that we @@ -104,6 +96,7 @@ struct multipath { mempool_t *mpio_pool; struct mutex work_mutex; + struct work_struct trigger_event; }; /* @@ -122,6 +115,17 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); +/*----------------------------------------------- + * Multipath state flags. + *-----------------------------------------------*/ + +#define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ +#define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ +#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ +#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ +#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ +#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ +#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ /*----------------------------------------------- * Allocation routines @@ -189,7 +193,10 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - m->queue_io = true; + set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->nr_valid_paths, 0); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -274,17 +281,17 @@ static int __pg_init_all_paths(struct multipath *m) struct pgpath *pgpath; unsigned long pg_init_delay = 0; - if (m->pg_init_in_progress || m->pg_init_disabled) + if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) return 0; - m->pg_init_count++; - m->pg_init_required = false; + atomic_inc(&m->pg_init_count); + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); /* Check here to reset pg_init_required */ if (!m->current_pg) return 0; - if (m->pg_init_delay_retry) + if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags)) pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { @@ -293,65 +300,99 @@ static int __pg_init_all_paths(struct multipath *m) continue; if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, pg_init_delay)) - m->pg_init_in_progress++; + atomic_inc(&m->pg_init_in_progress); } - return m->pg_init_in_progress; + return atomic_read(&m->pg_init_in_progress); +} + +static int pg_init_all_paths(struct multipath *m) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + r = __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); + + return r; } -static void __switch_pg(struct multipath *m, struct pgpath *pgpath) +static void __switch_pg(struct multipath *m, struct priority_group *pg) { - m->current_pg = pgpath->pg; + m->current_pg = pg; /* Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { - m->pg_init_required = true; - m->queue_io = true; + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + set_bit(MPATHF_QUEUE_IO, &m->flags); } else { - m->pg_init_required = false; - m->queue_io = false; + clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); + clear_bit(MPATHF_QUEUE_IO, &m->flags); } - m->pg_init_count = 0; + atomic_set(&m->pg_init_count, 0); } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, - size_t nr_bytes) +static struct pgpath *choose_path_in_pg(struct multipath *m, + struct priority_group *pg, + size_t nr_bytes) { + unsigned long flags; struct dm_path *path; + struct pgpath *pgpath; path = pg->ps.type->select_path(&pg->ps, nr_bytes); if (!path) - return -ENXIO; + return ERR_PTR(-ENXIO); - m->current_pgpath = path_to_pgpath(path); + pgpath = path_to_pgpath(path); - if (m->current_pg != pg) - __switch_pg(m, m->current_pgpath); + if (unlikely(lockless_dereference(m->current_pg) != pg)) { + /* Only update current_pgpath if pg changed */ + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = pgpath; + __switch_pg(m, pg); + spin_unlock_irqrestore(&m->lock, flags); + } - return 0; + return pgpath; } -static void __choose_pgpath(struct multipath *m, size_t nr_bytes) +static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) { + unsigned long flags; struct priority_group *pg; + struct pgpath *pgpath; bool bypassed = true; - if (!m->nr_valid_paths) { - m->queue_io = false; + if (!atomic_read(&m->nr_valid_paths)) { + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } /* Were we instructed to switch PG? */ - if (m->next_pg) { + if (lockless_dereference(m->next_pg)) { + spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; + if (!pg) { + spin_unlock_irqrestore(&m->lock, flags); + goto check_current_pg; + } m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg, nr_bytes)) - return; + spin_unlock_irqrestore(&m->lock, flags); + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) - return; +check_current_pg: + pg = lockless_dereference(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } /* * Loop through priority groups until we find a valid path. @@ -363,34 +404,38 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg, nr_bytes)) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) { if (!bypassed) - m->pg_init_delay_retry = true; - return; + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + return pgpath; } } } while (bypassed--); failed: + spin_lock_irqsave(&m->lock, flags); m->current_pgpath = NULL; m->current_pg = NULL; + spin_unlock_irqrestore(&m->lock, flags); + + return NULL; } /* * Check whether bios must be queued in the device-mapper core rather * than here in the target. * - * m->lock must be held on entry. - * * If m->queue_if_no_path and m->saved_queue_if_no_path hold the * same value then we are not between multipath_presuspend() * and multipath_resume() calls and we have no need to check * for the DMF_NOFLUSH_SUSPENDING flag. */ -static int __must_push_back(struct multipath *m) +static int must_push_back(struct multipath *m) { - return (m->queue_if_no_path || - (m->queue_if_no_path != m->saved_queue_if_no_path && + return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || + ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && dm_noflush_suspending(m->ti))); } @@ -408,35 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, struct block_device *bdev; struct dm_mpath_io *mpio; - spin_lock_irq(&m->lock); - /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || !m->queue_io) - __choose_pgpath(m, nr_bytes); - - pgpath = m->current_pgpath; + pgpath = lockless_dereference(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (!__must_push_back(m)) + if (!must_push_back(m)) r = -EIO; /* Failed */ - goto out_unlock; - } else if (m->queue_io || m->pg_init_required) { - __pg_init_all_paths(m); - goto out_unlock; + return r; + } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || + test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + pg_init_all_paths(m); + return r; } mpio = set_mpio(m, map_context); if (!mpio) /* ENOMEM, requeue */ - goto out_unlock; + return r; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - spin_unlock_irq(&m->lock); - if (clone) { /* * Old request-based interface: allocated clone is passed in. @@ -468,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, &pgpath->path, nr_bytes); return DM_MAPIO_REMAPPED; - -out_unlock: - spin_unlock_irq(&m->lock); - - return r; } static int multipath_map(struct dm_target *ti, struct request *clone, @@ -503,11 +539,22 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, spin_lock_irqsave(&m->lock, flags); - if (save_old_value) - m->saved_queue_if_no_path = m->queue_if_no_path; + if (save_old_value) { + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } else { + if (queue_if_no_path) + set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + } + if (queue_if_no_path) + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); else - m->saved_queue_if_no_path = queue_if_no_path; - m->queue_if_no_path = queue_if_no_path; + clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) @@ -600,10 +647,10 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps goto bad; } - if (m->retain_attached_hw_handler || m->hw_handler_name) + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) q = bdev_get_queue(p->path.dev->bdev); - if (m->retain_attached_hw_handler) { + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { retain: attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); if (attached_handler_name) { @@ -808,7 +855,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - m->retain_attached_hw_handler = true; + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); continue; } @@ -884,6 +931,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, /* parse the priority groups */ while (as.argc) { struct priority_group *pg; + unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths); pg = parse_priority_group(&as, m); if (IS_ERR(pg)) { @@ -891,7 +939,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - m->nr_valid_paths += pg->nr_pgpaths; + nr_valid_paths += pg->nr_pgpaths; + atomic_set(&m->nr_valid_paths, nr_valid_paths); + list_add_tail(&pg->list, &m->priority_groups); pg_count++; pg->pg_num = pg_count; @@ -921,19 +971,14 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, static void multipath_wait_for_pg_init_completion(struct multipath *m) { DECLARE_WAITQUEUE(wait, current); - unsigned long flags; add_wait_queue(&m->pg_init_wait, &wait); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&m->lock, flags); - if (!m->pg_init_in_progress) { - spin_unlock_irqrestore(&m->lock, flags); + if (!atomic_read(&m->pg_init_in_progress)) break; - } - spin_unlock_irqrestore(&m->lock, flags); io_schedule(); } @@ -944,20 +989,16 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = true; - spin_unlock_irqrestore(&m->lock, flags); + set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); - spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = false; - spin_unlock_irqrestore(&m->lock, flags); + clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); } static void multipath_dtr(struct dm_target *ti) @@ -987,13 +1028,13 @@ static int fail_path(struct pgpath *pgpath) pgpath->is_active = false; pgpath->fail_count++; - m->nr_valid_paths--; + atomic_dec(&m->nr_valid_paths); if (pgpath == m->current_pgpath) m->current_pgpath = NULL; dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); + pgpath->path.dev->name, atomic_read(&m->nr_valid_paths)); schedule_work(&m->trigger_event); @@ -1011,6 +1052,7 @@ static int reinstate_path(struct pgpath *pgpath) int r = 0, run_queue = 0; unsigned long flags; struct multipath *m = pgpath->pg->m; + unsigned nr_valid_paths; spin_lock_irqsave(&m->lock, flags); @@ -1025,16 +1067,17 @@ static int reinstate_path(struct pgpath *pgpath) pgpath->is_active = true; - if (!m->nr_valid_paths++) { + nr_valid_paths = atomic_inc_return(&m->nr_valid_paths); + if (nr_valid_paths == 1) { m->current_pgpath = NULL; run_queue = 1; } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) - m->pg_init_in_progress++; + atomic_inc(&m->pg_init_in_progress); } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, - pgpath->path.dev->name, m->nr_valid_paths); + pgpath->path.dev->name, nr_valid_paths); schedule_work(&m->trigger_event); @@ -1152,8 +1195,9 @@ static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) - m->pg_init_required = true; + if (atomic_read(&m->pg_init_count) <= m->pg_init_retries && + !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) + set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); else limit_reached = true; @@ -1219,19 +1263,23 @@ static void pg_init_done(void *data, int errors) m->current_pgpath = NULL; m->current_pg = NULL; } - } else if (!m->pg_init_required) + } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg->bypassed = false; - if (--m->pg_init_in_progress) + if (atomic_dec_return(&m->pg_init_in_progress) > 0) /* Activations of other paths are still on going */ goto out; - if (m->pg_init_required) { - m->pg_init_delay_retry = delay_retry; + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + if (delay_retry) + set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + else + clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + if (__pg_init_all_paths(m)) goto out; } - m->queue_io = false; + clear_bit(MPATHF_QUEUE_IO, &m->flags); /* * Wake up any thread waiting to suspend. @@ -1287,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone, * clone bios for it and resubmit it later. */ int r = DM_ENDIO_REQUEUE; - unsigned long flags; if (!error && !clone->errors) return 0; /* I/O complete */ @@ -1298,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - spin_lock_irqsave(&m->lock, flags); - if (!m->nr_valid_paths) { - if (!m->queue_if_no_path) { - if (!__must_push_back(m)) + if (!atomic_read(&m->nr_valid_paths)) { + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (!must_push_back(m)) r = -EIO; } else { if (error == -EBADE) r = error; } } - spin_unlock_irqrestore(&m->lock, flags); return r; } @@ -1364,11 +1409,12 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); - m->queue_if_no_path = m->saved_queue_if_no_path; - spin_unlock_irqrestore(&m->lock, flags); + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + else + clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + smp_mb__after_atomic(); } /* @@ -1402,19 +1448,20 @@ static void multipath_status(struct dm_target *ti, status_type_t type, /* Features */ if (type == STATUSTYPE_INFO) - DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count); + DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags), + atomic_read(&m->pg_init_count)); else { - DMEMIT("%u ", m->queue_if_no_path + + DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + (m->pg_init_retries > 0) * 2 + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + - m->retain_attached_hw_handler); - if (m->queue_if_no_path) + test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); - if (m->retain_attached_hw_handler) + if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) DMEMIT("retain_attached_hw_handler "); } @@ -1563,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct multipath *m = ti->private; - unsigned long flags; + struct pgpath *current_pgpath; int r; - spin_lock_irqsave(&m->lock, flags); + current_pgpath = lockless_dereference(m->current_pgpath); + if (!current_pgpath) + current_pgpath = choose_pgpath(m, 0); - if (!m->current_pgpath) - __choose_pgpath(m, 0); - - if (m->current_pgpath) { - if (!m->queue_io) { - *bdev = m->current_pgpath->path.dev->bdev; - *mode = m->current_pgpath->path.dev->mode; + if (current_pgpath) { + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { + *bdev = current_pgpath->path.dev->bdev; + *mode = current_pgpath->path.dev->mode; r = 0; } else { /* pg_init has not started or completed */ @@ -1582,23 +1628,19 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } } else { /* No path is available */ - if (m->queue_if_no_path) + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; else r = -EIO; } - spin_unlock_irqrestore(&m->lock, flags); - if (r == -ENOTCONN) { - spin_lock_irqsave(&m->lock, flags); - if (!m->current_pg) { + if (!lockless_dereference(m->current_pg)) { /* Path status changed, redo selection */ - __choose_pgpath(m, 0); + (void) choose_pgpath(m, 0); } - if (m->pg_init_required) - __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + pg_init_all_paths(m); dm_table_run_md_queue_async(m->ti->table); } @@ -1649,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti) { bool busy = false, has_active = false; struct multipath *m = ti->private; - struct priority_group *pg; + struct priority_group *pg, *next_pg; struct pgpath *pgpath; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); /* pg_init in progress or no paths available */ - if (m->pg_init_in_progress || - (!m->nr_valid_paths && m->queue_if_no_path)) { - busy = true; - goto out; - } + if (atomic_read(&m->pg_init_in_progress) || + (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) + return true; + /* Guess which priority_group will be used at next mapping time */ - if (unlikely(!m->current_pgpath && m->next_pg)) - pg = m->next_pg; - else if (likely(m->current_pg)) - pg = m->current_pg; - else + pg = lockless_dereference(m->current_pg); + next_pg = lockless_dereference(m->next_pg); + if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = next_pg; + + if (!pg) { /* * We don't know which pg will be used at next mapping time. - * We don't call __choose_pgpath() here to avoid to trigger + * We don't call choose_pgpath() here to avoid to trigger * pg_init just by busy checking. * So we don't know whether underlying devices we will be using * at next mapping time are busy or not. Just try mapping. */ - goto out; + return busy; + } /* * If there is one non-busy active path at least, the path selector * will be able to select it. So we consider such a pg as not busy. */ busy = true; - list_for_each_entry(pgpath, &pg->pgpaths, list) + list_for_each_entry(pgpath, &pg->pgpaths, list) { if (pgpath->is_active) { has_active = true; if (!pgpath_busy(pgpath)) { @@ -1689,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti) break; } } + } - if (!has_active) + if (!has_active) { /* * No active path in this pg, so this pg won't be used and * the current_pg will be changed at next mapping time. * We need to try mapping to determine it. */ busy = false; - -out: - spin_unlock_irqrestore(&m->lock, flags); + } return busy; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a0901214aef5..52532745a50f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1037,6 +1037,11 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) if (!mddev->events && super_init_validation(mddev, rdev)) return -EINVAL; + if (le32_to_cpu(sb->features)) { + rs->ti->error = "Unable to assemble array: No feature flags supported yet"; + return -EINVAL; + } + /* Enable bitmap creation for RAID levels != 0 */ mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0; rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -1718,7 +1723,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9e8f0bef332..626a5ec04466 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1348,13 +1348,13 @@ static void dm_table_verify_integrity(struct dm_table *t) static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - unsigned flush = (*(unsigned *)data); + unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->flush_flags & flush); + return q && (q->queue_flags & flush); } -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; unsigned i = 0; @@ -1375,7 +1375,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) return true; if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, &flush)) + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) return true; } @@ -1506,7 +1506,7 @@ static bool dm_table_supports_discards(struct dm_table *t) void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { - unsigned flush = 0; + bool wc = false, fua = false; /* * Copy table's limits to the DM device's request_queue @@ -1518,12 +1518,12 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - if (dm_table_supports_flush(t, REQ_FLUSH)) { - flush |= REQ_FLUSH; - if (dm_table_supports_flush(t, REQ_FUA)) - flush |= REQ_FUA; + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { + wc = true; + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) + fua = true; } - blk_queue_flush(q, flush); + blk_queue_write_cache(q, wc, fua); if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 92237b6fa8cd..fc803d50f9f0 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -322,56 +322,6 @@ struct thin_c { /*----------------------------------------------------------------*/ -/** - * __blkdev_issue_discard_async - queue a discard with async completion - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour - * @parent_bio: parent discard bio that all sub discards get chained to - * - * Description: - * Asynchronously issue a discard request for the sectors in question. - */ -static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags, - struct bio *parent_bio) -{ - struct request_queue *q = bdev_get_queue(bdev); - int type = REQ_WRITE | REQ_DISCARD; - struct bio *bio; - - if (!q || !nr_sects) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secdiscard(q)) - return -EOPNOTSUPP; - type |= REQ_SECURE; - } - - /* - * Required bio_put occurs in bio_endio thanks to bio_chain below - */ - bio = bio_alloc(gfp_mask, 1); - if (!bio) - return -ENOMEM; - - bio_chain(bio, parent_bio); - - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_iter.bi_size = nr_sects << 9; - - submit_bio(type, bio); - - return 0; -} - static bool block_size_is_power_of_two(struct pool *pool) { return pool->sectors_per_block_shift >= 0; @@ -384,14 +334,55 @@ static sector_t block_to_sectors(struct pool *pool, dm_block_t b) (b * pool->sectors_per_block); } -static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, - struct bio *parent_bio) +/*----------------------------------------------------------------*/ + +struct discard_op { + struct thin_c *tc; + struct blk_plug plug; + struct bio *parent_bio; + struct bio *bio; +}; + +static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent) +{ + BUG_ON(!parent); + + op->tc = tc; + blk_start_plug(&op->plug); + op->parent_bio = parent; + op->bio = NULL; +} + +static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) { + struct thin_c *tc = op->tc; sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len, - GFP_NOWAIT, 0, parent_bio); + return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, + GFP_NOWAIT, REQ_WRITE | REQ_DISCARD, &op->bio); +} + +static void end_discard(struct discard_op *op, int r) +{ + if (op->bio) { + /* + * Even if one of the calls to issue_discard failed, we + * need to wait for the chain to complete. + */ + bio_chain(op->bio, op->parent_bio); + submit_bio(REQ_WRITE | REQ_DISCARD, op->bio); + } + + blk_finish_plug(&op->plug); + + /* + * Even if r is set, there could be sub discards in flight that we + * need to wait for. + */ + if (r && !op->parent_bio->bi_error) + op->parent_bio->bi_error = r; + bio_endio(op->parent_bio); } /*----------------------------------------------------------------*/ @@ -632,7 +623,7 @@ static void error_retry_list(struct pool *pool) { int error = get_pool_io_error_code(pool); - return error_retry_list_with_code(pool, error); + error_retry_list_with_code(pool, error); } /* @@ -1006,24 +997,28 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) +/*----------------------------------------------------------------*/ + +static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) { /* * We've already unmapped this range of blocks, but before we * passdown we have to check that these blocks are now unused. */ - int r; + int r = 0; bool used = true; struct thin_c *tc = m->tc; struct pool *pool = tc->pool; dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; + struct discard_op op; + begin_discard(&op, tc, m->bio); while (b != end) { /* find start of unmapped run */ for (; b < end; b++) { r = dm_pool_block_is_used(pool->pmd, b, &used); if (r) - return r; + goto out; if (!used) break; @@ -1036,20 +1031,20 @@ static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) for (e = b + 1; e != end; e++) { r = dm_pool_block_is_used(pool->pmd, e, &used); if (r) - return r; + goto out; if (used) break; } - r = issue_discard(tc, b, e, m->bio); + r = issue_discard(&op, b, e); if (r) - return r; + goto out; b = e; } - - return 0; +out: + end_discard(&op, r); } static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) @@ -1059,20 +1054,21 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) struct pool *pool = tc->pool; r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); - if (r) + if (r) { metadata_operation_failed(pool, "dm_thin_remove_range", r); + bio_io_error(m->bio); - else if (m->maybe_shared) - r = passdown_double_checking_shared_status(m); - else - r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio); + } else if (m->maybe_shared) { + passdown_double_checking_shared_status(m); + + } else { + struct discard_op op; + begin_discard(&op, tc, m->bio); + r = issue_discard(&op, m->data_block, + m->data_block + (m->virt_end - m->virt_begin)); + end_discard(&op, r); + } - /* - * Even if r is set, there could be sub discards in flight that we - * need to wait for. - */ - m->bio->bi_error = r; - bio_endio(m->bio); cell_defer_no_holder(tc, m->cell); mempool_free(m, pool->mapping_pool); } @@ -1494,17 +1490,6 @@ static void process_discard_cell_no_passdown(struct thin_c *tc, pool->process_prepared_discard(m); } -/* - * __bio_inc_remaining() is used to defer parent bios's end_io until - * we _know_ all chained sub range discard bios have completed. - */ -static inline void __bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, struct bio *bio) { @@ -1554,13 +1539,13 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t /* * The parent bio must not complete before sub discard bios are - * chained to it (see __blkdev_issue_discard_async's bio_chain)! + * chained to it (see end_discard's bio_chain)! * * This per-mapping bi_remaining increment is paired with * the implicit decrement that occurs via bio_endio() in - * process_prepared_discard_{passdown,no_passdown}. + * end_discard(). */ - __bio_inc_remaining(bio); + bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) pool->process_prepared_discard(m); @@ -3899,7 +3884,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 18, 0}, + .version = {1, 19, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4273,7 +4258,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 18, 0}, + .version = {1, 19, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index be4905769a45..1b2f96205361 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -674,7 +674,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +static void free_tio(struct dm_target_io *tio) { bio_put(&tio->clone); } @@ -1055,7 +1055,7 @@ static void clone_endio(struct bio *bio) !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) disable_write_same(md); - free_tio(md, tio); + free_tio(tio); dec_pending(io, error); } @@ -1517,7 +1517,6 @@ static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; - struct mapped_device *md; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -1540,9 +1539,8 @@ static void __map_bio(struct dm_target_io *tio) generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { /* error the io and bail out, or requeue it if needed */ - md = tio->io->md; dec_pending(tio->io, r); - free_tio(md, tio); + free_tio(tio); } else if (r != DM_MAPIO_SUBMITTED) { DMWARN("unimplemented target map return value: %d", r); BUG(); @@ -1662,8 +1660,10 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; r = clone_bio(tio, bio, sector, *len); - if (r < 0) + if (r < 0) { + free_tio(tio); break; + } __map_bio(tio); } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index dd97d4245822..41573f1f626f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -61,6 +61,10 @@ struct resync_info { * the lock. */ #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 +/* We should receive message after node joined cluster and + * set up all the related infos such as bitmap and personality */ +#define MD_CLUSTER_ALREADY_IN_CLUSTER 6 +#define MD_CLUSTER_PENDING_RECV_EVENT 7 struct md_cluster_info { @@ -85,6 +89,9 @@ struct md_cluster_info { struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; + /* record the region in RESYNCING message */ + sector_t sync_low; + sector_t sync_hi; }; enum msg_type { @@ -284,11 +291,14 @@ static void recover_bitmaps(struct md_thread *thread) goto dlm_unlock; } if (hi > 0) { - /* TODO:Wait for current resync to get over */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; - md_check_recovery(mddev); + /* wake up thread to continue resync in case resync + * is not finished */ + if (mddev->recovery_cp != MaxSector) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } } dlm_unlock: dlm_unlock_sync(bm_lockres); @@ -370,8 +380,12 @@ static void ack_bast(void *arg, int mode) struct dlm_lock_resource *res = arg; struct md_cluster_info *cinfo = res->mddev->cluster_info; - if (mode == DLM_LOCK_EX) - md_wakeup_thread(cinfo->recv_thread); + if (mode == DLM_LOCK_EX) { + if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); + else + set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); + } } static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) @@ -408,6 +422,30 @@ static void process_suspend_info(struct mddev *mddev, md_wakeup_thread(mddev->thread); return; } + + /* + * The bitmaps are not same for different nodes + * if RESYNCING is happening in one node, then + * the node which received the RESYNCING message + * probably will perform resync with the region + * [lo, hi] again, so we could reduce resync time + * a lot if we can ensure that the bitmaps among + * different nodes are match up well. + * + * sync_low/hi is used to record the region which + * arrived in the previous RESYNCING message, + * + * Call bitmap_sync_with_cluster to clear + * NEEDED_MASK and set RESYNC_MASK since + * resync thread is running in another node, + * so we don't need to do the resync again + * with the same section */ + bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, + lo, hi); + cinfo->sync_low = lo; + cinfo->sync_hi = hi; + s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); if (!s) return; @@ -482,11 +520,13 @@ static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } -static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) +static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { + int ret = 0; + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), "node %d received it's own msg\n", le32_to_cpu(msg->slot))) - return; + return -1; switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: process_metadata_update(mddev, msg); @@ -509,9 +549,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) __recover_slot(mddev, le32_to_cpu(msg->slot)); break; default: + ret = -1; pr_warn("%s:%d Received unknown message from %d\n", __func__, __LINE__, msg->slot); } + return ret; } /* @@ -535,7 +577,9 @@ static void recv_daemon(struct md_thread *thread) /* read lvb and wake up thread to process this message_lockres */ memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); - process_recvd_msg(thread->mddev, &msg); + ret = process_recvd_msg(thread->mddev, &msg); + if (ret) + goto out; /*release CR on ack_lockres*/ ret = dlm_unlock_sync(ack_lockres); @@ -549,6 +593,7 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); if (unlikely(ret != 0)) pr_info("lock CR on ack failed return %d\n", ret); +out: /*release CR on message_lockres*/ ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) @@ -778,17 +823,24 @@ static int join(struct mddev *mddev, int nodes) cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); if (!cinfo->token_lockres) goto err; - cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); - if (!cinfo->ack_lockres) - goto err; cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); if (!cinfo->no_new_dev_lockres) goto err; + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (ret) { + ret = -EAGAIN; + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); + goto err; + } + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); + if (!cinfo->ack_lockres) + goto err; /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", ret); + dlm_unlock_sync(cinfo->token_lockres); /* get sync CR lock on no-new-dev. */ if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); @@ -809,12 +861,10 @@ static int join(struct mddev *mddev, int nodes) if (!cinfo->resync_lockres) goto err; - ret = gather_all_resync_info(mddev, nodes); - if (ret) - goto err; - return 0; err: + md_unregister_thread(&cinfo->recovery_thread); + md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); @@ -828,6 +878,19 @@ err: return ret; } +static void load_bitmaps(struct mddev *mddev, int total_slots) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + /* load all the node's bitmap info for resync */ + if (gather_all_resync_info(mddev, total_slots)) + pr_err("md-cluster: failed to gather all resyn infos\n"); + set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); + /* wake up recv thread in case something need to be handled */ + if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) + md_wakeup_thread(cinfo->recv_thread); +} + static void resync_bitmap(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -937,7 +1000,6 @@ static void metadata_update_cancel(struct mddev *mddev) static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); } @@ -967,7 +1029,6 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; dlm_unlock_sync(cinfo->resync_lockres); return resync_info_update(mddev, 0, 0); } @@ -1171,6 +1232,7 @@ static struct md_cluster_operations cluster_ops = { .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, + .load_bitmaps = load_bitmaps, .gather_bitmaps = gather_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 45ce6c97d8bd..e765499ba591 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -23,6 +23,7 @@ struct md_cluster_operations { void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*load_bitmaps)(struct mddev *mddev, int total_slots); int (*gather_bitmaps)(struct md_rdev *rdev); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index c068f171b4eb..866825f10b4c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -284,6 +284,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) * go away inside make_request */ sectors = bio_sectors(bio); + /* bio could be mergeable after passing to underlayer */ + bio->bi_rw &= ~REQ_NOMERGE; mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); @@ -305,7 +307,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(current == mddev->thread->tsk); + WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); if (mddev->suspended++) return; synchronize_rcu(); @@ -718,6 +720,7 @@ static void super_written(struct bio *bio) if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); + rdev_dec_pending(rdev, mddev); bio_put(bio); } @@ -732,6 +735,8 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, */ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + atomic_inc(&rdev->nr_pending); + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); @@ -2286,19 +2291,24 @@ void md_update_sb(struct mddev *mddev, int force_change) return; } +repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | + BIT(MD_CHANGE_CLEAN)); return; } } -repeat: + /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && @@ -2425,15 +2435,14 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock(&mddev->lock); + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); + if (mddev->in_sync != sync_req || - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) /* have to write it out again */ - spin_unlock(&mddev->lock); goto repeat; - } - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -2447,9 +2456,6 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } - - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -4811,6 +4817,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; + /* cluster raid doesn't support change array_sectors */ + if (mddev_is_clustered(mddev)) + return -EINVAL; + if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) sectors = mddev->pers->size(mddev, 0, 0); @@ -5034,7 +5044,7 @@ static int md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really * remove it now. @@ -6432,6 +6442,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) int rv; int fit = (num_sectors == 0); + /* cluster raid doesn't support update size */ + if (mddev_is_clustered(mddev)) + return -EINVAL; + if (mddev->pers->resize == NULL) return -EINVAL; /* The "num_sectors" is the number of sectors of each device that @@ -6883,7 +6897,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays - * on if we are re-adding a preexisting device. + * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC. */ if (mddev->pers) { @@ -7780,7 +7794,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; - bool cluster_resync_finished = false; + int ret; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7790,6 +7804,19 @@ void md_do_sync(struct md_thread *thread) return; } + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) + goto skip; + + if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + && ((unsigned long long)mddev->curr_resync_completed + < (unsigned long long)mddev->resync_max_sectors)) + goto skip; + } + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; @@ -8084,11 +8111,6 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality and other nodes that we are finished */ - if (mddev_is_clustered(mddev)) { - md_cluster_ops->resync_finish(mddev); - cluster_resync_finished = true; - } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -8125,12 +8147,18 @@ void md_do_sync(struct md_thread *thread) } } skip: - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev_is_clustered(mddev) && - test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !cluster_resync_finished) + ret == 0) { + /* set CHANGE_PENDING here since maybe another + * update is needed, so other nodes are informed */ + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); md_cluster_ops->resync_finish(mddev); + } else + set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8221,18 +8249,9 @@ static void md_start_sync(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, del_work); int ret = 0; - if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); - if (ret) { - mddev->sync_thread = NULL; - goto out; - } - } - mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); -out: if (!mddev->sync_thread) { if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) printk(KERN_ERR "%s: could not start resync" @@ -8531,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape); int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { + struct mddev *mddev = rdev->mddev; int rv; if (is_new) s += rdev->new_data_offset; @@ -8540,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else @@ -8675,6 +8695,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); + /* wakeup mddev->thread here, so array could + * perform resync with the new activated disk */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } /* device faulty * We just want to do the minimum to mark the disk diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2ea12c6bf659..34783a3c8b3c 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -70,7 +70,6 @@ static void dump_zones(struct mddev *mddev) (unsigned long long)zone_size>>1); zone_start = conf->strip_zone[j].zone_end; } - printk(KERN_INFO "\n"); } static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) @@ -85,6 +84,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); unsigned short blksize = 512; + *private_conf = ERR_PTR(-ENOMEM); if (!conf) return -ENOMEM; rdev_for_each(rdev1, mddev) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 39fb21e048e6..c7c8cde0ab21 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -570,7 +570,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (best_dist_disk < 0) { if (is_badblock(rdev, this_sector, sectors, &first_bad, &bad_sectors)) { - if (first_bad < this_sector) + if (first_bad <= this_sector) /* Cannot use this */ continue; best_good_sectors = first_bad - this_sector; @@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e3fd725d5c4d..c7de2a53e625 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags)); @@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" @@ -3782,8 +3782,10 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; @@ -4593,8 +4595,10 @@ static void raid10_finish_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = size; - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } else { int d; for (d = conf->geo.raid_disks ; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9531f5f05b93..e889e2deb7b3 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * in_teardown check workaround this issue. */ if (!log->in_teardown) { - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags) || @@ -1188,6 +1188,7 @@ ioerr: int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { + struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; if (PAGE_SIZE != 4096) @@ -1197,7 +1198,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) return -ENOMEM; log->rdev = rdev; - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8ab8b65e1741..8959e6dd31dd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -3502,8 +3502,6 @@ returnbi: dev = &sh->dev[i]; } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; - WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); - WARN_ON(dev->page != dev->orig_page); } r5l_stripe_write_finished(sh); @@ -7574,8 +7572,10 @@ static void raid5_finish_reshape(struct mddev *mddev) if (mddev->delta_disks > 0) { md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } else { int d; spin_lock_irq(&conf->device_lock); |