diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 634 |
1 files changed, 384 insertions, 250 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ac251e91e6..06d7279bdd04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; -/* - * Stripe cache - */ - -#define NR_STRIPES 256 -#define STRIPE_SIZE PAGE_SIZE -#define STRIPE_SHIFT (PAGE_SHIFT - 9) -#define STRIPE_SECTORS (STRIPE_SIZE>>9) -#define IO_THRESHOLD 1 -#define BYPASS_THRESHOLD 1 -#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) -#define HASH_MASK (NR_HASH - 1) -#define MAX_STRIPE_BATCH 8 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { @@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) local_irq_enable(); } -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This function is used to determine the 'next' bio in the list, given the sector - * of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - int sectors = bio_sectors(bio); - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - -/* - * We maintain a biased count of active stripes in the bottom 16 bits of - * bi_phys_segments, and a count of processed stripes in the upper 16 bits - */ -static inline int raid5_bi_processed_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - return (atomic_read(segments) >> 16) & 0xffff; -} - -static inline int raid5_dec_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - return atomic_sub_return(1, segments) & 0xffff; -} - -static inline void raid5_inc_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - atomic_inc(segments); -} - -static inline void raid5_set_bi_processed_stripes(struct bio *bio, - unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - int old, new; - - do { - old = atomic_read(segments); - new = (old & 0xffff) | (cnt << 16); - } while (atomic_cmpxchg(segments, old, new) != old); -} - -static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - atomic_set(segments, cnt); -} - /* Find first data disk in a raid6 stripe */ static inline int raid6_d0(struct stripe_head *sh) { @@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) { + int i; + int injournal = 0; /* number of date pages with R5_InJournal */ + BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); + + if (r5c_is_writeback(conf->log)) + for (i = sh->disks; i--; ) + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + injournal++; + /* + * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with + * data in journal, so they are not released to cached lists + */ + if (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + } + if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) @@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) - list_add_tail(&sh->lru, temp_inactive_list); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + if (!r5c_is_writeback(conf->log)) + list_add_tail(&sh->lru, temp_inactive_list); + else { + WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); + if (injournal == 0) + list_add_tail(&sh->lru, temp_inactive_list); + else if (injournal == conf->raid_disks - conf->max_degraded) { + /* full stripe */ + if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_cached_full_stripes); + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_dec(&conf->r5c_cached_partial_stripes); + list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); + r5c_check_cached_full_stripe(conf); + } else { + /* partial stripe */ + if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, + &sh->state)) + atomic_inc(&conf->r5c_cached_partial_stripes); + list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); + } + } + } } } @@ -541,7 +511,7 @@ retry: if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", + pr_err("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); @@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, } if (noblock && sh == NULL) break; + + r5c_check_stripe_cache_usage(conf); if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); wait_event_lock_irq( conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && @@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); - if (r5l_write_stripe(conf->log, sh) == 0) - return; + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* writing out phase */ + if (s->waiting_extra_page) + return; + if (r5l_write_stripe(conf->log, sh) == 0) + return; + } else { /* caching phase */ + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { + r5c_cache_data(conf->log, sh, s); + return; + } + } + for (i = disks; i--; ) { int op, op_flags = 0; int replace_only = 0; @@ -913,7 +897,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { op = REQ_OP_WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; if (test_bit(R5_Discard, &sh->dev[i].flags)) op = REQ_OP_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) @@ -977,7 +961,7 @@ again: if (bad < 0) { set_bit(BlockedBadBlocks, &rdev->flags); if (!conf->mddev->external && - conf->mddev->flags) { + conf->mddev->sb_flags) { /* It is very unlikely, but we might * still need to write out the * bad block log - better give it @@ -1115,7 +1099,7 @@ again: static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page **page, sector_t sector, struct dma_async_tx_descriptor *tx, - struct stripe_head *sh) + struct stripe_head *sh, int no_skipcopy) { struct bio_vec bvl; struct bvec_iter iter; @@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, if (frombio) { if (sh->raid_conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE) + clen == STRIPE_SIZE && + !no_skipcopy) *page = bio_page; else tx = async_memcpy(*page, bio_page, page_offset, @@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, &dev->page, - dev->sector, tx, sh); + dev->sector, tx, sh, 0); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs, if (i == sh->qd_idx || i == sh->pd_idx || (srctype == SYNDROME_SRC_ALL) || (srctype == SYNDROME_SRC_WANT_DRAIN && - test_bit(R5_Wantdrain, &dev->flags)) || + (test_bit(R5_Wantdrain, &dev->flags) || + test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) - srcs[slot] = sh->dev[i].page; + dev->written)) { + if (test_bit(R5_InJournal, &dev->flags)) + srcs[slot] = sh->dev[i].orig_page; + else + srcs[slot] = sh->dev[i].page; + } i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + if (r5c_is_writeback(sh->raid_conf->log)) + /* + * raid5-cache write back uses orig_page during prexor. + * After prexor, it is time to free orig_page + */ + r5c_release_extra_page(sh); } static struct dma_async_tx_descriptor * @@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_Wantdrain, &dev->flags)) + if (test_bit(R5_InJournal, &dev->flags)) + xor_srcs[count++] = dev->orig_page; + else if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { + struct r5conf *conf = sh->raid_conf; int disks = sh->disks; int i; struct stripe_head *head_sh = sh; @@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) again: dev = &sh->dev[i]; + /* + * clear R5_InJournal, so when rewriting a page in + * journal, it is not skipped by r5l_log_stripe() + */ + clear_bit(R5_InJournal, &dev->flags); spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; @@ -1637,8 +1642,10 @@ again: set_bit(R5_Discard, &dev->flags); else { tx = async_copy_data(1, wbi, &dev->page, - dev->sector, tx, sh); - if (dev->page != dev->orig_page) { + dev->sector, tx, sh, + r5c_is_writeback(conf->log)); + if (dev->page != dev->orig_page && + !r5c_is_writeback(conf->log)) { set_bit(R5_SkipCopy, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); clear_bit(R5_OVERWRITE, &dev->flags); @@ -1746,7 +1753,8 @@ again: xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (head_sh->dev[i].written) + if (head_sh->dev[i].written || + test_bit(R5_InJournal, &head_sh->dev[i].flags)) xor_srcs[count++] = dev->page; } } else { @@ -2000,17 +2008,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, spin_lock_init(&sh->batch_lock); INIT_LIST_HEAD(&sh->batch_list); INIT_LIST_HEAD(&sh->lru); + INIT_LIST_HEAD(&sh->r5c); + INIT_LIST_HEAD(&sh->log_list); atomic_set(&sh->count, 1); + sh->log_start = MaxSector; for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_max_vecs = 1; - - bio_init(&dev->rreq); - dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_max_vecs = 1; + bio_init(&dev->req, &dev->vec, 1); + bio_init(&dev->rreq, &dev->rvec, 1); } } return sh; @@ -2245,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize) */ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); if (ndisks) { - for (i=0; i<conf->raid_disks; i++) + for (i = 0; i < conf->pool_size; i++) ndisks[i] = conf->disks[i]; - kfree(conf->disks); - conf->disks = ndisks; + + for (i = conf->pool_size; i < newsize; i++) { + ndisks[i].extra_page = alloc_page(GFP_NOIO); + if (!ndisks[i].extra_page) + err = -ENOMEM; + } + + if (err) { + for (i = conf->pool_size; i < newsize; i++) + if (ndisks[i].extra_page) + put_page(ndisks[i].extra_page); + kfree(ndisks); + } else { + kfree(conf->disks); + conf->disks = ndisks; + } } else err = -ENOMEM; @@ -2347,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi) * replacement device. We just fail those on * any error */ - printk_ratelimited( - KERN_INFO - "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", + pr_info_ratelimited( + "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, (unsigned long long)s, bdevname(rdev->bdev, b)); @@ -2370,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error on replacement device " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error on replacement device (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); else if (conf->mddev->degraded >= conf->max_degraded) { set_bad = 1; - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error not correctable (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ set_bad = 1; - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) - printk(KERN_WARNING - "md/raid:%s: Too many read errors, failing device %s.\n", + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; @@ -2531,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); - printk(KERN_ALERT - "md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } /* @@ -2861,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", - mdname(conf->mddev)); + pr_warn("md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; @@ -2877,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int level = conf->level; if (rcw) { + /* + * In some cases, handle_stripe_dirtying initially decided to + * run rmw and allocates extra page for prexor. However, rcw is + * cheaper later on. We need to free the extra page now, + * because we won't be able to do that in ops_complete_prexor(). + */ + r5c_release_extra_page(sh); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2887,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, if (!expand) clear_bit(R5_UPTODATE, &dev->flags); s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; } } /* if we are not expanding this is a proper write request, and @@ -2926,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; } } if (!s->locked) @@ -3569,10 +3592,10 @@ unhash: break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } -static void handle_stripe_dirtying(struct r5conf *conf, - struct stripe_head *sh, - struct stripe_head_state *s, - int disks) +static int handle_stripe_dirtying(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; @@ -3597,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || + !((test_bit(R5_UPTODATE, &dev->flags) && + (!test_bit(R5_InJournal, &dev->flags) || + dev->page != dev->orig_page)) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; @@ -3611,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { + test_bit(R5_InJournal, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } } + pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); @@ -3629,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && + if (test_bit(R5_InJournal, &dev->flags) && + dev->page == dev->orig_page && + !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { + /* alloc page for prexor */ + struct page *p = alloc_page(GFP_NOIO); + + if (p) { + dev->orig_page = p; + continue; + } + + /* + * alloc_page() failed, try use + * disk_info->extra_page + */ + if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, + &conf->cache_state)) { + r5c_use_extra_page(sh); + break; + } + + /* extra_page in use, add to delayed_list */ + set_bit(STRIPE_DELAYED, &sh->state); + s->waiting_extra_page = 1; + return -EAGAIN; + } + } + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if ((dev->towrite || + i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags)) && + !((test_bit(R5_UPTODATE, &dev->flags) && + (!test_bit(R5_InJournal, &dev->flags) || + dev->page != dev->orig_page)) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -3658,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_InJournal, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; if (test_bit(R5_Insync, &dev->flags) && @@ -3697,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, */ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) + !test_bit(STRIPE_BIT_DELAY, &sh->state))) schedule_reconstruction(sh, s, rcw == 0, 0); + return 0; } static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, @@ -3782,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, case check_state_compute_run: break; default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + pr_err("%s: unknown check_state: %d sector: %llu\n", __func__, sh->check_state, (unsigned long long) sh->sector); BUG(); @@ -3946,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, case check_state_compute_run: break; default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", - __func__, sh->check_state, - (unsigned long long) sh->sector); + pr_warn("%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); BUG(); } } @@ -4188,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; } + + if (test_bit(R5_InJournal, &dev->flags)) + s->injournal++; + if (test_bit(R5_InJournal, &dev->flags) && dev->written) + s->just_cached++; } if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, @@ -4416,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && (i == sh->pd_idx || i == sh->qd_idx || - dev->written)) { + dev->written || test_bit(R5_InJournal, + &dev->flags))) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); if (prexor) @@ -4456,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + if (s.just_cached) + r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); + r5l_stripe_write_finished(sh); + /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. @@ -4467,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh) || s.expanding) handle_stripe_fill(sh, &s, disks); - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: + /* + * When the stripe finishes full journal write cycle (write to journal + * and raid disk), this is the clean up procedure so it is ready for + * next operation. + */ + r5c_finish_stripe_write_out(conf, sh, &s); + + /* + * Now to consider new write requests, cache write back and what else, + * if anything should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. * 2/ A 'check' operation is in flight, as it may clobber the parity * block. + * 3/ A r5c cache log write is in flight. */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying(conf, sh, &s, disks); + + if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { + if (!r5c_is_writeback(conf->log)) { + if (s.to_write) + handle_stripe_dirtying(conf, sh, &s, disks); + } else { /* write back cache */ + int ret = 0; + + /* First, try handle writes in caching phase */ + if (s.to_write) + ret = r5c_try_caching_write(conf, sh, &s, + disks); + /* + * If caching phase failed: ret == -EAGAIN + * OR + * stripe under reclaim: !caching && injournal + * + * fall back to handle_stripe_dirtying() + */ + if (ret == -EAGAIN || + /* stripe under reclaim: !caching && injournal */ + (!test_bit(STRIPE_R5C_CACHING, &sh->state) && + s.injournal > 0)) { + ret = handle_stripe_dirtying(conf, sh, &s, + disks); + if (ret == -EAGAIN) + goto finish; + } + } + } /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -4645,9 +4756,7 @@ finish: } if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && - (s.failed <= conf->max_degraded || - conf->mddev->external == 0)) { + if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->return_bi, &s.return_bi); spin_unlock_irq(&conf->device_lock); @@ -4703,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits) if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return 1; + + /* Also checks whether there is pressure on r5cache log space */ + if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) + return 1; if (conf->quiesce) return 1; if (atomic_read(&conf->empty_inactive_list_nr)) @@ -5172,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) int remaining; DEFINE_WAIT(w); bool do_prepare; + bool do_flush = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = r5l_handle_flush_request(conf->log, bi); @@ -5183,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) return; } /* ret == -EAGAIN, fallback */ + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + do_flush = bi->bi_opf & REQ_PREFLUSH; } md_write_start(mddev, bi); @@ -5193,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && + !r5c_is_writeback(conf->log) && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) @@ -5321,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_prepare = true; goto retry; } + if (do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + do_flush = false; + } + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && @@ -5486,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->flags == 0 || + wait_event(mddev->sb_wait, mddev->sb_flags == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) return 0; @@ -5584,10 +5710,10 @@ finish: mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_DEVS, &mddev->flags) + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto ret; @@ -5862,10 +5988,10 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); if (!bio_list_empty(&conf->return_bi) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { struct bio_list tmp = BIO_EMPTY_LIST; spin_lock_irq(&conf->device_lock); - if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { bio_list_merge(&tmp, &conf->return_bi); bio_list_init(&conf->return_bi); } @@ -5912,7 +6038,7 @@ static void raid5d(struct md_thread *thread) break; handled += batch_size; - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { + if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); @@ -6242,6 +6368,7 @@ static struct attribute *raid5_attrs[] = { &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &r5c_journal_mode.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -6368,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + int i; + if (conf->log) r5l_exit_log(conf->log); if (conf->shrinker.nr_deferred) @@ -6376,6 +6505,9 @@ static void free_conf(struct r5conf *conf) free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); + for (i = 0; i < conf->pool_size; i++) + if (conf->disks[i].extra_page) + put_page(conf->disks[i].extra_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); kfree(conf); @@ -6387,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); if (alloc_scratch_buffer(conf, percpu)) { - pr_err("%s: failed memory allocation for cpu%u\n", - __func__, cpu); + pr_warn("%s: failed memory allocation for cpu%u\n", + __func__, cpu); return -ENOMEM; } return 0; @@ -6458,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (mddev->new_level != 5 && mddev->new_level != 4 && mddev->new_level != 6) { - printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->new_level); + pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", + mdname(mddev), mddev->new_level); return ERR_PTR(-EIO); } if ((mddev->new_level == 5 && !algorithm_valid_raid5(mddev->new_layout)) || (mddev->new_level == 6 && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "md/raid:%s: layout %d not supported\n", - mdname(mddev), mddev->new_layout); + pr_warn("md/raid:%s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); return ERR_PTR(-EIO); } if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", - mdname(mddev), mddev->raid_disks); + pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); return ERR_PTR(-EINVAL); } if (!mddev->new_chunk_sectors || (mddev->new_chunk_sectors << 9) % PAGE_SIZE || !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", - mdname(mddev), mddev->new_chunk_sectors << 9); + pr_warn("md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); return ERR_PTR(-EINVAL); } @@ -6522,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->disks = kzalloc(max_disks * sizeof(struct disk_info), GFP_KERNEL); + if (!conf->disks) goto abort; + for (i = 0; i < max_disks; i++) { + conf->disks[i].extra_page = alloc_page(GFP_KERNEL); + if (!conf->disks[i].extra_page) + goto abort; + } + conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) @@ -6545,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) INIT_LIST_HEAD(conf->temp_inactive_list + i); + atomic_set(&conf->r5c_cached_full_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_full_stripe_list); + atomic_set(&conf->r5c_cached_partial_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; if (raid5_alloc_percpu(conf) != 0) @@ -6571,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md/raid:%s: device %s operational as raid" - " disk %d\n", - mdname(mddev), bdevname(rdev->bdev, b), raid_disk); + pr_info("md/raid:%s: device %s operational as raid disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); } else if (rdev->saved_raid_disk != raid_disk) /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -6607,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) - printk(KERN_INFO - "md/raid:%s: force stripe size %d for reshape\n", + pr_info("md/raid:%s: force stripe size %d for reshape\n", mdname(mddev), conf->min_nr_stripes); } memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); if (grow_stripes(conf, conf->min_nr_stripes)) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate %dkB for buffers\n", - mdname(mddev), memory); + pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); goto abort; } else - printk(KERN_INFO "md/raid:%s: allocated %dkB\n", - mdname(mddev), memory); + pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); /* * Losing a stripe head costs more than the time to refill it, * it reduces the queue depth and so can hurt throughput. @@ -6633,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.batch = 128; conf->shrinker.flags = 0; if (register_shrinker(&conf->shrinker)) { - printk(KERN_ERR - "md/raid:%s: couldn't register shrinker.\n", - mdname(mddev)); + pr_warn("md/raid:%s: couldn't register shrinker.\n", + mdname(mddev)); goto abort; } sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate thread.\n", - mdname(mddev)); + pr_warn("md/raid:%s: couldn't allocate thread.\n", + mdname(mddev)); goto abort; } @@ -6697,9 +6835,8 @@ static int raid5_run(struct mddev *mddev) int first = 1; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); + pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); rdev_for_each(rdev, mddev) { long long diff; @@ -6742,15 +6879,14 @@ static int raid5_run(struct mddev *mddev) int new_data_disks; if (journal_dev) { - printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); return -EINVAL; } if (mddev->new_level != mddev->level) { - printk(KERN_ERR "md/raid:%s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", + mdname(mddev)); return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; @@ -6765,8 +6901,8 @@ static int raid5_run(struct mddev *mddev) chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); new_data_disks = mddev->raid_disks - max_degraded; if (sector_div(here_new, chunk_sectors * new_data_disks)) { - printk(KERN_ERR "md/raid:%s: reshape_position not " - "on a stripe boundary\n", mdname(mddev)); + pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", + mdname(mddev)); return -EINVAL; } reshape_offset = here_new * chunk_sectors; @@ -6787,10 +6923,8 @@ static int raid5_run(struct mddev *mddev) abs(min_offset_diff) >= mddev->new_chunk_sectors) /* not really in-place - so OK */; else if (mddev->ro == 0) { - printk(KERN_ERR "md/raid:%s: in-place reshape " - "must be started in read-only mode " - "- aborting\n", - mdname(mddev)); + pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", + mdname(mddev)); return -EINVAL; } } else if (mddev->reshape_backwards @@ -6799,13 +6933,11 @@ static int raid5_run(struct mddev *mddev) : (here_new * chunk_sectors >= here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "md/raid:%s: reshape_position too early for " - "auto-recovery - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "md/raid:%s: reshape will continue\n", - mdname(mddev)); + pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ } else { BUG_ON(mddev->level != mddev->new_level); @@ -6824,8 +6956,8 @@ static int raid5_run(struct mddev *mddev) if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { - pr_err("md/raid:%s: journal disk is missing, force array readonly\n", - mdname(mddev)); + pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); } else if (mddev->recovery_cp == MaxSector) @@ -6852,8 +6984,7 @@ static int raid5_run(struct mddev *mddev) if (conf->disks[i].replacement && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. */ - printk(KERN_ERR "md: cannot handle concurrent " - "replacement and reshape.\n"); + pr_warn("md: cannot handle concurrent replacement and reshape.\n"); goto abort; } if (test_bit(In_sync, &rdev->flags)) { @@ -6895,8 +7026,7 @@ static int raid5_run(struct mddev *mddev) mddev->degraded = calc_degraded(conf); if (has_failed(conf)) { - printk(KERN_ERR "md/raid:%s: not enough operational devices" - " (%d/%d failed)\n", + pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } @@ -6908,29 +7038,19 @@ static int raid5_run(struct mddev *mddev) if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) - printk(KERN_WARNING - "md/raid:%s: starting dirty degraded array" - " - data corruption possible.\n", - mdname(mddev)); + pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", + mdname(mddev)); else { - printk(KERN_ERR - "md/raid:%s: cannot start dirty degraded array.\n", - mdname(mddev)); + pr_crit("md/raid:%s: cannot start dirty degraded array.\n", + mdname(mddev)); goto abort; } } - if (mddev->degraded == 0) - printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" - " devices, algorithm %d\n", mdname(mddev), conf->level, - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - mddev->new_layout); - else - printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" - " out of %d devices, algorithm %d\n", - mdname(mddev), conf->level, - mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); + pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); print_raid5_conf(conf); @@ -6950,9 +7070,8 @@ static int raid5_run(struct mddev *mddev) mddev->to_remove = NULL; else if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) - printk(KERN_WARNING - "raid5: failed to create sysfs attributes for %s\n", - mdname(mddev)); + pr_warn("raid5: failed to create sysfs attributes for %s\n", + mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); if (mddev->queue) { @@ -6984,6 +7103,15 @@ static int raid5_run(struct mddev *mddev) stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; + + /* + * We use 16-bit counter of active stripes in bi_phys_segments + * (minus one for over-loaded initialization) + */ + blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); + blk_queue_max_discard_sectors(mddev->queue, + 0xfffe * STRIPE_SECTORS); + /* * unaligned part of discard request will be ignored, so can't * guarantee discard_zeroes_data @@ -7040,9 +7168,10 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO"md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(journal_dev->bdev, b)); - r5l_init_log(conf, journal_dev); + pr_debug("md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(journal_dev->bdev, b)); + if (r5l_init_log(conf, journal_dev)) + goto abort; } return 0; @@ -7051,7 +7180,7 @@ abort: print_raid5_conf(conf); free_conf(conf); mddev->private = NULL; - printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); + pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } @@ -7085,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf) int i; struct disk_info *tmp; - printk(KERN_DEBUG "RAID conf printout:\n"); + pr_debug("RAID conf printout:\n"); if (!conf) { - printk("(conf==NULL)\n"); + pr_debug("(conf==NULL)\n"); return; } - printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, conf->raid_disks, conf->raid_disks - conf->mddev->degraded); @@ -7098,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf) char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + pr_debug(" disk %d, o:%d, dev:%s\n", i, !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev, b)); } @@ -7246,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * write requests running. We should be safe */ r5l_init_log(conf, rdev); - printk(KERN_INFO"md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(rdev->bdev, b)); + pr_debug("md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) @@ -7351,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev) > conf->min_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->min_nr_stripes) { - printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", - mdname(mddev), - ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); return 0; } return 1; @@ -7435,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev) */ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) < mddev->array_sectors) { - printk(KERN_ERR "md/raid:%s: array size must be reduced " - "before number of disks\n", mdname(mddev)); + pr_warn("md/raid:%s: array size must be reduced before number of disks\n", + mdname(mddev)); return -EINVAL; } @@ -7506,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev) } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -7624,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) /* '2' tells resync/reshape to pause so that all * active stripes can drain */ + r5c_flush_cache(conf, INT_MAX); conf->quiesce = 2; wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && @@ -7654,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) /* for raid0 takeover only one zone is supported */ if (raid0_conf->nr_strip_zones > 1) { - printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", - mdname(mddev)); + pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -7676,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) static void *raid5_takeover_raid1(struct mddev *mddev) { int chunksect; + void *ret; if (mddev->raid_disks != 2 || mddev->degraded > 1) @@ -7697,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev) mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; mddev->new_chunk_sectors = chunksect; - return setup_conf(mddev); + ret = setup_conf(mddev); + if (!IS_ERR_VALUE(ret)) + clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); + return ret; } static void *raid5_takeover_raid6(struct mddev *mddev) @@ -7767,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev) conf->chunk_sectors = new_chunk ; mddev->chunk_sectors = new_chunk; } - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } return check_reshape(mddev); |