1 files changed, 384 insertions, 250 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 92ac251e91e6..06d7279bdd04 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES		256
-#define STRIPE_SIZE		PAGE_SIZE
-#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
-#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
-#define	IO_THRESHOLD		1
-#define BYPASS_THRESHOLD	1
-#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK		(NR_HASH - 1)
-#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 	local_irq_enable();
 }
 
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap.  There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
-	int sectors = bio_sectors(bio);
-	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
-		return bio->bi_next;
-	else
-		return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_set(segments, cnt);
-}
-
 /* Find first data disk in a raid6 stripe */
 static inline int raid6_d0(struct stripe_head *sh)
 {
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			      struct list_head *temp_inactive_list)
 {
+	int i;
+	int injournal = 0;	/* number of date pages with R5_InJournal */
+
 	BUG_ON(!list_empty(&sh->lru));
 	BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+	if (r5c_is_writeback(conf->log))
+		for (i = sh->disks; i--; )
+			if (test_bit(R5_InJournal, &sh->dev[i].flags))
+				injournal++;
+	/*
+	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
+	 * data in journal, so they are not released to cached lists
+	 */
+	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+			r5c_make_stripe_write_out(sh);
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
+
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
 		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state))
-			list_add_tail(&sh->lru, temp_inactive_list);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			if (!r5c_is_writeback(conf->log))
+				list_add_tail(&sh->lru, temp_inactive_list);
+			else {
+				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+				if (injournal == 0)
+					list_add_tail(&sh->lru, temp_inactive_list);
+				else if (injournal == conf->raid_disks - conf->max_degraded) {
+					/* full stripe */
+					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+						atomic_inc(&conf->r5c_cached_full_stripes);
+					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+						atomic_dec(&conf->r5c_cached_partial_stripes);
+					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+					r5c_check_cached_full_stripe(conf);
+				} else {
+					/* partial stripe */
+					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+							      &sh->state))
+						atomic_inc(&conf->r5c_cached_partial_stripes);
+					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+				}
+			}
+		}
 	}
 }
 
@@ -541,7 +511,7 @@ retry:
 
 		if (dev->toread || dev->read || dev->towrite || dev->written ||
 		    test_bit(R5_LOCKED, &dev->flags)) {
-			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
 			       (unsigned long long)sh->sector, i, dev->toread,
 			       dev->read, dev->towrite, dev->written,
 			       test_bit(R5_LOCKED, &dev->flags));
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			}
 			if (noblock && sh == NULL)
 				break;
+
+			r5c_check_stripe_cache_usage(conf);
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
+				r5l_wake_reclaim(conf->log, 0);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (r5l_write_stripe(conf->log, sh) == 0)
-		return;
+	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+		/* writing out phase */
+		if (s->waiting_extra_page)
+			return;
+		if (r5l_write_stripe(conf->log, sh) == 0)
+			return;
+	} else {  /* caching phase */
+		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+			r5c_cache_data(conf->log, sh, s);
+			return;
+		}
+	}
+
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
@@ -913,7 +897,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			op = REQ_OP_WRITE;
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
-				op_flags = WRITE_FUA;
+				op_flags = REQ_FUA;
 			if (test_bit(R5_Discard, &sh->dev[i].flags))
 				op = REQ_OP_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
@@ -977,7 +961,7 @@ again:
 			if (bad < 0) {
 				set_bit(BlockedBadBlocks, &rdev->flags);
 				if (!conf->mddev->external &&
-				    conf->mddev->flags) {
+				    conf->mddev->sb_flags) {
 					/* It is very unlikely, but we might
 					 * still need to write out the
 					 * bad block log - better give it
@@ -1115,7 +1099,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
 	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
 				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
 					*page = bio_page;
 				else
 					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, &dev->page,
-					dev->sector, tx, sh);
+						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
 		    (srctype == SYNDROME_SRC_ALL) ||
 		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InJournal, &dev->flags))) ||
 		    (srctype == SYNDROME_SRC_WRITTEN &&
-		     dev->written))
-			srcs[slot] = sh->dev[i].page;
+		     dev->written)) {
+			if (test_bit(R5_InJournal, &dev->flags))
+				srcs[slot] = sh->dev[i].orig_page;
+			else
+				srcs[slot] = sh->dev[i].page;
+		}
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
 
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	if (r5c_is_writeback(sh->raid_conf->log))
+		/*
+		 * raid5-cache write back uses orig_page during prexor.
+		 * After prexor, it is time to free orig_page
+		 */
+		r5c_release_extra_page(sh);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_InJournal, &dev->flags))
+			xor_srcs[count++] = dev->orig_page;
+		else if (test_bit(R5_Wantdrain, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks;
 	int i;
 	struct stripe_head *head_sh = sh;
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
 			dev = &sh->dev[i];
+			/*
+			 * clear R5_InJournal, so when rewriting a page in
+			 * journal, it is not skipped by r5l_log_stripe()
+			 */
+			clear_bit(R5_InJournal, &dev->flags);
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1637,8 +1642,10 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     r5c_is_writeback(conf->log));
+					if (dev->page != dev->orig_page &&
+					    !r5c_is_writeback(conf->log)) {
 						set_bit(R5_SkipCopy, &dev->flags);
 						clear_bit(R5_UPTODATE, &dev->flags);
 						clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1746,7 +1753,8 @@ again:
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -2000,17 +2008,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
+		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
-			bio_init(&dev->req);
-			dev->req.bi_io_vec = &dev->vec;
-			dev->req.bi_max_vecs = 1;
-
-			bio_init(&dev->rreq);
-			dev->rreq.bi_io_vec = &dev->rvec;
-			dev->rreq.bi_max_vecs = 1;
+			bio_init(&dev->req, &dev->vec, 1);
+			bio_init(&dev->rreq, &dev->rvec, 1);
 		}
 	}
 	return sh;
@@ -2245,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
-		for (i=0; i<conf->raid_disks; i++)
+		for (i = 0; i < conf->pool_size; i++)
 			ndisks[i] = conf->disks[i];
-		kfree(conf->disks);
-		conf->disks = ndisks;
+
+		for (i = conf->pool_size; i < newsize; i++) {
+			ndisks[i].extra_page = alloc_page(GFP_NOIO);
+			if (!ndisks[i].extra_page)
+				err = -ENOMEM;
+		}
+
+		if (err) {
+			for (i = conf->pool_size; i < newsize; i++)
+				if (ndisks[i].extra_page)
+					put_page(ndisks[i].extra_page);
+			kfree(ndisks);
+		} else {
+			kfree(conf->disks);
+			conf->disks = ndisks;
+		}
 	} else
 		err = -ENOMEM;
 
@@ -2347,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi)
 			 * replacement device.  We just fail those on
 			 * any error
 			 */
-			printk_ratelimited(
-				KERN_INFO
-				"md/raid:%s: read error corrected"
-				" (%lu sectors at %llu on %s)\n",
+			pr_info_ratelimited(
+				"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
 				mdname(conf->mddev), STRIPE_SECTORS,
 				(unsigned long long)s,
 				bdevname(rdev->bdev, b));
@@ -2370,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error on replacement device "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		else if (conf->mddev->degraded >= conf->max_degraded) {
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error not correctable "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error not correctable (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
 			/* Oh, no!!! */
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error NOT corrected!! "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		} else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
-			printk(KERN_WARNING
-			       "md/raid:%s: Too many read errors, failing device %s.\n",
+			pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
 			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
@@ -2531,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-	printk(KERN_ALERT
-	       "md/raid:%s: Disk failure on %s, disabling device.\n"
-	       "md/raid:%s: Operation continuing on %d devices.\n",
-	       mdname(mddev),
-	       bdevname(rdev->bdev, b),
-	       mdname(mddev),
-	       conf->raid_disks - mddev->degraded);
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
+	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
+		"md/raid:%s: Operation continuing on %d devices.\n",
+		mdname(mddev),
+		bdevname(rdev->bdev, b),
+		mdname(mddev),
+		conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -2861,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
 				     previous, &dummy1, &sh2);
 	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
 		|| sh2.qd_idx != sh->qd_idx) {
-		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
-		       mdname(conf->mddev));
+		pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
+			mdname(conf->mddev));
 		return 0;
 	}
 	return r_sector;
@@ -2877,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 	int level = conf->level;
 
 	if (rcw) {
+		/*
+		 * In some cases, handle_stripe_dirtying initially decided to
+		 * run rmw and allocates extra page for prexor. However, rcw is
+		 * cheaper later on. We need to free the extra page now,
+		 * because we won't be able to do that in ops_complete_prexor().
+		 */
+		r5c_release_extra_page(sh);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2887,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		/* if we are not expanding this is a proper write request, and
@@ -2926,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		if (!s->locked)
@@ -3569,10 +3592,10 @@ unhash:
 		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
-				   struct stripe_head *sh,
-				   struct stripe_head_state *s,
-				   int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+				  struct stripe_head *sh,
+				  struct stripe_head_state *s,
+				  int disks)
 {
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3597,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InJournal, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3611,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		    test_bit(R5_Wantcompute, &dev->flags))) {
+		      test_bit(R5_InJournal, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
 			else
 				rcw += 2*disks;
 		}
 	}
+
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -3629,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InJournal, &dev->flags) &&
+			    dev->page == dev->orig_page &&
+			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+				/* alloc page for prexor */
+				struct page *p = alloc_page(GFP_NOIO);
+
+				if (p) {
+					dev->orig_page = p;
+					continue;
+				}
+
+				/*
+				 * alloc_page() failed, try use
+				 * disk_info->extra_page
+				 */
+				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+						      &conf->cache_state)) {
+					r5c_use_extra_page(sh);
+					break;
+				}
+
+				/* extra_page in use, add to delayed_list */
+				set_bit(STRIPE_DELAYED, &sh->state);
+				s->waiting_extra_page = 1;
+				return -EAGAIN;
+			}
+		}
+
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if ((dev->towrite ||
+			     i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InJournal, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
+			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
 					     &sh->state)) {
@@ -3658,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InJournal, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
@@ -3697,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
+	return 0;
 }
 
 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -3782,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		pr_err("%s: unknown check_state: %d sector: %llu\n",
 		       __func__, sh->check_state,
 		       (unsigned long long) sh->sector);
 		BUG();
@@ -3946,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
-		       __func__, sh->check_state,
-		       (unsigned long long) sh->sector);
+		pr_warn("%s: unknown check_state: %d sector: %llu\n",
+			__func__, sh->check_state,
+			(unsigned long long) sh->sector);
 		BUG();
 	}
 }
@@ -4188,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+
+		if (test_bit(R5_InJournal, &dev->flags))
+			s->injournal++;
+		if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+			s->just_cached++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4416,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written || test_bit(R5_InJournal,
+							  &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
@@ -4456,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh)
 				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+	r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -4467,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh)
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);
 
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
+	/*
+	 * When the stripe finishes full journal write cycle (write to journal
+	 * and raid disk), this is the clean up procedure so it is ready for
+	 * next operation.
+	 */
+	r5c_finish_stripe_write_out(conf, sh, &s);
+
+	/*
+	 * Now to consider new write requests, cache write back and what else,
+	 * if anything should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
+	 * 3/ A r5c cache log write is in flight.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, &s, disks);
+
+	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+		if (!r5c_is_writeback(conf->log)) {
+			if (s.to_write)
+				handle_stripe_dirtying(conf, sh, &s, disks);
+		} else { /* write back cache */
+			int ret = 0;
+
+			/* First, try handle writes in caching phase */
+			if (s.to_write)
+				ret = r5c_try_caching_write(conf, sh, &s,
+							    disks);
+			/*
+			 * If caching phase failed: ret == -EAGAIN
+			 *    OR
+			 * stripe under reclaim: !caching && injournal
+			 *
+			 * fall back to handle_stripe_dirtying()
+			 */
+			if (ret == -EAGAIN ||
+			    /* stripe under reclaim: !caching && injournal */
+			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+			     s.injournal > 0)) {
+				ret = handle_stripe_dirtying(conf, sh, &s,
+							     disks);
+				if (ret == -EAGAIN)
+					goto finish;
+			}
+		}
+	}
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -4645,9 +4756,7 @@ finish:
 	}
 
 	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
-				(s.failed <= conf->max_degraded ||
-					conf->mddev->external == 0)) {
+		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 			spin_lock_irq(&conf->device_lock);
 			bio_list_merge(&conf->return_bi, &s.return_bi);
 			spin_unlock_irq(&conf->device_lock);
@@ -4703,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
 
 	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;
+
+	/* Also checks whether there is pressure on r5cache log space */
+	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+		return 1;
 	if (conf->quiesce)
 		return 1;
 	if (atomic_read(&conf->empty_inactive_list_nr))
@@ -5172,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
+	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5183,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return;
 		}
 		/* ret == -EAGAIN, fallback */
+		/*
+		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+		 * we need to flush journal device
+		 */
+		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
 	md_write_start(mddev, bi);
@@ -5193,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
+	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
@@ -5321,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 				do_prepare = true;
 				goto retry;
 			}
+			if (do_flush) {
+				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+				/* we only need flush for one stripe */
+				do_flush = false;
+			}
+
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
@@ -5486,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait, mddev->flags == 0 ||
+		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			return 0;
@@ -5584,10 +5710,10 @@ finish:
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+			   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			goto ret;
@@ -5862,10 +5988,10 @@ static void raid5d(struct md_thread *thread)
 	md_check_recovery(mddev);
 
 	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		struct bio_list tmp = BIO_EMPTY_LIST;
 		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 			bio_list_merge(&tmp, &conf->return_bi);
 			bio_list_init(&conf->return_bi);
 		}
@@ -5912,7 +6038,7 @@ static void raid5d(struct md_thread *thread)
 			break;
 		handled += batch_size;
 
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+		if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
 			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
@@ -6242,6 +6368,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_journal_mode.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -6368,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	int i;
+
 	if (conf->log)
 		r5l_exit_log(conf->log);
 	if (conf->shrinker.nr_deferred)
@@ -6376,6 +6505,9 @@ static void free_conf(struct r5conf *conf)
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
+	for (i = 0; i < conf->pool_size; i++)
+		if (conf->disks[i].extra_page)
+			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf);
@@ -6387,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	if (alloc_scratch_buffer(conf, percpu)) {
-		pr_err("%s: failed memory allocation for cpu%u\n",
-		       __func__, cpu);
+		pr_warn("%s: failed memory allocation for cpu%u\n",
+			__func__, cpu);
 		return -ENOMEM;
 	}
 	return 0;
@@ -6458,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
 	    && mddev->new_level != 6) {
-		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
-		       mdname(mddev), mddev->new_level);
+		pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+			mdname(mddev), mddev->new_level);
 		return ERR_PTR(-EIO);
 	}
 	if ((mddev->new_level == 5
 	     && !algorithm_valid_raid5(mddev->new_layout)) ||
 	    (mddev->new_level == 6
 	     && !algorithm_valid_raid6(mddev->new_layout))) {
-		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
-		       mdname(mddev), mddev->new_layout);
+		pr_warn("md/raid:%s: layout %d not supported\n",
+			mdname(mddev), mddev->new_layout);
 		return ERR_PTR(-EIO);
 	}
 	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
-		       mdname(mddev), mddev->raid_disks);
+		pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+			mdname(mddev), mddev->raid_disks);
 		return ERR_PTR(-EINVAL);
 	}
 
 	if (!mddev->new_chunk_sectors ||
 	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
 	    !is_power_of_2(mddev->new_chunk_sectors)) {
-		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
-		       mdname(mddev), mddev->new_chunk_sectors << 9);
+		pr_warn("md/raid:%s: invalid chunk size %d\n",
+			mdname(mddev), mddev->new_chunk_sectors << 9);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -6522,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
+
 	if (!conf->disks)
 		goto abort;
 
+	for (i = 0; i < max_disks; i++) {
+		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+		if (!conf->disks[i].extra_page)
+			goto abort;
+	}
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6545,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+	atomic_set(&conf->r5c_cached_full_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+	atomic_set(&conf->r5c_cached_partial_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
@@ -6571,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 		if (test_bit(In_sync, &rdev->flags)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
-			       " disk %d\n",
-			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+			pr_info("md/raid:%s: device %s operational as raid disk %d\n",
+				mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
 		} else if (rdev->saved_raid_disk != raid_disk)
 			/* Cannot rely on bitmap to complete recovery */
 			conf->fullsync = 1;
@@ -6607,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
 		conf->min_nr_stripes = max(NR_STRIPES, stripes);
 		if (conf->min_nr_stripes != NR_STRIPES)
-			printk(KERN_INFO
-				"md/raid:%s: force stripe size %d for reshape\n",
+			pr_info("md/raid:%s: force stripe size %d for reshape\n",
 				mdname(mddev), conf->min_nr_stripes);
 	}
 	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
 	if (grow_stripes(conf, conf->min_nr_stripes)) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
-		       mdname(mddev), memory);
+		pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
+			mdname(mddev), memory);
 		goto abort;
 	} else
-		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
-		       mdname(mddev), memory);
+		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
 	/*
 	 * Losing a stripe head costs more than the time to refill it,
 	 * it reduces the queue depth and so can hurt throughput.
@@ -6633,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
 	if (register_shrinker(&conf->shrinker)) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't register shrinker.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: couldn't register shrinker.\n",
+			mdname(mddev));
 		goto abort;
 	}
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);
 	if (!conf->thread) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't allocate thread.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: couldn't allocate thread.\n",
+			mdname(mddev));
 		goto abort;
 	}
 
@@ -6697,9 +6835,8 @@ static int raid5_run(struct mddev *mddev)
 	int first = 1;
 
 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "md/raid:%s: not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
+		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
+			  mdname(mddev));
 
 	rdev_for_each(rdev, mddev) {
 		long long diff;
@@ -6742,15 +6879,14 @@ static int raid5_run(struct mddev *mddev)
 		int new_data_disks;
 
 		if (journal_dev) {
-			printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 
 		if (mddev->new_level != mddev->level) {
-			printk(KERN_ERR "md/raid:%s: unsupported reshape "
-			       "required - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
@@ -6765,8 +6901,8 @@ static int raid5_run(struct mddev *mddev)
 		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
 		new_data_disks = mddev->raid_disks - max_degraded;
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
-			printk(KERN_ERR "md/raid:%s: reshape_position not "
-			       "on a stripe boundary\n", mdname(mddev));
+			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
@@ -6787,10 +6923,8 @@ static int raid5_run(struct mddev *mddev)
 			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
 				/* not really in-place - so OK */;
 			else if (mddev->ro == 0) {
-				printk(KERN_ERR "md/raid:%s: in-place reshape "
-				       "must be started in read-only mode "
-				       "- aborting\n",
-				       mdname(mddev));
+				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
+					mdname(mddev));
 				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
@@ -6799,13 +6933,11 @@ static int raid5_run(struct mddev *mddev)
 		    : (here_new * chunk_sectors >=
 		       here_old * chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
-			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
-			       "auto-recovery - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
-		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
-		       mdname(mddev));
+		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
@@ -6824,8 +6956,8 @@ static int raid5_run(struct mddev *mddev)
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
-			pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
+				mdname(mddev));
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
 		} else if (mddev->recovery_cp == MaxSector)
@@ -6852,8 +6984,7 @@ static int raid5_run(struct mddev *mddev)
 		if (conf->disks[i].replacement &&
 		    conf->reshape_progress != MaxSector) {
 			/* replacements and reshape simply do not mix. */
-			printk(KERN_ERR "md: cannot handle concurrent "
-			       "replacement and reshape.\n");
+			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
 			goto abort;
 		}
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -6895,8 +7026,7 @@ static int raid5_run(struct mddev *mddev)
 	mddev->degraded = calc_degraded(conf);
 
 	if (has_failed(conf)) {
-		printk(KERN_ERR "md/raid:%s: not enough operational devices"
-			" (%d/%d failed)\n",
+		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
 			mdname(mddev), mddev->degraded, conf->raid_disks);
 		goto abort;
 	}
@@ -6908,29 +7038,19 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
-			printk(KERN_WARNING
-			       "md/raid:%s: starting dirty degraded array"
-			       " - data corruption possible.\n",
-			       mdname(mddev));
+			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
+				mdname(mddev));
 		else {
-			printk(KERN_ERR
-			       "md/raid:%s: cannot start dirty degraded array.\n",
-			       mdname(mddev));
+			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
+				mdname(mddev));
 			goto abort;
 		}
 	}
 
-	if (mddev->degraded == 0)
-		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
-		       " devices, algorithm %d\n", mdname(mddev), conf->level,
-		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-		       mddev->new_layout);
-	else
-		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
-		       " out of %d devices, algorithm %d\n",
-		       mdname(mddev), conf->level,
-		       mddev->raid_disks - mddev->degraded,
-		       mddev->raid_disks, mddev->new_layout);
+	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
+		mdname(mddev), conf->level,
+		mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+		mddev->new_layout);
 
 	print_raid5_conf(conf);
 
@@ -6950,9 +7070,8 @@ static int raid5_run(struct mddev *mddev)
 		mddev->to_remove = NULL;
 	else if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
-		printk(KERN_WARNING
-		       "raid5: failed to create sysfs attributes for %s\n",
-		       mdname(mddev));
+		pr_warn("raid5: failed to create sysfs attributes for %s\n",
+			mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
 	if (mddev->queue) {
@@ -6984,6 +7103,15 @@ static int raid5_run(struct mddev *mddev)
 			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
+
+		/*
+		 * We use 16-bit counter of active stripes in bi_phys_segments
+		 * (minus one for over-loaded initialization)
+		 */
+		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+		blk_queue_max_discard_sectors(mddev->queue,
+					      0xfffe * STRIPE_SECTORS);
+
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data
@@ -7040,9 +7168,10 @@ static int raid5_run(struct mddev *mddev)
 	if (journal_dev) {
 		char b[BDEVNAME_SIZE];
 
-		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-		       mdname(mddev), bdevname(journal_dev->bdev, b));
-		r5l_init_log(conf, journal_dev);
+		pr_debug("md/raid:%s: using device %s as journal\n",
+			 mdname(mddev), bdevname(journal_dev->bdev, b));
+		if (r5l_init_log(conf, journal_dev))
+			goto abort;
 	}
 
 	return 0;
@@ -7051,7 +7180,7 @@ abort:
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;
-	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
 	return -EIO;
 }
 
@@ -7085,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf)
 	int i;
 	struct disk_info *tmp;
 
-	printk(KERN_DEBUG "RAID conf printout:\n");
+	pr_debug("RAID conf printout:\n");
 	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_debug("(conf==NULL)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
 	       conf->raid_disks,
 	       conf->raid_disks - conf->mddev->degraded);
 
@@ -7098,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf)
 		char b[BDEVNAME_SIZE];
 		tmp = conf->disks + i;
 		if (tmp->rdev)
-			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+			pr_debug(" disk %d, o:%d, dev:%s\n",
 			       i, !test_bit(Faulty, &tmp->rdev->flags),
 			       bdevname(tmp->rdev->bdev, b));
 	}
@@ -7246,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * write requests running. We should be safe
 		 */
 		r5l_init_log(conf, rdev);
-		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-		       mdname(mddev), bdevname(rdev->bdev, b));
+		pr_debug("md/raid:%s: using device %s as journal\n",
+			 mdname(mddev), bdevname(rdev->bdev, b));
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7351,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev)
 	    > conf->min_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
 	    > conf->min_nr_stripes) {
-		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
-		       mdname(mddev),
-		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
-			/ STRIPE_SIZE)*4);
+		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
+			mdname(mddev),
+			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+			 / STRIPE_SIZE)*4);
 		return 0;
 	}
 	return 1;
@@ -7435,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev)
 	 */
 	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
 	    < mddev->array_sectors) {
-		printk(KERN_ERR "md/raid:%s: array size must be reduced "
-		       "before number of disks\n", mdname(mddev));
+		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
+			mdname(mddev));
 		return -EINVAL;
 	}
 
@@ -7506,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 	}
 	mddev->raid_disks = conf->raid_disks;
 	mddev->reshape_position = conf->reshape_progress;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -7624,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
+		r5c_flush_cache(conf, INT_MAX);
 		conf->quiesce = 2;
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
@@ -7654,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_conf->nr_strip_zones > 1) {
-		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+			mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -7676,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 static void *raid5_takeover_raid1(struct mddev *mddev)
 {
 	int chunksect;
+	void *ret;
 
 	if (mddev->raid_disks != 2 ||
 	    mddev->degraded > 1)
@@ -7697,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
 	mddev->new_chunk_sectors = chunksect;
 
-	return setup_conf(mddev);
+	ret = setup_conf(mddev);
+	if (!IS_ERR_VALUE(ret))
+		clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+	return ret;
 }
 
 static void *raid5_takeover_raid6(struct mddev *mddev)
@@ -7767,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev)
 			conf->chunk_sectors = new_chunk ;
 			mddev->chunk_sectors = new_chunk;
 		}
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 	}
 	return check_reshape(mddev);