From ea0213e0c7cc1c1b52badf27bd7db4f50a67baaa Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 09:59:57 +0100
Subject: md: superblock changes for PPL

Include information about PPL location and size into mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogously to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e9040f..fe2112810c43 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
 
 	__le32	chunksize;	/* in 512byte sectors */
 	__le32	raid_disks;
-	__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
-				 * NOTE: signed, so bitmap can be before superblock
-				 * only meaningful of feature_map[0] is set.
-				 */
+	union {
+		__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+					 * NOTE: signed, so bitmap can be before superblock
+					 * only meaningful if feature_map[0] is set.
+					 */
+
+		/* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+		struct {
+			__le16 offset; /* sectors from start of superblock that ppl starts (signed) */
+			__le16 size; /* ppl size in sectors */
+		} ppl;
+	};
 
 	/* These are only valid with feature bit '4' */
 	__le32	new_level;	/* new level we are reshaping to		*/
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
 					     */
 #define MD_FEATURE_CLUSTERED		256 /* clustered MD */
 #define	MD_FEATURE_JOURNAL		512 /* support write cache */
+#define	MD_FEATURE_PPL			1024 /* support PPL */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_RECOVERY_BITMAP	\
 					|MD_FEATURE_CLUSTERED		\
 					|MD_FEATURE_JOURNAL		\
+					|MD_FEATURE_PPL			\
 					)
 
 struct r5l_payload_header {
-- 
cgit v1.2.3


From 3418d036c81dcb604b7c7c71b209d5890a8418aa Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 09:59:59 +0100
Subject: raid5-ppl: Partial Parity Log write logging implementation

Implement the calculation of partial parity for a stripe and PPL write
logging functionality. The description of PPL is added to the
documentation. More details can be found in the comments in raid5-ppl.c.

Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.

Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:

- reconstruct-write case:
  xor data from all not updated disks in a stripe

- read-modify-write case:
  xor old data and parity from all updated disks in a stripe

Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.

Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.

Put the PPL metadata structures to md_p.h because userspace tools
(mdadm) will also need to read/write PPL.

Warn about using PPL with enabled disk volatile write-back cache for
now. It can be removed once disk cache flushing before writing PPL is
implemented.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 Documentation/md/raid5-ppl.txt |  44 +++
 drivers/md/Makefile            |   2 +-
 drivers/md/raid5-log.h         |  24 ++
 drivers/md/raid5-ppl.c         | 703 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c             |  64 +++-
 drivers/md/raid5.h             |  10 +-
 include/uapi/linux/raid/md_p.h |  27 ++
 7 files changed, 869 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/md/raid5-ppl.txt
 create mode 100644 drivers/md/raid5-ppl.c

(limited to 'include')

diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
new file mode 100644
index 000000000000..127072b09363
--- /dev/null
+++ b/Documentation/md/raid5-ppl.txt
@@ -0,0 +1,44 @@
+Partial Parity Log
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe.  It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o raid5-cache.o
+raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 2da4bd3bbd79..a67fb58513b9 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -31,6 +31,20 @@ extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
+extern struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx);
+extern int ppl_init_log(struct r5conf *conf);
+extern void ppl_exit_log(struct r5conf *conf);
+extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+extern void ppl_write_stripe_run(struct r5conf *conf);
+extern void ppl_stripe_write_finished(struct stripe_head *sh);
+
+static inline bool raid5_has_ppl(struct r5conf *conf)	/* true if MD_HAS_PPL is set on the array */
+{
+	return test_bit(MD_HAS_PPL, &conf->mddev->flags);
+}
+
 static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	struct r5conf *conf = sh->raid_conf;
@@ -45,6 +59,8 @@ static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s
 			/* caching phase */
 			return r5c_cache_data(conf->log, sh);
 		}
+	} else if (raid5_has_ppl(conf)) {
+		return ppl_write_stripe(conf, sh);
 	}
 
 	return -EAGAIN;
@@ -56,24 +72,32 @@ static inline void log_stripe_write_finished(struct stripe_head *sh)
 
 	if (conf->log)
 		r5l_stripe_write_finished(sh);
+	else if (raid5_has_ppl(conf))
+		ppl_stripe_write_finished(sh);
 }
 
 static inline void log_write_stripe_run(struct r5conf *conf)
 {
 	if (conf->log)
 		r5l_write_stripe_run(conf->log);
+	else if (raid5_has_ppl(conf))
+		ppl_write_stripe_run(conf);
 }
 
 static inline void log_exit(struct r5conf *conf)
 {
 	if (conf->log)
 		r5l_exit_log(conf);
+	else if (raid5_has_ppl(conf))
+		ppl_exit_log(conf);
 }
 
 static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
 {
 	if (journal_dev)
 		return r5l_init_log(conf, journal_dev);
+	else if (raid5_has_ppl(conf))
+		return ppl_init_log(conf);
 
 	return 0;
 }
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..db5b72b11594
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,703 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/flex_array.h>
+#include <linux/async_tx.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ *   entry0
+ *   ...
+ *   entryN
+ * PP data
+ *   PP for entry0
+ *   ...
+ *   PP for entryN
+ *
+ * An entry describes one or more consecutive stripe_heads, up to a full
+ * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
+ * number of stripe_heads in the entry and n is the number of modified data
+ * disks. Every stripe_head in the entry must write to the same data disks.
+ * An example of a valid case described by a single entry (writes to the first
+ * stripe of a 4 disk array, 16k chunk size):
+ *
+ * sh->sector   dd0   dd1   dd2    ppl
+ *            +-----+-----+-----+
+ * 0          | --- | --- | --- | +----+
+ * 8          | -W- | -W- | --- | | pp |   data_sector = 8
+ * 16         | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
+ * 24         | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
+ *            +-----+-----+-----+ +----+
+ *
+ * data_sector is the first raid sector of the modified data, data_size is the
+ * total size of modified data and pp_size is the size of partial parity for
+ * this entry. Entries for full stripe writes contain no partial parity
+ * (pp_size = 0), they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown. Every entry holds a checksum of its
+ * partial parity, the header also has a checksum of the header itself.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one ppl_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to
+ * r5conf->log_private.
+ *
+ * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
+ * can be appended to the last entry if it meets the conditions for a valid
+ * entry described above, otherwise a new entry is added. Checksums of entries
+ * are calculated incrementally as stripes containing partial parity are being
+ * added. ppl_submit_iounit() calculates the checksum of the header and submits
+ * a bio containing the header page and partial parity pages (sh->ppl_page) for
+ * all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->io_list tracks all io_units of a log
+ * (for a single member disk). New io_units are added to the end of the list
+ * and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always at the end of the list.
+ */
+
+struct ppl_conf {
+	struct mddev *mddev;	/* array this PPL configuration belongs to */
+
+	/* array of child logs, one for each raid disk */
+	struct ppl_log *child_logs;
+	int count;		/* number of entries in child_logs */
+
+	int block_size;		/* the logical block size used for data_sector
+				 * in ppl_header_entry */
+	u32 signature;		/* raid array identifier */
+	atomic64_t seq;		/* current log write sequence number */
+
+	struct kmem_cache *io_kc;	/* slab cache backing io_pool */
+	mempool_t *io_pool;		/* struct ppl_io_unit allocations */
+	struct bio_set *bs;		/* extra bios when the inline bio is full */
+	mempool_t *meta_pool;		/* pages for ppl_io_unit->header_page */
+};
+
+struct ppl_log {
+	struct ppl_conf *ppl_conf;	/* shared between all log instances */
+
+	struct md_rdev *rdev;		/* array member disk associated with
+					 * this log instance */
+	struct mutex io_mutex;		/* held while logging a stripe or submitting an io_unit */
+	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
+					 * always at the end of io_list */
+	spinlock_t io_list_lock;	/* taken around io_list and no_mem_stripes updates */
+	struct list_head io_list;	/* all io_units of this log */
+	struct list_head no_mem_stripes;/* stripes to retry if failed to
+					 * allocate io_unit */
+};
+
+#define PPL_IO_INLINE_BVECS 32	/* size of the biovec array embedded in each ppl_io_unit */
+
+struct ppl_io_unit {
+	struct ppl_log *log;		/* log instance this io_unit belongs to */
+
+	struct page *header_page;	/* for ppl_header */
+
+	unsigned int entries_count;	/* number of entries in ppl_header */
+	unsigned int pp_size;		/* current total size of partial parity */
+
+	u64 seq;			/* sequence number of this log write */
+	struct list_head log_sibling;	/* log->io_list */
+
+	struct list_head stripe_list;	/* stripes added to the io_unit */
+	atomic_t pending_stripes;	/* how many stripes not written to raid */
+
+	bool submitted;			/* true if write to log started */
+
+	/* inline bio and its biovec for submitting the iounit */
+	struct bio bio;
+	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
+};
+
+struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/*
+	 * Partial parity is the XOR of stripe data chunks that are not changed
+	 * during the write request. Depending on available data
+	 * (read-modify-write vs. reconstruct-write case) we calculate it
+	 * differently.
+	 */
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		/* rmw: xor old data and parity from updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+		/* rcw: xor data from all not updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		return tx;	/* neither drain state: nothing to compute, pass tx through */
+	}
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
+			  NULL, sh, flex_array_get(percpu->scribble, 0)
+			  + sizeof(struct page *) * (sh->disks + 2));
+	/* a single source page needs no xor: copying it into ppl_page is enough */
+	if (count == 1)
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				  &submit);
+	else
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+			       &submit);
+
+	return tx;	/* descriptor of the queued operation, for further chaining */
+}
+
+static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
+					  struct stripe_head *sh)	/* sh is not used in the body */
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct ppl_io_unit *io;
+	struct ppl_header *pplhdr;
+
+	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;	/* caller (ppl_log_stripe) turns this into -ENOMEM */
+
+	memset(io, 0, sizeof(*io));
+	io->log = log;
+	INIT_LIST_HEAD(&io->log_sibling);
+	INIT_LIST_HEAD(&io->stripe_list);
+	atomic_set(&io->pending_stripes, 0);
+	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
+	/* NOTE(review): result used unchecked - presumably a GFP_NOIO mempool_alloc() cannot fail; confirm */
+	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
+	pplhdr = page_address(io->header_page);
+	clear_page(pplhdr);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);	/* reserved bytes are 0xff, not zero */
+	pplhdr->signature = cpu_to_le32(ppl_conf->signature);
+	/* each io_unit takes the next sequence number and stores it as the header generation */
+	io->seq = atomic64_add_return(1, &ppl_conf->seq);
+	pplhdr->generation = cpu_to_le64(io->seq);
+
+	return io;
+}
+
+static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
+{
+	struct ppl_io_unit *io = log->current_io;
+	struct ppl_header_entry *e = NULL;
+	struct ppl_header *pplhdr;
+	int i;
+	sector_t data_sector = 0;
+	int data_disks = 0;
+	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+	struct r5conf *conf = sh->raid_conf;
+
+	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/* check if current io_unit is full */
+	if (io && (io->pp_size == entry_space ||
+		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
+		pr_debug("%s: add io_unit blocked by seq: %llu\n",
+			 __func__, io->seq);
+		io = NULL;
+	}
+
+	/* add a new unit if there is none or the current is full */
+	if (!io) {
+		io = ppl_new_iounit(log, sh);
+		if (!io)
+			return -ENOMEM;
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&io->log_sibling, &log->io_list);
+		spin_unlock_irq(&log->io_list_lock);
+
+		log->current_io = io;
+	}
+	/* find the lowest sector and the number of non-parity devices being written */
+	for (i = 0; i < sh->disks; i++) {
+		struct r5dev *dev = &sh->dev[i];
+
+		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
+			if (!data_disks || dev->sector < data_sector)
+				data_sector = dev->sector;
+			data_disks++;
+		}
+	}
+	BUG_ON(!data_disks);
+
+	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+		 io->seq, (unsigned long long)data_sector, data_disks);
+
+	pplhdr = page_address(io->header_page);
+
+	if (io->entries_count > 0) {
+		struct ppl_header_entry *last =
+				&pplhdr->entries[io->entries_count - 1];
+		struct stripe_head *sh_last = list_last_entry(
+				&io->stripe_list, struct stripe_head, log_list);
+		u64 data_sector_last = le64_to_cpu(last->data_sector);
+		u32 data_size_last = le32_to_cpu(last->data_size);
+
+		/*
+		 * Check if we can append the stripe to the last entry. It must
+		 * be just after the last logged stripe and write to the same
+		 * disks. Use bit shift and logarithm to avoid 64-bit division.
+		 */
+		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+		    (data_sector >> ilog2(conf->chunk_sectors) ==
+		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
+		    ((data_sector - data_sector_last) * data_disks ==
+		     data_size_last >> 9))
+			e = last;
+	}
+	/* otherwise start a new entry; its running checksum is seeded with ~0 */
+	if (!e) {
+		e = &pplhdr->entries[io->entries_count++];
+		e->data_sector = cpu_to_le64(data_sector);
+		e->parity_disk = cpu_to_le32(sh->pd_idx);
+		e->checksum = cpu_to_le32(~0);
+	}
+	/* every written data device adds one page to the entry's data_size */
+	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
+
+	/* don't write any PP if full stripe write */
+	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+		le32_add_cpu(&e->pp_size, PAGE_SIZE);
+		io->pp_size += PAGE_SIZE;
+		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
+						    page_address(sh->ppl_page),
+						    PAGE_SIZE));
+	}
+	/* pending_stripes is dropped again in ppl_stripe_write_finished() */
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripes);
+	sh->ppl_io = io;
+
+	return 0;	/* stripe was added to an io_unit */
+}
+
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_io_unit *io = sh->ppl_io;
+	struct ppl_log *log;
+
+	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
+	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+		return -EAGAIN;	/* stripe is not logged */
+	}
+	/* one child log per raid disk: use the one of this stripe's parity disk */
+	log = &ppl_conf->child_logs[sh->pd_idx];
+
+	mutex_lock(&log->io_mutex);
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		mutex_unlock(&log->io_mutex);
+		return -EAGAIN;
+	}
+	/* sh->count raised here is dropped by raid5_release_stripe() in ppl_log_endio()/ppl_io_unit_finished() */
+	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+	/* on failure park the stripe on no_mem_stripes; ppl_io_unit_finished() releases it for retry */
+	if (ppl_log_stripe(log, sh)) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
+
+	mutex_unlock(&log->io_mutex);
+
+	return 0;	/* also returned when the stripe was only parked on no_mem_stripes */
+}
+
+static void ppl_log_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct stripe_head *sh, *next;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+	/* a failed PPL write is reported to md against this log's member device */
+	if (bio->bi_error)
+		md_error(ppl_conf->mddev, log->rdev);
+	/* the header page goes back to its pool once the log write has ended */
+	mempool_free(io->header_page, ppl_conf->meta_pool);
+	/* flag every logged stripe for handling and release it */
+	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
+		list_del_init(&sh->log_list);
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+}
+
+static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
+{
+	char b[BDEVNAME_SIZE];
+
+	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+		 __func__, io->seq, bio->bi_iter.bi_size,
+		 (unsigned long long)bio->bi_iter.bi_sector,
+		 bdevname(bio->bi_bdev, b));
+	/* io is only needed for the debug message above */
+	submit_bio(bio);
+}
+
+static void ppl_submit_iounit(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct ppl_header *pplhdr = page_address(io->header_page);
+	struct bio *bio = &io->bio;
+	struct stripe_head *sh;
+	int i;
+
+	for (i = 0; i < io->entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+
+		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
+			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
+		/* rescale data_sector from 512-byte sectors to block_size units, invert the running checksum */
+		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
+					     ilog2(ppl_conf->block_size >> 9));
+		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
+	}
+	/* the header checksum is computed over PPL_HEADER_SIZE bytes, so entries_count must be set first */
+	pplhdr->entries_count = cpu_to_le32(io->entries_count);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+	/* the inline bio starts with the header page, written with FUA at rdev->ppl.sector */
+	bio->bi_private = io;
+	bio->bi_end_io = ppl_log_endio;
+	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+	bio->bi_bdev = log->rdev->bdev;
+	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+	/* partial parity pages follow the header in stripe_list order */
+	list_for_each_entry(sh, &io->stripe_list, log_list) {
+		/* entries for full stripe writes have no partial parity */
+		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+			continue;
+		/* if the page does not fit, chain a new bio that continues at the previous bio's end sector */
+		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
+			struct bio *prev = bio;
+
+			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+					       ppl_conf->bs);
+			bio->bi_opf = prev->bi_opf;
+			bio->bi_bdev = prev->bi_bdev;
+			bio->bi_iter.bi_sector = bio_end_sector(prev);
+			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+
+			bio_chain(bio, prev);
+			ppl_submit_iounit_bio(io, prev);
+		}
+	}
+
+	ppl_submit_iounit_bio(io, bio);
+}
+
+static void ppl_submit_current_io(struct ppl_log *log)
+{
+	struct ppl_io_unit *io;
+
+	spin_lock_irq(&log->io_list_lock);
+	/* only the first io_unit on the list is considered, and only if not yet submitted */
+	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+				      log_sibling);
+	if (io && io->submitted)
+		io = NULL;
+
+	spin_unlock_irq(&log->io_list_lock);
+
+	if (io) {
+		io->submitted = true;
+		/* a submitted io_unit must stop being the one that accepts new stripes */
+		if (io == log->current_io)
+			log->current_io = NULL;
+
+		ppl_submit_iounit(io);
+	}
+}
+
+void ppl_write_stripe_run(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *log;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		log = &ppl_conf->child_logs[i];
+		/* io_mutex is the same lock ppl_write_stripe() holds while adding stripes */
+		mutex_lock(&log->io_mutex);
+		ppl_submit_current_io(log);
+		mutex_unlock(&log->io_mutex);
+	}
+}
+
+static void ppl_io_unit_finished(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	unsigned long flags;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	/* called once pending_stripes reached zero: unlink the io_unit and return it to the pool */
+	list_del(&io->log_sibling);
+	mempool_free(io, log->ppl_conf->io_pool);
+	/* an io_unit was just freed: release one stripe that earlier failed to get one */
+	if (!list_empty(&log->no_mem_stripes)) {
+		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
+							  struct stripe_head,
+							  log_list);
+		list_del_init(&sh->log_list);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+void ppl_stripe_write_finished(struct stripe_head *sh)
+{
+	struct ppl_io_unit *io;
+
+	io = sh->ppl_io;
+	sh->ppl_io = NULL;
+	/* the last stripe of an io_unit to finish completes the io_unit; stripes without one are ignored */
+	if (io && atomic_dec_and_test(&io->pending_stripes))
+		ppl_io_unit_finished(io);
+}
+
+static void __ppl_exit_log(struct ppl_conf *ppl_conf)
+{
+	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+
+	kfree(ppl_conf->child_logs);
+	/* NOTE(review): also the ppl_init_log() error path, so the members below may still be NULL */
+	mempool_destroy(ppl_conf->meta_pool);
+	if (ppl_conf->bs)
+		bioset_free(ppl_conf->bs);
+	mempool_destroy(ppl_conf->io_pool);
+	kmem_cache_destroy(ppl_conf->io_kc);
+
+	kfree(ppl_conf);
+}
+
+void ppl_exit_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+
+	if (ppl_conf) {	/* log_private may be unset, then there is nothing to free */
+		__ppl_exit_log(ppl_conf);
+		conf->log_private = NULL;
+	}
+}
+
+static int ppl_validate_rdev(struct md_rdev *rdev)
+{
+	char b[BDEVNAME_SIZE];
+	int ppl_data_sectors;
+	int ppl_size_new;
+
+	/*
+	 * The configured PPL size must be enough to store
+	 * the header and (at the very least) partial parity
+	 * for one stripe. Round it down to ensure the data
+	 * space is cleanly divisible by stripe size.
+	 */
+	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
+
+	if (ppl_data_sectors > 0)
+		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+
+	if (ppl_data_sectors <= 0) {
+		pr_warn("md/raid:%s: PPL space too small on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -ENOSPC;
+	}
+
+	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
+	/* reject a PPL area that runs into the data area, whether it sits before or after data_offset */
+	if ((rdev->ppl.sector < rdev->data_offset &&
+	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
+	    (rdev->ppl.sector >= rdev->data_offset &&
+	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
+		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+	/* for non-external metadata the PPL must not overlap the superblock either */
+	if (!rdev->mddev->external &&
+	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
+	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
+		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+	/* store the rounded-down size back into the rdev */
+	rdev->ppl.size = ppl_size_new;
+
+	return 0;	/* PPL placement is valid */
+}
+
+int ppl_init_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf;
+	struct mddev *mddev = conf->mddev;
+	int ret = 0;
+	int i;
+	bool need_cache_flush = false;	/* set when any member queue has QUEUE_FLAG_WC */
+
+	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+		 mdname(conf->mddev));
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	if (mddev->level != 5) {
+		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
+			mdname(mddev), mddev->level);
+		return -EINVAL;
+	}
+
+	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
+		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_warn("md/raid:%s PPL is not compatible with journal\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+	/* every failure from here on goes through err, which tears down via __ppl_exit_log() */
+	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+
+	ppl_conf->mddev = mddev;
+
+	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
+	if (!ppl_conf->io_kc) {
+		ret = -ENOMEM;	/* allocation failure, same code as the kcalloc() failure below */
+		goto err;
+	}
+
+	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
+	if (!ppl_conf->io_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
+	if (!ppl_conf->bs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+	if (!ppl_conf->meta_pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	atomic64_set(&ppl_conf->seq, 0);
+	/* non-external metadata: signature is derived from the array uuid, block size is 512 */
+	if (!mddev->external) {
+		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+		ppl_conf->block_size = 512;
+	} else {
+		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+	}
+	/* set up one child log per raid disk; slots without an rdev are initialized too */
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct ppl_log *log = &ppl_conf->child_logs[i];
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		mutex_init(&log->io_mutex);
+		spin_lock_init(&log->io_list_lock);
+		INIT_LIST_HEAD(&log->io_list);
+		INIT_LIST_HEAD(&log->no_mem_stripes);
+
+		log->ppl_conf = ppl_conf;
+		log->rdev = rdev;
+
+		if (rdev) {
+			struct request_queue *q;
+
+			ret = ppl_validate_rdev(rdev);
+			if (ret)
+				goto err;
+
+			q = bdev_get_queue(rdev->bdev);
+			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+				need_cache_flush = true;
+		}
+	}
+
+	if (need_cache_flush)
+		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+			mdname(mddev));
+
+	conf->log_private = ppl_conf;
+
+	return 0;	/* negative errno is returned via err on failure */
+err:
+	__ppl_exit_log(ppl_conf);
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f575f40d2acb..6b86e0826afe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (raid5_has_ppl(sh->raid_conf)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (raid5_has_ppl(sh->raid_conf) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && raid5_has_ppl(conf)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed because for a single stripe_head we can
+		 * only have one PPL entry at a time, which describes one data
+		 * range. Not really an overlap, but wait_for_overlap can be
+		 * used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
@@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	sector_t newsize;
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6dd295a80ee1..ba5b7a3790af 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -224,10 +224,16 @@ struct stripe_head {
 	spinlock_t		batch_lock; /* only header's lock is useful */
 	struct list_head	batch_list; /* protected by head's batch lock*/
 
-	struct r5l_io_unit	*log_io;
+	union {
+		struct r5l_io_unit	*log_io;
+		struct ppl_io_unit	*ppl_io;
+	};
+
 	struct list_head	log_list;
 	sector_t		log_start; /* first meta block on the journal */
 	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
+
+	struct page		*ppl_page; /* partial parity of this stripe */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +406,7 @@ enum {
 	STRIPE_OP_BIODRAIN,
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
+	STRIPE_OP_PARTIAL_PARITY,
 };
 
 /*
@@ -696,6 +703,7 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+	void			*log_private;
 
 	spinlock_t		pending_bios_lock;
 	bool			batch_bio_dispatch;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe2112810c43..d9a1ead867b9 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -398,4 +398,31 @@ struct r5l_meta_block {
 
 #define R5LOG_VERSION 0x1
 #define R5LOG_MAGIC 0x6433c509
+
+struct ppl_header_entry {
+	__le64 data_sector;	/* raid sector of the new data */
+	__le32 pp_size;		/* length of partial parity */
+	__le32 data_size;	/* length of data */
+	__le32 parity_disk;	/* member disk containing parity */
+	__le32 checksum;	/* checksum of partial parity data for this
+				 * entry (~crc32c) */
+} __attribute__ ((__packed__));
+
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+struct ppl_header {
+	__u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
+	__le32 signature;		/* signature (family number of volume) */
+	__le32 padding;			/* zero pad */
+	__le64 generation;		/* generation number of the header */
+	__le32 entries_count;		/* number of entries in entry array */
+	__le32 checksum;		/* checksum of the header (~crc32c) */
+	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
 #endif
-- 
cgit v1.2.3


From 210f7cdcf088c304ee0533ffd33d6f71a8821862 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:14 +1100
Subject: percpu-refcount: support synchronous switch to atomic mode.

percpu_ref_switch_to_atomic_sync() schedules the switch to atomic mode, then
waits for it to complete.

Also export percpu_ref_switch_to_* so they can be used from modules.

This will be used in md/raid to count the number of pending write
requests to an array.
We occasionally need to check if the count is zero, but most often
we don't care.
We always want updates to the counter to be fast, as in some cases
we count every 4K page.

Signed-off-by: NeilBrown <neilb@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/linux/percpu-refcount.h |  1 +
 lib/percpu-refcount.c           | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 3a481a49546e..c13dceb87b60 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref,
 void percpu_ref_exit(struct percpu_ref *ref);
 void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_switch);
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill);
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9ac959ef4cae..fe03c6d52761 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -260,6 +260,22 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 
 	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
+
+/**
+ * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ *
+ * Schedule switching the ref to atomic mode, and wait for the
+ * switch to complete.  Caller must ensure that no other thread
+ * will switch back to percpu mode.
+ */
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
+{
+	percpu_ref_switch_to_atomic(ref, NULL);
+	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
+}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
 
 /**
  * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
@@ -290,6 +306,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 
 	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
 
 /**
  * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
-- 
cgit v1.2.3


From 6f8802852f7e58a12177a86179803b9efaad98e2 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 17 Mar 2017 00:12:29 +0800
Subject: block: introduce bio_copy_data_partial

Turns out we can use bio_copy_data in raid1's write behind,
and we can make alloc_behind_pages() more clean/efficient,
but we need a partial version of bio_copy_data().

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c         | 60 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/bio.h |  2 ++
 2 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..1ccff0dace89 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1025,19 +1025,8 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_alloc_pages);
 
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
+static void __bio_copy_data(struct bio *dst, struct bio *src,
+			    int offset, int size)
 {
 	struct bvec_iter src_iter, dst_iter;
 	struct bio_vec src_bv, dst_bv;
@@ -1047,6 +1036,12 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 	src_iter = src->bi_iter;
 	dst_iter = dst->bi_iter;
 
+	/* for supporting partial copy */
+	if (offset || size != src->bi_iter.bi_size) {
+		bio_advance_iter(src, &src_iter, offset);
+		src_iter.bi_size = size;
+	}
+
 	while (1) {
 		if (!src_iter.bi_size) {
 			src = src->bi_next;
@@ -1083,8 +1078,47 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 		bio_advance_iter(dst, &dst_iter, bytes);
 	}
 }
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+	__bio_copy_data(dst, src, 0, src->bi_iter.bi_size);
+}
 EXPORT_SYMBOL(bio_copy_data);
 
+/**
+ * bio_copy_data_partial - copy partial contents of data buffers from one
+ * chain of bios to another
+ * @dst: destination bio list
+ * @src: source bio list
+ * @offset: starting copy from the offset
+ * @size: how many bytes to copy
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data_partial(struct bio *dst, struct bio *src,
+			   int offset, int size)
+{
+	__bio_copy_data(dst, src, offset, size);
+
+}
+EXPORT_SYMBOL(bio_copy_data_partial);
+
 struct bio_map_data {
 	int is_our_pages;
 	struct iov_iter iter;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..42b62a0288b0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -468,6 +468,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 #endif
 
 extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern void bio_copy_data_partial(struct bio *dst, struct bio *src,
+				  int offset, int size);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 extern void bio_free_pages(struct bio *bio);
 
-- 
cgit v1.2.3


From f45958756fef552436e4a63029a168495920026e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 24 Mar 2017 10:34:43 -0700
Subject: block: remove bio_clone_bioset_partial()

commit c18a1e0(block: introduce bio_clone_bioset_partial()) introduced
bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is
rewritten by Ming. We don't need the API any more, so revert the commit.

Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c         | 61 ++++++++++++-----------------------------------------
 include/linux/bio.h | 11 ++--------
 2 files changed, 15 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 1ccff0dace89..036435995c55 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -631,20 +631,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-				      struct bio_set *bs, int offset,
-				      int size)
+/**
+ * 	bio_clone_bioset - clone a bio
+ * 	@bio_src: bio to clone
+ *	@gfp_mask: allocation priority
+ *	@bs: bio_set to allocate from
+ *
+ *	Clone bio. Caller will own the returned bio, but not the actual data it
+ *	points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+			     struct bio_set *bs)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	struct bio *bio;
-	struct bvec_iter iter_src = bio_src->bi_iter;
-
-	/* for supporting partial clone */
-	if (offset || size != bio_src->bi_iter.bi_size) {
-		bio_advance_iter(bio_src, &iter_src, offset);
-		iter_src.bi_size = size;
-	}
 
 	/*
 	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -668,8 +669,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	 *    __bio_clone_fast() anyways.
 	 */
 
-	bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
-			       &iter_src), bs);
+	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
 	if (!bio)
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
@@ -686,7 +686,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
 		break;
 	default:
-		__bio_for_each_segment(bv, bio_src, iter, iter_src)
+		bio_for_each_segment(bv, bio_src, iter)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
 		break;
 	}
@@ -705,43 +705,8 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 
 	return bio;
 }
-
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
-{
-	return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
-				  bio_src->bi_iter.bi_size);
-}
 EXPORT_SYMBOL(bio_clone_bioset);
 
-/**
- * 	bio_clone_bioset_partial - clone a partial bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *	@offset: cloned starting from the offset
- *	@size: size for the cloned bio
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
-				     struct bio_set *bs, int offset,
-				     int size)
-{
-	return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
-}
-EXPORT_SYMBOL(bio_clone_bioset_partial);
-
 /**
  *	bio_add_pc_page	-	attempt to add page to bio
  *	@q: the target queue
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 42b62a0288b0..fafef6343d1b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
-static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
+static inline unsigned bio_segments(struct bio *bio)
 {
 	unsigned segs = 0;
 	struct bio_vec bv;
@@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 		break;
 	}
 
-	__bio_for_each_segment(bv, bio, iter, *bvec)
+	bio_for_each_segment(bv, bio, iter)
 		segs++;
 
 	return segs;
 }
 
-static inline unsigned bio_segments(struct bio *bio)
-{
-	return __bio_segments(bio, &bio->bi_iter);
-}
-
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
  * something like:
@@ -389,8 +384,6 @@ extern void bio_put(struct bio *);
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
-extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t,
-					    struct bio_set *, int, int);
 
 extern struct bio_set *fs_bio_set;
 
-- 
cgit v1.2.3


From 50512625da06c41517cb596f51b923ce15f401a4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 5 Apr 2017 14:05:50 +1000
Subject: Revert "block: introduce bio_copy_data_partial"

This reverts commit 6f8802852f7e58a12177a86179803b9efaad98e2.
bio_copy_data_partial() is no longer needed.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c         | 60 ++++++++++++-----------------------------------------
 include/linux/bio.h |  2 --
 2 files changed, 13 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 036435995c55..12c2837c4277 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -990,8 +990,19 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_alloc_pages);
 
-static void __bio_copy_data(struct bio *dst, struct bio *src,
-			    int offset, int size)
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
 {
 	struct bvec_iter src_iter, dst_iter;
 	struct bio_vec src_bv, dst_bv;
@@ -1001,12 +1012,6 @@ static void __bio_copy_data(struct bio *dst, struct bio *src,
 	src_iter = src->bi_iter;
 	dst_iter = dst->bi_iter;
 
-	/* for supporting partial copy */
-	if (offset || size != src->bi_iter.bi_size) {
-		bio_advance_iter(src, &src_iter, offset);
-		src_iter.bi_size = size;
-	}
-
 	while (1) {
 		if (!src_iter.bi_size) {
 			src = src->bi_next;
@@ -1043,47 +1048,8 @@ static void __bio_copy_data(struct bio *dst, struct bio *src,
 		bio_advance_iter(dst, &dst_iter, bytes);
 	}
 }
-
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
-{
-	__bio_copy_data(dst, src, 0, src->bi_iter.bi_size);
-}
 EXPORT_SYMBOL(bio_copy_data);
 
-/**
- * bio_copy_data_partial - copy partial contents of data buffers from one
- * chain of bios to another
- * @dst: destination bio list
- * @src: source bio list
- * @offset: starting copy from the offset
- * @size: how many bytes to copy
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data_partial(struct bio *dst, struct bio *src,
-			   int offset, int size)
-{
-	__bio_copy_data(dst, src, offset, size);
-
-}
-EXPORT_SYMBOL(bio_copy_data_partial);
-
 struct bio_map_data {
 	int is_our_pages;
 	struct iov_iter iter;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index fafef6343d1b..7cf8a6c70a3f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -461,8 +461,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 #endif
 
 extern void bio_copy_data(struct bio *dst, struct bio *src);
-extern void bio_copy_data_partial(struct bio *dst, struct bio *src,
-				  int offset, int size);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 extern void bio_free_pages(struct bio *bio);
 
-- 
cgit v1.2.3


From fc6d2a3ca59d5656d5b0ac3b25ecf493e4614abd Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Wed, 19 Apr 2017 10:48:06 +0200
Subject: uapi: fix linux/raid/md_p.h userspace compilation error

Use __le32 and __le64 instead of u32 and u64.

This fixes klibc build error:
  In file included from /klibc/usr/klibc/../include/sys/md.h:30:0,
                   from /klibc/usr/kinit/do_mounts_md.c:19:
  /linux-next/usr/include/linux/raid/md_p.h:414:51: error: 'u32' undeclared here (not in a function)
    (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))

Reported-by: Greg Thelen <gthelen@google.com>
Reported-by: Nigel Croxon <ncroxon@redhat.com>
Tested-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/uapi/linux/raid/md_p.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index d9a1ead867b9..d500bd224979 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -411,7 +411,7 @@ struct ppl_header_entry {
 #define PPL_HEADER_SIZE 4096
 #define PPL_HDR_RESERVED 512
 #define PPL_HDR_ENTRY_SPACE \
-	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__le32) - sizeof(__le64))
 #define PPL_HDR_MAX_ENTRIES \
 	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
 
-- 
cgit v1.2.3