From 7405f74badf46b5d023c5d2b670b4471525f6c91 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 11:10:43 -0700
Subject: dmaengine: refactor dmaengine around dma_async_tx_descriptor

The current dmaengine interface defines mutliple routines per operation,
i.e. dma_async_memcpy_buf_to_buf, dma_async_memcpy_buf_to_page etc.  Adding
more operation types (xor, crc, etc) to this model would result in an
unmanageable number of method permutations.

	Are we really going to add a set of hooks for each DMA engine
	whizbang feature?
		- Jeff Garzik

The descriptor creation process is refactored using the new common
dma_async_tx_descriptor structure.  Instead of per driver
do_<operation>_<dest>_to_<src> methods, drivers integrate
dma_async_tx_descriptor into their private software descriptor and then
define a 'prep' routine per operation.  The prep routine allocates a
descriptor and ensures that the tx_set_src, tx_set_dest, tx_submit routines
are valid.  Descriptor creation and submission becomes:

struct dma_device *dev;
struct dma_chan *chan;
struct dma_async_tx_descriptor *tx;

tx = dev->device_prep_dma_<operation>(chan, len, int_flag)
tx->tx_set_src(dma_addr_t, tx, index /* for multi-source ops */)
tx->tx_set_dest(dma_addr_t, tx, index)
tx->tx_submit(tx)

In addition to the refactoring, dma_async_tx_descriptor also lays the
groundwork for definining cross-channel-operation dependencies, and a
callback facility for asynchronous notification of operation completion.

Changelog:
* drop dma mapping methods, suggested by Chris Leech
* fix ioat_dma_dependency_added, also caught by Andrew Morton
* fix dma_sync_wait, change from Andrew Morton
* uninline large functions, change from Andrew Morton
* add tx->callback = NULL to dmaengine calls to interoperate with async_tx
  calls
* hookup ioat_tx_submit
* convert channel capabilities to a 'cpumask_t like' bitmap
* removed DMA_TX_ARRAY_INIT, no longer needed
* checkpatch.pl fixes
* make set_src, set_dest, and tx_submit descriptor specific methods
* fixup git-ioat merge
* move group_list and phys to dma_async_tx_descriptor

Cc: Jeff Garzik <jeff@garzik.org>
Cc: Chris Leech <christopher.leech@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dmaengine.h | 237 +++++++++++++++++++++++++++++-----------------
 1 file changed, 149 insertions(+), 88 deletions(-)

(limited to 'include/linux')
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c94d8f1d62e5..3de1cf71031a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -21,13 +21,12 @@
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
 
-#ifdef CONFIG_DMA_ENGINE
-
 #include <linux/device.h>
 #include <linux/uio.h>
 #include <linux/kref.h>
 #include <linux/completion.h>
 #include <linux/rcupdate.h>
+#include <linux/dma-mapping.h>
 
 /**
  * enum dma_event - resource PNP/power managment events
@@ -64,6 +63,31 @@ enum dma_status {
 	DMA_ERROR,
 };
 
+/**
+ * enum dma_transaction_type - DMA transaction types/indexes
+ */
+enum dma_transaction_type {
+	DMA_MEMCPY,
+	DMA_XOR,
+	DMA_PQ_XOR,
+	DMA_DUAL_XOR,
+	DMA_PQ_UPDATE,
+	DMA_ZERO_SUM,
+	DMA_PQ_ZERO_SUM,
+	DMA_MEMSET,
+	DMA_MEMCPY_CRC32C,
+	DMA_INTERRUPT,
+};
+
+/* last transaction type for creation of the capabilities mask */
+#define DMA_TX_TYPE_END (DMA_INTERRUPT + 1)
+
+/**
+ * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
+ * See linux/cpumask.h
+ */
+typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t;
+
 /**
  * struct dma_chan_percpu - the per-CPU part of struct dma_chan
  * @refcount: local_t used for open-coded "bigref" counting
@@ -157,48 +181,106 @@ struct dma_client {
 	struct list_head	global_node;
 };
 
+typedef void (*dma_async_tx_callback)(void *dma_async_param);
+/**
+ * struct dma_async_tx_descriptor - async transaction descriptor
+ * ---dma generic offload fields---
+ * @cookie: tracking cookie for this transaction, set to -EBUSY if
+ *	this tx is sitting on a dependency list
+ * @ack: the descriptor can not be reused until the client acknowledges
+ *	receipt, i.e. has has a chance to establish any dependency chains
+ * @phys: physical address of the descriptor
+ * @tx_list: driver common field for operations that require multiple
+ *	descriptors
+ * @chan: target channel for this operation
+ * @tx_submit: set the prepared descriptor(s) to be executed by the engine
+ * @tx_set_dest: set a destination address in a hardware descriptor
+ * @tx_set_src: set a source address in a hardware descriptor
+ * @callback: routine to call after this operation is complete
+ * @callback_param: general parameter to pass to the callback routine
+ * ---async_tx api specific fields---
+ * @depend_list: at completion this list of transactions are submitted
+ * @depend_node: allow this transaction to be executed after another
+ *	transaction has completed, possibly on another channel
+ * @parent: pointer to the next level up in the dependency chain
+ * @lock: protect the dependency list
+ */
+struct dma_async_tx_descriptor {
+	dma_cookie_t cookie;
+	int ack;
+	dma_addr_t phys;
+	struct list_head tx_list;
+	struct dma_chan *chan;
+	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
+	void (*tx_set_dest)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	void (*tx_set_src)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	dma_async_tx_callback callback;
+	void *callback_param;
+	struct list_head depend_list;
+	struct list_head depend_node;
+	struct dma_async_tx_descriptor *parent;
+	spinlock_t lock;
+};
+
 /**
  * struct dma_device - info on the entity supplying DMA services
  * @chancnt: how many DMA channels are supported
  * @channels: the list of struct dma_chan
  * @global_node: list_head for global dma_device_list
+ * @cap_mask: one or more dma_capability flags
+ * @max_xor: maximum number of xor sources, 0 if no capability
  * @refcount: reference count
  * @done: IO completion struct
  * @dev_id: unique device ID
+ * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
  *	number of allocated descriptors
  * @device_free_chan_resources: release DMA channel's resources
- * @device_memcpy_buf_to_buf: memcpy buf pointer to buf pointer
- * @device_memcpy_buf_to_pg: memcpy buf pointer to struct page
- * @device_memcpy_pg_to_pg: memcpy struct page/offset to struct page/offset
- * @device_memcpy_complete: poll the status of an IOAT DMA transaction
- * @device_memcpy_issue_pending: push appended descriptors to hardware
+ * @device_prep_dma_memcpy: prepares a memcpy operation
+ * @device_prep_dma_xor: prepares a xor operation
+ * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_memset: prepares a memset operation
+ * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
+ * @device_dependency_added: async_tx notifies the channel about new deps
+ * @device_issue_pending: push pending transactions to hardware
  */
 struct dma_device {
 
 	unsigned int chancnt;
 	struct list_head channels;
 	struct list_head global_node;
+	dma_cap_mask_t  cap_mask;
+	int max_xor;
 
 	struct kref refcount;
 	struct completion done;
 
 	int dev_id;
+	struct device *dev;
 
 	int (*device_alloc_chan_resources)(struct dma_chan *chan);
 	void (*device_free_chan_resources)(struct dma_chan *chan);
-	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
-			void *dest, void *src, size_t len);
-	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
-			struct page *page, unsigned int offset, void *kdata,
-			size_t len);
-	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
-			struct page *dest_pg, unsigned int dest_off,
-			struct page *src_pg, unsigned int src_off, size_t len);
-	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+
+	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
+		struct dma_chan *chan, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		u32 *result, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
+		struct dma_chan *chan, int value, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
+		struct dma_chan *chan);
+
+	void (*device_dependency_added)(struct dma_chan *chan);
+	enum dma_status (*device_is_tx_complete)(struct dma_chan *chan,
 			dma_cookie_t cookie, dma_cookie_t *last,
 			dma_cookie_t *used);
-	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
 /* --- public DMA engine API --- */
@@ -207,96 +289,72 @@ struct dma_client *dma_async_client_register(dma_event_callback event_callback);
 void dma_async_client_unregister(struct dma_client *client);
 void dma_async_client_chan_request(struct dma_client *client,
 		unsigned int number);
+dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len);
+dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len);
+dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len);
+void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
+	struct dma_chan *chan);
 
-/**
- * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
- * @chan: DMA channel to offload copy to
- * @dest: destination address (virtual)
- * @src: source address (virtual)
- * @len: length
- *
- * Both @dest and @src must be mappable to a bus address according to the
- * DMA mapping API rules for streaming mappings.
- * Both @dest and @src must stay memory resident (kernel memory or locked
- * user space pages).
- */
-static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
-	void *dest, void *src, size_t len)
-{
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
 
-	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+static inline void
+async_tx_ack(struct dma_async_tx_descriptor *tx)
+{
+	tx->ack = 1;
 }
 
-/**
- * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
- * @chan: DMA channel to offload copy to
- * @page: destination page
- * @offset: offset in page to copy to
- * @kdata: source address (virtual)
- * @len: length
- *
- * Both @page/@offset and @kdata must be mappable to a bus address according
- * to the DMA mapping API rules for streaming mappings.
- * Both @page/@offset and @kdata must stay memory resident (kernel memory or
- * locked user space pages)
- */
-static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
-	struct page *page, unsigned int offset, void *kdata, size_t len)
+#define first_dma_cap(mask) __first_dma_cap(&(mask))
+static inline int __first_dma_cap(const dma_cap_mask_t *srcp)
 {
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	return min_t(int, DMA_TX_TYPE_END,
+		find_first_bit(srcp->bits, DMA_TX_TYPE_END));
+}
 
-	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
-	                                             kdata, len);
+#define next_dma_cap(n, mask) __next_dma_cap((n), &(mask))
+static inline int __next_dma_cap(int n, const dma_cap_mask_t *srcp)
+{
+	return min_t(int, DMA_TX_TYPE_END,
+		find_next_bit(srcp->bits, DMA_TX_TYPE_END, n+1));
 }
 
-/**
- * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
- * @chan: DMA channel to offload copy to
- * @dest_pg: destination page
- * @dest_off: offset in page to copy to
- * @src_pg: source page
- * @src_off: offset in page to copy from
- * @len: length
- *
- * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
- * address according to the DMA mapping API rules for streaming mappings.
- * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
- * (kernel memory or locked user space pages).
- */
-static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
-	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
-	unsigned int src_off, size_t len)
+#define dma_cap_set(tx, mask) __dma_cap_set((tx), &(mask))
+static inline void
+__dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp)
 {
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	set_bit(tx_type, dstp->bits);
+}
 
-	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
-	                                            src_pg, src_off, len);
+#define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask))
+static inline int
+__dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp)
+{
+	return test_bit(tx_type, srcp->bits);
 }
 
+#define for_each_dma_cap_mask(cap, mask) \
+	for ((cap) = first_dma_cap(mask);	\
+		(cap) < DMA_TX_TYPE_END;	\
+		(cap) = next_dma_cap((cap), (mask)))
+
 /**
- * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * dma_async_issue_pending - flush pending transactions to HW
  * @chan: target DMA channel
  *
  * This allows drivers to push copies to HW in batches,
  * reducing MMIO writes where possible.
  */
-static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+static inline void dma_async_issue_pending(struct dma_chan *chan)
 {
-	return chan->device->device_memcpy_issue_pending(chan);
+	return chan->device->device_issue_pending(chan);
 }
 
+#define dma_async_memcpy_issue_pending(chan) dma_async_issue_pending(chan)
+
 /**
- * dma_async_memcpy_complete - poll for transaction completion
+ * dma_async_is_tx_complete - poll for transaction completion
  * @chan: DMA channel
  * @cookie: transaction identifier to check status of
  * @last: returns last completed cookie, can be NULL
@@ -306,12 +364,15 @@ static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
  * internal state and can be used with dma_async_is_complete() to check
  * the status of multiple cookies without re-checking hardware state.
  */
-static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan,
 	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
 {
-	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+	return chan->device->device_is_tx_complete(chan, cookie, last, used);
 }
 
+#define dma_async_memcpy_complete(chan, cookie, last, used)\
+	dma_async_is_tx_complete(chan, cookie, last, used)
+
 /**
  * dma_async_is_complete - test a cookie against chan state
  * @cookie: transaction identifier to test status of
@@ -334,6 +395,7 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 	return DMA_IN_PROGRESS;
 }
 
+enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
 
 /* --- DMA device --- */
 
@@ -362,5 +424,4 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
 	struct dma_pinned_list *pinned_list, struct page *page,
 	unsigned int offset, size_t len);
 
-#endif /* CONFIG_DMA_ENGINE */
 #endif /* DMAENGINE_H */
-- 
cgit v1.2.3


From d379b01e9087a582d58f4b678208a4f8d8376fe7 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:42 -0700
Subject: dmaengine: make clients responsible for managing channels

The current implementation assumes that a channel will only be used by one
client at a time.  In order to enable channel sharing the dmaengine core is
changed to a model where clients subscribe to channel-available-events.
Instead of tracking how many channels a client wants and how many it has
received the core just broadcasts the available channels and lets the
clients optionally take a reference.  The core learns about the clients'
needs at dma_event_callback time.

In support of multiple operation types, clients can specify a capability
mask to only be notified of channels that satisfy a certain set of
capabilities.

Changelog:
* removed DMA_TX_ARRAY_INIT, no longer needed
* dma_client_chan_free -> dma_chan_release: switch to global reference
  counting only at device unregistration time, before it was also happening
  at client unregistration time
* clients now return dma_state_client to dmaengine (ack, dup, nak)
* checkpatch.pl fixes
* fixup merge with git-ioat

Cc: Chris Leech <christopher.leech@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/dmaengine.c   | 217 ++++++++++++++++++++++++----------------------
 drivers/dma/ioatdma.c     |   1 -
 drivers/dma/ioatdma.h     |   3 -
 include/linux/dmaengine.h |  58 ++++++++-----
 net/core/dev.c            | 112 ++++++++++++++++--------
 5 files changed, 224 insertions(+), 167 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 404cc7b6e705..82489923af09 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -37,11 +37,11 @@
  * Each device has a channels list, which runs unlocked but is never modified
  * once the device is registered, it's just setup by the driver.
  *
- * Each client has a channels list, it's only modified under the client->lock
- * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ * Each client is responsible for keeping track of the channels it uses.  See
+ * the definition of dma_event_callback in dmaengine.h.
  *
  * Each device has a kref, which is initialized to 1 when the device is
- * registered. A kref_put is done for each class_device registered.  When the
+ * registered. A kref_get is done for each class_device registered.  When the
  * class_device is released, the coresponding kref_put is done in the release
  * method. Every time one of the device's channels is allocated to a client,
  * a kref_get occurs.  When the channel is freed, the coresponding kref_put
@@ -51,10 +51,12 @@
  * references to finish.
  *
  * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
- * with a kref and a per_cpu local_t.  A single reference is set when on an
- * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
- * extra reference per outstanding transaction.  The relase function does a
- * kref_put on the device. -ChrisL
+ * with a kref and a per_cpu local_t.  A dma_chan_get is called when a client
+ * signals that it wants to use a channel, and dma_chan_put is called when
+ * a channel is removed or a client using it is unregesitered.  A client can
+ * take extra references per outstanding transaction, as is the case with
+ * the NET DMA client.  The release function does a kref_put on the device.
+ *	-ChrisL, DanW
  */
 
 #include <linux/init.h>
@@ -102,8 +104,19 @@ static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
 static ssize_t show_in_use(struct class_device *cd, char *buf)
 {
 	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	int in_use = 0;
+
+	if (unlikely(chan->slow_ref) &&
+		atomic_read(&chan->refcount.refcount) > 1)
+		in_use = 1;
+	else {
+		if (local_read(&(per_cpu_ptr(chan->local,
+			get_cpu())->refcount)) > 0)
+			in_use = 1;
+		put_cpu();
+	}
 
-	return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+	return sprintf(buf, "%d\n", in_use);
 }
 
 static struct class_device_attribute dma_class_attrs[] = {
@@ -129,42 +142,53 @@ static struct class dma_devclass = {
 
 /* --- client and device registration --- */
 
+#define dma_chan_satisfies_mask(chan, mask) \
+	__dma_chan_satisfies_mask((chan), &(mask))
+static int
+__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want)
+{
+	dma_cap_mask_t has;
+
+	bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits,
+		DMA_TX_TYPE_END);
+	return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END);
+}
+
 /**
- * dma_client_chan_alloc - try to allocate a channel to a client
+ * dma_client_chan_alloc - try to allocate channels to a client
  * @client: &dma_client
  *
  * Called with dma_list_mutex held.
  */
-static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+static void dma_client_chan_alloc(struct dma_client *client)
 {
 	struct dma_device *device;
 	struct dma_chan *chan;
-	unsigned long flags;
 	int desc;	/* allocated descriptor count */
+	enum dma_state_client ack;
 
-	/* Find a channel, any DMA engine will do */
-	list_for_each_entry(device, &dma_device_list, global_node) {
+	/* Find a channel */
+	list_for_each_entry(device, &dma_device_list, global_node)
 		list_for_each_entry(chan, &device->channels, device_node) {
-			if (chan->client)
+			if (!dma_chan_satisfies_mask(chan, client->cap_mask))
 				continue;
 
 			desc = chan->device->device_alloc_chan_resources(chan);
 			if (desc >= 0) {
-				kref_get(&device->refcount);
-				kref_init(&chan->refcount);
-				chan->slow_ref = 0;
-				INIT_RCU_HEAD(&chan->rcu);
-				chan->client = client;
-				spin_lock_irqsave(&client->lock, flags);
-				list_add_tail_rcu(&chan->client_node,
-				                  &client->channels);
-				spin_unlock_irqrestore(&client->lock, flags);
-				return chan;
+				ack = client->event_callback(client,
+						chan,
+						DMA_RESOURCE_AVAILABLE);
+
+				/* we are done once this client rejects
+				 * an available resource
+				 */
+				if (ack == DMA_ACK) {
+					dma_chan_get(chan);
+					kref_get(&device->refcount);
+				} else if (ack == DMA_NAK)
+					return;
 			}
 		}
-	}
-
-	return NULL;
 }
 
 enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
@@ -193,7 +217,6 @@ void dma_chan_cleanup(struct kref *kref)
 {
 	struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
 	chan->device->device_free_chan_resources(chan);
-	chan->client = NULL;
 	kref_put(&chan->device->refcount, dma_async_device_cleanup);
 }
 EXPORT_SYMBOL(dma_chan_cleanup);
@@ -209,7 +232,7 @@ static void dma_chan_free_rcu(struct rcu_head *rcu)
 	kref_put(&chan->refcount, dma_chan_cleanup);
 }
 
-static void dma_client_chan_free(struct dma_chan *chan)
+static void dma_chan_release(struct dma_chan *chan)
 {
 	atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
 	chan->slow_ref = 1;
@@ -217,70 +240,57 @@ static void dma_client_chan_free(struct dma_chan *chan)
 }
 
 /**
- * dma_chans_rebalance - reallocate channels to clients
- *
- * When the number of DMA channel in the system changes,
- * channels need to be rebalanced among clients.
+ * dma_chans_notify_available - broadcast available channels to the clients
  */
-static void dma_chans_rebalance(void)
+static void dma_clients_notify_available(void)
 {
 	struct dma_client *client;
-	struct dma_chan *chan;
-	unsigned long flags;
 
 	mutex_lock(&dma_list_mutex);
 
-	list_for_each_entry(client, &dma_client_list, global_node) {
-		while (client->chans_desired > client->chan_count) {
-			chan = dma_client_chan_alloc(client);
-			if (!chan)
-				break;
-			client->chan_count++;
-			client->event_callback(client,
-	                                       chan,
-	                                       DMA_RESOURCE_ADDED);
-		}
-		while (client->chans_desired < client->chan_count) {
-			spin_lock_irqsave(&client->lock, flags);
-			chan = list_entry(client->channels.next,
-			                  struct dma_chan,
-			                  client_node);
-			list_del_rcu(&chan->client_node);
-			spin_unlock_irqrestore(&client->lock, flags);
-			client->chan_count--;
-			client->event_callback(client,
-			                       chan,
-			                       DMA_RESOURCE_REMOVED);
-			dma_client_chan_free(chan);
-		}
-	}
+	list_for_each_entry(client, &dma_client_list, global_node)
+		dma_client_chan_alloc(client);
 
 	mutex_unlock(&dma_list_mutex);
 }
 
 /**
- * dma_async_client_register - allocate and register a &dma_client
- * @event_callback: callback for notification of channel addition/removal
+ * dma_chans_notify_available - tell the clients that a channel is going away
+ * @chan: channel on its way out
  */
-struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+static void dma_clients_notify_removed(struct dma_chan *chan)
 {
 	struct dma_client *client;
+	enum dma_state_client ack;
 
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (!client)
-		return NULL;
+	mutex_lock(&dma_list_mutex);
+
+	list_for_each_entry(client, &dma_client_list, global_node) {
+		ack = client->event_callback(client, chan,
+				DMA_RESOURCE_REMOVED);
+
+		/* client was holding resources for this channel so
+		 * free it
+		 */
+		if (ack == DMA_ACK) {
+			dma_chan_put(chan);
+			kref_put(&chan->device->refcount,
+				dma_async_device_cleanup);
+		}
+	}
 
-	INIT_LIST_HEAD(&client->channels);
-	spin_lock_init(&client->lock);
-	client->chans_desired = 0;
-	client->chan_count = 0;
-	client->event_callback = event_callback;
+	mutex_unlock(&dma_list_mutex);
+}
 
+/**
+ * dma_async_client_register - register a &dma_client
+ * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask'
+ */
+void dma_async_client_register(struct dma_client *client)
+{
 	mutex_lock(&dma_list_mutex);
 	list_add_tail(&client->global_node, &dma_client_list);
 	mutex_unlock(&dma_list_mutex);
-
-	return client;
 }
 EXPORT_SYMBOL(dma_async_client_register);
 
@@ -292,40 +302,42 @@ EXPORT_SYMBOL(dma_async_client_register);
  */
 void dma_async_client_unregister(struct dma_client *client)
 {
+	struct dma_device *device;
 	struct dma_chan *chan;
+	enum dma_state_client ack;
 
 	if (!client)
 		return;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(chan, &client->channels, client_node)
-		dma_client_chan_free(chan);
-	rcu_read_unlock();
-
 	mutex_lock(&dma_list_mutex);
+	/* free all channels the client is holding */
+	list_for_each_entry(device, &dma_device_list, global_node)
+		list_for_each_entry(chan, &device->channels, device_node) {
+			ack = client->event_callback(client, chan,
+				DMA_RESOURCE_REMOVED);
+
+			if (ack == DMA_ACK) {
+				dma_chan_put(chan);
+				kref_put(&chan->device->refcount,
+					dma_async_device_cleanup);
+			}
+		}
+
 	list_del(&client->global_node);
 	mutex_unlock(&dma_list_mutex);
-
-	kfree(client);
-	dma_chans_rebalance();
 }
 EXPORT_SYMBOL(dma_async_client_unregister);
 
 /**
- * dma_async_client_chan_request - request DMA channels
- * @client: &dma_client
- * @number: count of DMA channels requested
- *
- * Clients call dma_async_client_chan_request() to specify how many
- * DMA channels they need, 0 to free all currently allocated.
- * The resulting allocations/frees are indicated to the client via the
- * event callback.
+ * dma_async_client_chan_request - send all available channels to the
+ * client that satisfy the capability mask
+ * @client - requester
  */
-void dma_async_client_chan_request(struct dma_client *client,
-			unsigned int number)
+void dma_async_client_chan_request(struct dma_client *client)
 {
-	client->chans_desired = number;
-	dma_chans_rebalance();
+	mutex_lock(&dma_list_mutex);
+	dma_client_chan_alloc(client);
+	mutex_unlock(&dma_list_mutex);
 }
 EXPORT_SYMBOL(dma_async_client_chan_request);
 
@@ -386,13 +398,16 @@ int dma_async_device_register(struct dma_device *device)
 		}
 
 		kref_get(&device->refcount);
+		kref_init(&chan->refcount);
+		chan->slow_ref = 0;
+		INIT_RCU_HEAD(&chan->rcu);
 	}
 
 	mutex_lock(&dma_list_mutex);
 	list_add_tail(&device->global_node, &dma_device_list);
 	mutex_unlock(&dma_list_mutex);
 
-	dma_chans_rebalance();
+	dma_clients_notify_available();
 
 	return 0;
 
@@ -428,26 +443,16 @@ static void dma_async_device_cleanup(struct kref *kref)
 void dma_async_device_unregister(struct dma_device *device)
 {
 	struct dma_chan *chan;
-	unsigned long flags;
 
 	mutex_lock(&dma_list_mutex);
 	list_del(&device->global_node);
 	mutex_unlock(&dma_list_mutex);
 
 	list_for_each_entry(chan, &device->channels, device_node) {
-		if (chan->client) {
-			spin_lock_irqsave(&chan->client->lock, flags);
-			list_del(&chan->client_node);
-			chan->client->chan_count--;
-			spin_unlock_irqrestore(&chan->client->lock, flags);
-			chan->client->event_callback(chan->client,
-			                             chan,
-			                             DMA_RESOURCE_REMOVED);
-			dma_client_chan_free(chan);
-		}
+		dma_clients_notify_removed(chan);
 		class_device_unregister(&chan->class_dev);
+		dma_chan_release(chan);
 	}
-	dma_chans_rebalance();
 
 	kref_put(&device->refcount, dma_async_device_cleanup);
 	wait_for_completion(&device->done);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 171044930282..81810b3042f1 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -72,7 +72,6 @@ static int enumerate_dma_channels(struct ioat_device *device)
 		INIT_LIST_HEAD(&ioat_chan->used_desc);
 		/* This should be made common somewhere in dmaengine.c */
 		ioat_chan->common.device = &device->common;
-		ioat_chan->common.client = NULL;
 		list_add_tail(&ioat_chan->common.device_node,
 		              &device->common.channels);
 	}
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index d3f69bb15fa0..d3726478031a 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -30,9 +30,6 @@
 
 #define IOAT_LOW_COMPLETION_MASK	0xffffffc0
 
-extern struct list_head dma_device_list;
-extern struct list_head dma_client_list;
-
 /**
  * struct ioat_device - internal representation of a IOAT device
  * @pdev: PCI-Express device
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3de1cf71031a..a3b6035b6c86 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -29,19 +29,31 @@
 #include <linux/dma-mapping.h>
 
 /**
- * enum dma_event - resource PNP/power managment events
+ * enum dma_state - resource PNP/power managment state
  * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
  * @DMA_RESOURCE_RESUME: DMA device returning to full power
- * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_AVAILABLE: DMA device available to the system
  * @DMA_RESOURCE_REMOVED: DMA device removed from the system
  */
-enum dma_event {
+enum dma_state {
 	DMA_RESOURCE_SUSPEND,
 	DMA_RESOURCE_RESUME,
-	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_AVAILABLE,
 	DMA_RESOURCE_REMOVED,
 };
 
+/**
+ * enum dma_state_client - state of the channel in the client
+ * @DMA_ACK: client would like to use, or was using this channel
+ * @DMA_DUP: client has already seen this channel, or is not using this channel
+ * @DMA_NAK: client does not want to see any more channels
+ */
+enum dma_state_client {
+	DMA_ACK,
+	DMA_DUP,
+	DMA_NAK,
+};
+
 /**
  * typedef dma_cookie_t - an opaque DMA cookie
  *
@@ -104,7 +116,6 @@ struct dma_chan_percpu {
 
 /**
  * struct dma_chan - devices supply DMA channels, clients use them
- * @client: ptr to the client user of this chan, will be %NULL when unused
  * @device: ptr to the dma device who supplies this channel, always !%NULL
  * @cookie: last cookie value returned to client
  * @chan_id: channel ID for sysfs
@@ -112,12 +123,10 @@ struct dma_chan_percpu {
  * @refcount: kref, used in "bigref" slow-mode
  * @slow_ref: indicates that the DMA channel is free
  * @rcu: the DMA channel's RCU head
- * @client_node: used to add this to the client chan list
  * @device_node: used to add this to the device chan list
  * @local: per-cpu pointer to a struct dma_chan_percpu
  */
 struct dma_chan {
-	struct dma_client *client;
 	struct dma_device *device;
 	dma_cookie_t cookie;
 
@@ -129,11 +138,11 @@ struct dma_chan {
 	int slow_ref;
 	struct rcu_head rcu;
 
-	struct list_head client_node;
 	struct list_head device_node;
 	struct dma_chan_percpu *local;
 };
 
+
 void dma_chan_cleanup(struct kref *kref);
 
 static inline void dma_chan_get(struct dma_chan *chan)
@@ -158,26 +167,31 @@ static inline void dma_chan_put(struct dma_chan *chan)
 
 /*
  * typedef dma_event_callback - function pointer to a DMA event callback
+ * For each channel added to the system this routine is called for each client.
+ * If the client would like to use the channel it returns '1' to signal (ack)
+ * the dmaengine core to take out a reference on the channel and its
+ * corresponding device.  A client must not 'ack' an available channel more
+ * than once.  When a channel is removed all clients are notified.  If a client
+ * is using the channel it must 'ack' the removal.  A client must not 'ack' a
+ * removed channel more than once.
+ * @client - 'this' pointer for the client context
+ * @chan - channel to be acted upon
+ * @state - available or removed
  */
-typedef void (*dma_event_callback) (struct dma_client *client,
-		struct dma_chan *chan, enum dma_event event);
+struct dma_client;
+typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_state state);
 
 /**
  * struct dma_client - info on the entity making use of DMA services
  * @event_callback: func ptr to call when something happens
- * @chan_count: number of chans allocated
- * @chans_desired: number of chans requested. Can be +/- chan_count
- * @lock: protects access to the channels list
- * @channels: the list of DMA channels allocated
+ * @cap_mask: only return channels that satisfy the requested capabilities
+ *  a value of zero corresponds to any capability
  * @global_node: list_head for global dma_client_list
  */
 struct dma_client {
 	dma_event_callback	event_callback;
-	unsigned int		chan_count;
-	unsigned int		chans_desired;
-
-	spinlock_t		lock;
-	struct list_head	channels;
+	dma_cap_mask_t		cap_mask;
 	struct list_head	global_node;
 };
 
@@ -285,10 +299,9 @@ struct dma_device {
 
 /* --- public DMA engine API --- */
 
-struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_register(struct dma_client *client);
 void dma_async_client_unregister(struct dma_client *client);
-void dma_async_client_chan_request(struct dma_client *client,
-		unsigned int number);
+void dma_async_client_chan_request(struct dma_client *client);
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
 	void *dest, void *src, size_t len);
 dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
@@ -299,7 +312,6 @@ dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
 void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 	struct dma_chan *chan);
 
-
 static inline void
 async_tx_ack(struct dma_async_tx_descriptor *tx)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index ee051bb398a0..835202fb34c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -151,9 +151,22 @@ static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
 static struct list_head ptype_all __read_mostly;	/* Taps */
 
 #ifdef CONFIG_NET_DMA
-static struct dma_client *net_dma_client;
-static unsigned int net_dma_count;
-static spinlock_t net_dma_event_lock;
+struct net_dma {
+	struct dma_client client;
+	spinlock_t lock;
+	cpumask_t channel_mask;
+	struct dma_chan *channels[NR_CPUS];
+};
+
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_state state);
+
+static struct net_dma net_dma = {
+	.client = {
+		.event_callback = netdev_dma_event,
+	},
+};
 #endif
 
 /*
@@ -2015,12 +2028,13 @@ out:
 	 * There may not be any more sk_buffs coming right now, so push
 	 * any pending DMA copies to hardware
 	 */
-	if (net_dma_client) {
-		struct dma_chan *chan;
-		rcu_read_lock();
-		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
-			dma_async_memcpy_issue_pending(chan);
-		rcu_read_unlock();
+	if (!cpus_empty(net_dma.channel_mask)) {
+		int chan_idx;
+		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
+			struct dma_chan *chan = net_dma.channels[chan_idx];
+			if (chan)
+				dma_async_memcpy_issue_pending(chan);
+		}
 	}
 #endif
 	return;
@@ -3563,12 +3577,13 @@ static int dev_cpu_callback(struct notifier_block *nfb,
  * This is called when the number of channels allocated to the net_dma_client
  * changes.  The net_dma_client tries to have one DMA channel per CPU.
  */
-static void net_dma_rebalance(void)
+
+static void net_dma_rebalance(struct net_dma *net_dma)
 {
-	unsigned int cpu, i, n;
+	unsigned int cpu, i, n, chan_idx;
 	struct dma_chan *chan;
 
-	if (net_dma_count == 0) {
+	if (cpus_empty(net_dma->channel_mask)) {
 		for_each_online_cpu(cpu)
 			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
 		return;
@@ -3577,10 +3592,12 @@ static void net_dma_rebalance(void)
 	i = 0;
 	cpu = first_cpu(cpu_online_map);
 
-	rcu_read_lock();
-	list_for_each_entry(chan, &net_dma_client->channels, client_node) {
-		n = ((num_online_cpus() / net_dma_count)
-		   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
+		chan = net_dma->channels[chan_idx];
+
+		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
+		   + (i < (num_online_cpus() %
+			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
 
 		while(n) {
 			per_cpu(softnet_data, cpu).net_dma = chan;
@@ -3589,7 +3606,6 @@ static void net_dma_rebalance(void)
 		}
 		i++;
 	}
-	rcu_read_unlock();
 }
 
 /**
@@ -3598,23 +3614,53 @@ static void net_dma_rebalance(void)
  * @chan: DMA channel for the event
  * @event: event type
  */
-static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
-	enum dma_event event)
-{
-	spin_lock(&net_dma_event_lock);
-	switch (event) {
-	case DMA_RESOURCE_ADDED:
-		net_dma_count++;
-		net_dma_rebalance();
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_state state)
+{
+	int i, found = 0, pos = -1;
+	struct net_dma *net_dma =
+		container_of(client, struct net_dma, client);
+	enum dma_state_client ack = DMA_DUP; /* default: take no action */
+
+	spin_lock(&net_dma->lock);
+	switch (state) {
+	case DMA_RESOURCE_AVAILABLE:
+		for (i = 0; i < NR_CPUS; i++)
+			if (net_dma->channels[i] == chan) {
+				found = 1;
+				break;
+			} else if (net_dma->channels[i] == NULL && pos < 0)
+				pos = i;
+
+		if (!found && pos >= 0) {
+			ack = DMA_ACK;
+			net_dma->channels[pos] = chan;
+			cpu_set(pos, net_dma->channel_mask);
+			net_dma_rebalance(net_dma);
+		}
 		break;
 	case DMA_RESOURCE_REMOVED:
-		net_dma_count--;
-		net_dma_rebalance();
+		for (i = 0; i < NR_CPUS; i++)
+			if (net_dma->channels[i] == chan) {
+				found = 1;
+				pos = i;
+				break;
+			}
+
+		if (found) {
+			ack = DMA_ACK;
+			cpu_clear(pos, net_dma->channel_mask);
+			net_dma->channels[i] = NULL;
+			net_dma_rebalance(net_dma);
+		}
 		break;
 	default:
 		break;
 	}
-	spin_unlock(&net_dma_event_lock);
+	spin_unlock(&net_dma->lock);
+
+	return ack;
 }
 
 /**
@@ -3622,12 +3668,10 @@ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
  */
 static int __init netdev_dma_register(void)
 {
-	spin_lock_init(&net_dma_event_lock);
-	net_dma_client = dma_async_client_register(netdev_dma_event);
-	if (net_dma_client == NULL)
-		return -ENOMEM;
-
-	dma_async_client_chan_request(net_dma_client, num_online_cpus());
+	spin_lock_init(&net_dma.lock);
+	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
+	dma_async_client_register(&net_dma.client);
+	dma_async_client_chan_request(&net_dma.client);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 685784aaf3cd0e3ff5e36c7ecf6f441cdbf57f73 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:42 -0700
Subject: xor: make 'xor_blocks' a library routine for use with async_tx

The async_tx api tries to use a dma engine for an operation, but will fall
back to an optimized software routine otherwise.  Xor support is
implemented using the raid5 xor routines.  For organizational purposes this
routine is moved to a common area.

The following fixes are also made:
* rename xor_block => xor_blocks, suggested by Adrian Bunk
* ensure that xor.o initializes before md.o in the built-in case
* checkpatch.pl fixes
* mark calibrate_xor_blocks __init, Adrian Bunk

Cc: Adrian Bunk <bunk@stusta.de>
Cc: NeilBrown <neilb@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/Kconfig           |   6 ++
 crypto/Makefile          |   6 ++
 crypto/xor.c             | 156 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/Kconfig       |   1 +
 drivers/md/Makefile      |   4 +-
 drivers/md/md.c          |   2 +-
 drivers/md/raid5.c       |  10 +--
 drivers/md/xor.c         | 154 ----------------------------------------------
 include/linux/raid/xor.h |   2 +-
 9 files changed, 178 insertions(+), 163 deletions(-)
 create mode 100644 crypto/xor.c
 delete mode 100644 drivers/md/xor.c

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4ca0ab3448d9..b749a1a46e22 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1,3 +1,9 @@
+#
+# Generic algorithms support
+#
+config XOR_BLOCKS
+	tristate
+
 #
 # Cryptographic API Configuration
 #
diff --git a/crypto/Makefile b/crypto/Makefile
index cce46a1c9dc7..68e934b4bee2 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -50,3 +50,9 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
 obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
 
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
+
+#
+# generic algorithms and the async_tx api
+#
+obj-$(CONFIG_XOR_BLOCKS) += xor.o
+
diff --git a/crypto/xor.c b/crypto/xor.c
new file mode 100644
index 000000000000..8281ac5e68a8
--- /dev/null
+++ b/crypto/xor.c
@@ -0,0 +1,156 @@
+/*
+ * xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
+ *
+ * Dispatch optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/raid/md.h>
+#include <linux/raid/xor.h>
+#include <asm/xor.h>
+
+/* The xor routines to use.  */
+static struct xor_block_template *active_template;
+
+void
+xor_blocks(unsigned int count, unsigned int bytes, void **ptr)
+{
+	unsigned long *p0, *p1, *p2, *p3, *p4;
+
+	p0 = (unsigned long *) ptr[0];
+	p1 = (unsigned long *) ptr[1];
+	if (count == 2) {
+		active_template->do_2(bytes, p0, p1);
+		return;
+	}
+
+	p2 = (unsigned long *) ptr[2];
+	if (count == 3) {
+		active_template->do_3(bytes, p0, p1, p2);
+		return;
+	}
+
+	p3 = (unsigned long *) ptr[3];
+	if (count == 4) {
+		active_template->do_4(bytes, p0, p1, p2, p3);
+		return;
+	}
+
+	p4 = (unsigned long *) ptr[4];
+	active_template->do_5(bytes, p0, p1, p2, p3, p4);
+}
+EXPORT_SYMBOL(xor_blocks);
+
+/* Set of all registered templates.  */
+static struct xor_block_template *template_list;
+
+#define BENCH_SIZE (PAGE_SIZE)
+
+static void
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+{
+	int speed;
+	unsigned long now;
+	int i, count, max;
+
+	tmpl->next = template_list;
+	template_list = tmpl;
+
+	/*
+	 * Count the number of XORs done during a whole jiffy, and use
+	 * this to calculate the speed of checksumming.  We use a 2-page
+	 * allocation to have guaranteed color L1-cache layout.
+	 */
+	max = 0;
+	for (i = 0; i < 5; i++) {
+		now = jiffies;
+		count = 0;
+		while (jiffies == now) {
+			mb(); /* prevent loop optimzation */
+			tmpl->do_2(BENCH_SIZE, b1, b2);
+			mb();
+			count++;
+			mb();
+		}
+		if (count > max)
+			max = count;
+	}
+
+	speed = max * (HZ * BENCH_SIZE / 1024);
+	tmpl->speed = speed;
+
+	printk(KERN_INFO "   %-10s: %5d.%03d MB/sec\n", tmpl->name,
+	       speed / 1000, speed % 1000);
+}
+
+static int __init
+calibrate_xor_blocks(void)
+{
+	void *b1, *b2;
+	struct xor_block_template *f, *fastest;
+
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
+	if (!b1) {
+		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+
+	/*
+	 * If this arch/cpu has a short-circuited selection, don't loop through
+	 * all the possible functions, just test the best one
+	 */
+
+	fastest = NULL;
+
+#ifdef XOR_SELECT_TEMPLATE
+		fastest = XOR_SELECT_TEMPLATE(fastest);
+#endif
+
+#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
+
+	if (fastest) {
+		printk(KERN_INFO "xor: automatically using best "
+			"checksumming function: %s\n",
+			fastest->name);
+		xor_speed(fastest);
+	} else {
+		printk(KERN_INFO "xor: measuring checksumming speed\n");
+		XOR_TRY_TEMPLATES;
+		fastest = template_list;
+		for (f = fastest; f; f = f->next)
+			if (f->speed > fastest->speed)
+				fastest = f;
+	}
+
+	printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n",
+	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+
+#undef xor_speed
+
+	free_pages((unsigned long)b1, 2);
+
+	active_template = fastest;
+	return 0;
+}
+
+static __exit void xor_exit(void) { }
+
+MODULE_LICENSE("GPL");
+
+/* when built-in xor.o must initialize before drivers/md/md.o */
+core_initcall(calibrate_xor_blocks);
+module_exit(xor_exit);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7df934d69134..24d93d02a1f3 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,6 +109,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select XOR_BLOCKS
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 38754084eac7..71eb45f74171 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
 hostprogs-y	:= mktables
 
 # Note: link order is important.  All raid personalities
-# and xor.o must come before md.o, as they each initialise 
+# and must come before md.o, as they each initialise 
 # themselves, and md.o may use the personalities when it 
 # auto-initialised.
 
@@ -25,7 +25,7 @@ obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID456)	+= raid456.o xor.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c54f3c1cca7..33beaa7da085 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5814,7 +5814,7 @@ static __exit void md_exit(void)
 	}
 }
 
-module_init(md_init)
+subsys_initcall(md_init);
 module_exit(md_exit)
 
 static int get_ro(char *buffer, struct kernel_param *kp)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 061375ee6592..5adbe0b22684 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -918,7 +918,7 @@ static void copy_data(int frombio, struct bio *bio,
 
 #define check_xor() 	do { 						\
 			   if (count == MAX_XOR_BLOCKS) {		\
-				xor_block(count, STRIPE_SIZE, ptr);	\
+				xor_blocks(count, STRIPE_SIZE, ptr);	\
 				count = 1;				\
 			   }						\
 			} while(0)
@@ -949,7 +949,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 		check_xor();
 	}
 	if (count != 1)
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 }
 
@@ -1004,7 +1004,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 		break;
 	}
 	if (count>1) {
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 		count = 1;
 	}
 	
@@ -1038,7 +1038,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 			}
 	}
 	if (count != 1)
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 	
 	if (method != CHECK_PARITY) {
 		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
@@ -1160,7 +1160,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 			check_xor();
 		}
 		if (count != 1)
-			xor_block(count, STRIPE_SIZE, ptr);
+			xor_blocks(count, STRIPE_SIZE, ptr);
 		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
deleted file mode 100644
index 324897c4be4e..000000000000
--- a/drivers/md/xor.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * xor.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1996, 1997, 1998, 1999, 2000,
- * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
- *
- * Dispatch optimized RAID-5 checksumming functions.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define BH_TRACE 0
-#include <linux/module.h>
-#include <linux/raid/md.h>
-#include <linux/raid/xor.h>
-#include <asm/xor.h>
-
-/* The xor routines to use.  */
-static struct xor_block_template *active_template;
-
-void
-xor_block(unsigned int count, unsigned int bytes, void **ptr)
-{
-	unsigned long *p0, *p1, *p2, *p3, *p4;
-
-	p0 = (unsigned long *) ptr[0];
-	p1 = (unsigned long *) ptr[1];
-	if (count == 2) {
-		active_template->do_2(bytes, p0, p1);
-		return;
-	}
-
-	p2 = (unsigned long *) ptr[2];
-	if (count == 3) {
-		active_template->do_3(bytes, p0, p1, p2);
-		return;
-	}
-
-	p3 = (unsigned long *) ptr[3];
-	if (count == 4) {
-		active_template->do_4(bytes, p0, p1, p2, p3);
-		return;
-	}
-
-	p4 = (unsigned long *) ptr[4];
-	active_template->do_5(bytes, p0, p1, p2, p3, p4);
-}
-
-/* Set of all registered templates.  */
-static struct xor_block_template *template_list;
-
-#define BENCH_SIZE (PAGE_SIZE)
-
-static void
-do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
-{
-	int speed;
-	unsigned long now;
-	int i, count, max;
-
-	tmpl->next = template_list;
-	template_list = tmpl;
-
-	/*
-	 * Count the number of XORs done during a whole jiffy, and use
-	 * this to calculate the speed of checksumming.  We use a 2-page
-	 * allocation to have guaranteed color L1-cache layout.
-	 */
-	max = 0;
-	for (i = 0; i < 5; i++) {
-		now = jiffies;
-		count = 0;
-		while (jiffies == now) {
-			mb();
-			tmpl->do_2(BENCH_SIZE, b1, b2);
-			mb();
-			count++;
-			mb();
-		}
-		if (count > max)
-			max = count;
-	}
-
-	speed = max * (HZ * BENCH_SIZE / 1024);
-	tmpl->speed = speed;
-
-	printk("   %-10s: %5d.%03d MB/sec\n", tmpl->name,
-	       speed / 1000, speed % 1000);
-}
-
-static int
-calibrate_xor_block(void)
-{
-	void *b1, *b2;
-	struct xor_block_template *f, *fastest;
-
-	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
-	if (! b1) {
-		printk("raid5: Yikes!  No memory available.\n");
-		return -ENOMEM;
-	}
-	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
-
-	/*
-	 * If this arch/cpu has a short-circuited selection, don't loop through all
-	 * the possible functions, just test the best one
-	 */
-
-	fastest = NULL;
-
-#ifdef XOR_SELECT_TEMPLATE
-		fastest = XOR_SELECT_TEMPLATE(fastest);
-#endif
-
-#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
-
-	if (fastest) {
-		printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
-			fastest->name);
-		xor_speed(fastest);
-	} else {
-		printk(KERN_INFO "raid5: measuring checksumming speed\n");
-		XOR_TRY_TEMPLATES;
-		fastest = template_list;
-		for (f = fastest; f; f = f->next)
-			if (f->speed > fastest->speed)
-				fastest = f;
-	}
-
-	printk("raid5: using function: %s (%d.%03d MB/sec)\n",
-	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
-
-#undef xor_speed
-
-	free_pages((unsigned long)b1, 2);
-
-	active_template = fastest;
-	return 0;
-}
-
-static __exit void xor_exit(void) { }
-
-EXPORT_SYMBOL(xor_block);
-MODULE_LICENSE("GPL");
-
-module_init(calibrate_xor_block);
-module_exit(xor_exit);
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index f0d67cbdea40..7d6c20b654fa 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -5,7 +5,7 @@
 
 #define MAX_XOR_BLOCKS 5
 
-extern void xor_block(unsigned int count, unsigned int bytes, void **ptr);
+extern void xor_blocks(unsigned int count, unsigned int bytes, void **ptr);
 
 struct xor_block_template {
         struct xor_block_template *next;
-- 
cgit v1.2.3


From 9bc89cd82d6f88fb0ca39b30445c329a430fd66b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 11:10:44 -0700
Subject: async_tx: add the async_tx api

The async_tx api provides methods for describing a chain of asynchronous
bulk memory transfers/transforms with support for inter-transactional
dependencies.  It is implemented as a dmaengine client that smooths over
the details of different hardware offload engine implementations.  Code
that is written to the api can optimize for asynchronous operation and the
api will fit the chain of operations to the available offload resources.

	I imagine that any piece of ADMA hardware would register with the
	'async_*' subsystem, and a call to async_X would be routed as
	appropriate, or be run in-line. - Neil Brown

async_tx exploits the capabilities of struct dma_async_tx_descriptor to
provide an api of the following general format:

struct dma_async_tx_descriptor *
async_<operation>(..., struct dma_async_tx_descriptor *depend_tx,
			dma_async_tx_callback cb_fn, void *cb_param)
{
	struct dma_chan *chan = async_tx_find_channel(depend_tx, <operation>);
	struct dma_device *device = chan ? chan->device : NULL;
	int int_en = cb_fn ? 1 : 0;
	struct dma_async_tx_descriptor *tx = device ?
		device->device_prep_dma_<operation>(chan, len, int_en) : NULL;

	if (tx) { /* run <operation> asynchronously */
		...
		tx->tx_set_dest(addr, tx, index);
		...
		tx->tx_set_src(addr, tx, index);
		...
		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
	} else { /* run <operation> synchronously */
		...
		<operation>
		...
		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
	}

	return tx;
}

async_tx_find_channel() returns a capable channel from its pool.  The
channel pool is organized as a per-cpu array of channel pointers.  The
async_tx_rebalance() routine is tasked with managing these arrays.  In the
uniprocessor case async_tx_rebalance() tries to spread responsibility
evenly over channels of similar capabilities.  For example if there are two
copy+xor channels, one will handle copy operations and the other will
handle xor.  In the SMP case async_tx_rebalance() attempts to spread the
operations evenly over the cpus, e.g. cpu0 gets copy channel0 and xor
channel0 while cpu1 gets copy channel 1 and xor channel 1.  When a
dependency is specified async_tx_find_channel defaults to keeping the
operation on the same channel.  A xor->copy->xor chain will stay on one
channel if it supports both operation types, otherwise the transaction will
transition between a copy and a xor resource.

Currently the raid5 implementation in the MD raid456 driver has been
converted to the async_tx api.  A driver for the offload engines on the
Intel Xscale series of I/O processors, iop-adma, is provided in a later
commit.  With the iop-adma driver and async_tx, raid456 is able to offload
copy, xor, and xor-zero-sum operations to hardware engines.

On iop342 tiobench showed higher throughput for sequential writes (20 - 30%
improvement) and sequential reads to a degraded array (40 - 55%
improvement).  For the other cases performance was roughly equal, +/- a few
percentage points.  On a x86-smp platform the performance of the async_tx
implementation (in synchronous mode) was also +/- a few percentage points
of the original implementation.  According to 'top' on iop342 CPU
utilization drops from ~50% to ~15% during a 'resync' while the speed
according to /proc/mdstat doubles from ~25 MB/s to ~50 MB/s.

The tiobench command line used for testing was: tiobench --size 2048
--block 4096 --block 131072 --dir /mnt/raid --numruns 5
* iop342 had 1GB of memory available

Details:
* if CONFIG_DMA_ENGINE=n the asynchronous path is compiled away by making
  async_tx_find_channel a static inline routine that always returns NULL
* when a callback is specified for a given transaction an interrupt will
  fire at operation completion time and the callback will occur in a
  tasklet.  if the the channel does not support interrupts then a live
  polling wait will be performed
* the api is written as a dmaengine client that requests all available
  channels
* In support of dependencies the api implicitly schedules channel-switch
  interrupts.  The interrupt triggers the cleanup tasklet which causes
  pending operations to be scheduled on the next channel
* Xor engines treat an xor destination address differently than a software
  xor routine.  To the software routine the destination address is an implied
  source, whereas engines treat it as a write-only destination.  This patch
  modifies the xor_blocks routine to take a an explicit destination address
  to mirror the hardware.

Changelog:
* fixed a leftover debug print
* don't allow callbacks in async_interrupt_cond
* fixed xor_block changes
* fixed usage of ASYNC_TX_XOR_DROP_DEST
* drop dma mapping methods, suggested by Chris Leech
* printk warning fixups from Andrew Morton
* don't use inline in C files, Adrian Bunk
* select the API when MD is enabled
* BUG_ON xor source counts <= 1
* implicitly handle hardware concerns like channel switching and
  interrupts, Neil Brown
* remove the per operation type list, and distribute operation capabilities
  evenly amongst the available channels
* simplify async_tx_find_channel to optimize the fast path
* introduce the channel_table_initialized flag to prevent early calls to
  the api
* reorganize the code to mimic crypto
* include mm.h as not all archs include it in dma-mapping.h
* make the Kconfig options non-user visible, Adrian Bunk
* move async_tx under crypto since it is meant as 'core' functionality, and
  the two may share algorithms in the future
* move large inline functions into c files
* checkpatch.pl fixes
* gpl v2 only correction

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 crypto/Kconfig                 |   6 +-
 crypto/Makefile                |   2 +-
 crypto/async_tx/Kconfig        |  16 ++
 crypto/async_tx/Makefile       |   4 +
 crypto/async_tx/async_memcpy.c | 131 +++++++++++
 crypto/async_tx/async_memset.c | 109 +++++++++
 crypto/async_tx/async_tx.c     | 497 +++++++++++++++++++++++++++++++++++++++++
 crypto/async_tx/async_xor.c    | 327 +++++++++++++++++++++++++++
 crypto/xor.c                   |  29 ++-
 drivers/dma/Kconfig            |   5 +-
 drivers/md/Kconfig             |   3 +-
 drivers/md/raid5.c             |  54 ++---
 include/linux/async_tx.h       | 156 +++++++++++++
 include/linux/raid/xor.h       |   5 +-
 14 files changed, 1294 insertions(+), 50 deletions(-)
 create mode 100644 crypto/async_tx/Kconfig
 create mode 100644 crypto/async_tx/Makefile
 create mode 100644 crypto/async_tx/async_memcpy.c
 create mode 100644 crypto/async_tx/async_memset.c
 create mode 100644 crypto/async_tx/async_tx.c
 create mode 100644 crypto/async_tx/async_xor.c
 create mode 100644 include/linux/async_tx.h

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b749a1a46e22..07090e9f9bcf 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -5,9 +5,13 @@ config XOR_BLOCKS
 	tristate
 
 #
-# Cryptographic API Configuration
+# async_tx api: hardware offloaded memory transfer/transform support
 #
+source "crypto/async_tx/Kconfig"
 
+#
+# Cryptographic API Configuration
+#
 menu "Cryptographic options"
 
 config CRYPTO
diff --git a/crypto/Makefile b/crypto/Makefile
index 68e934b4bee2..0cf17f1ea151 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -55,4 +55,4 @@ obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
 # generic algorithms and the async_tx api
 #
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
-
+obj-$(CONFIG_ASYNC_CORE) += async_tx/
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
new file mode 100644
index 000000000000..d8fb39145986
--- /dev/null
+++ b/crypto/async_tx/Kconfig
@@ -0,0 +1,16 @@
+config ASYNC_CORE
+	tristate
+
+config ASYNC_MEMCPY
+	tristate
+	select ASYNC_CORE
+
+config ASYNC_XOR
+	tristate
+	select ASYNC_CORE
+	select XOR_BLOCKS
+
+config ASYNC_MEMSET
+	tristate
+	select ASYNC_CORE
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
new file mode 100644
index 000000000000..27baa7d52fbc
--- /dev/null
+++ b/crypto/async_tx/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_ASYNC_CORE) += async_tx.o
+obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
+obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
+obj-$(CONFIG_ASYNC_XOR) += async_xor.o
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
new file mode 100644
index 000000000000..a973f4ef897d
--- /dev/null
+++ b/crypto/async_tx/async_memcpy.c
@@ -0,0 +1,131 @@
+/*
+ * copy offload engine support
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/async_tx.h>
+
+/**
+ * async_memcpy - attempt to copy memory with a dma engine.
+ * @dest: destination page
+ * @src: src page
+ * @offset: offset in pages to start transaction
+ * @len: length in bytes
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
+ *	ASYNC_TX_KMAP_SRC, ASYNC_TX_KMAP_DST
+ * @depend_tx: memcpy depends on the result of this transaction
+ * @cb_fn: function to call when the memcpy completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
+	unsigned int src_offset, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_memcpy(chan, len,
+		int_en) : NULL;
+
+	if (tx) { /* run the memcpy asynchronously */
+		dma_addr_t addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_FROM_DEVICE;
+
+		addr = dma_map_page(device->dev, dest, dest_offset, len, dir);
+		tx->tx_set_dest(addr, tx, 0);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_TO_DEVICE;
+
+		addr = dma_map_page(device->dev, src, src_offset, len, dir);
+		tx->tx_set_src(addr, tx, 0);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else { /* run the memcpy synchronously */
+		void *dest_buf, *src_buf;
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		if (flags & ASYNC_TX_KMAP_DST)
+			dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
+		else
+			dest_buf = page_address(dest) + dest_offset;
+
+		if (flags & ASYNC_TX_KMAP_SRC)
+			src_buf = kmap_atomic(src, KM_USER0) + src_offset;
+		else
+			src_buf = page_address(src) + src_offset;
+
+		memcpy(dest_buf, src_buf, len);
+
+		if (flags & ASYNC_TX_KMAP_DST)
+			kunmap_atomic(dest_buf, KM_USER0);
+
+		if (flags & ASYNC_TX_KMAP_SRC)
+			kunmap_atomic(src_buf, KM_USER0);
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_memcpy);
+
+static int __init async_memcpy_init(void)
+{
+	return 0;
+}
+
+static void __exit async_memcpy_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_memcpy_init);
+module_exit(async_memcpy_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous memcpy api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
new file mode 100644
index 000000000000..66ef6351202e
--- /dev/null
+++ b/crypto/async_tx/async_memset.c
@@ -0,0 +1,109 @@
+/*
+ * memory fill offload engine support
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/async_tx.h>
+
+/**
+ * async_memset - attempt to fill memory with a dma engine.
+ * @dest: destination page
+ * @val: fill value
+ * @offset: offset in pages to start transaction
+ * @len: length in bytes
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: memset depends on the result of this transaction
+ * @cb_fn: function to call when the memcpy completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_memset(struct page *dest, int val, unsigned int offset,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_memset(chan, val, len,
+			int_en) : NULL;
+
+	if (tx) { /* run the memset asynchronously */
+		dma_addr_t dma_addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_FROM_DEVICE;
+
+		dma_addr = dma_map_page(device->dev, dest, offset, len, dir);
+		tx->tx_set_dest(dma_addr, tx, 0);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else { /* run the memset synchronously */
+		void *dest_buf;
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		dest_buf = (void *) (((char *) page_address(dest)) + offset);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		memset(dest_buf, val, len);
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_memset);
+
+static int __init async_memset_init(void)
+{
+	return 0;
+}
+
+static void __exit async_memset_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_memset_init);
+module_exit(async_memset_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous memset api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
new file mode 100644
index 000000000000..035007145e78
--- /dev/null
+++ b/crypto/async_tx/async_tx.c
@@ -0,0 +1,497 @@
+/*
+ * core routines for the asynchronous memory transfer/transform api
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *	Dan Williams <dan.j.williams@intel.com>
+ *
+ *	with architecture considerations by:
+ *	Neil Brown <neilb@suse.de>
+ *	Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/async_tx.h>
+
+#ifdef CONFIG_DMA_ENGINE
+static enum dma_state_client
+dma_channel_add_remove(struct dma_client *client,
+	struct dma_chan *chan, enum dma_state state);
+
+static struct dma_client async_tx_dma = {
+	.event_callback = dma_channel_add_remove,
+	/* .cap_mask == 0 defaults to all channels */
+};
+
+/**
+ * dma_cap_mask_all - enable iteration over all operation types
+ */
+static dma_cap_mask_t dma_cap_mask_all;
+
+/**
+ * chan_ref_percpu - tracks channel allocations per core/opertion
+ */
+struct chan_ref_percpu {
+	struct dma_chan_ref *ref;
+};
+
+static int channel_table_initialized;
+static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END];
+
+/**
+ * async_tx_lock - protect modification of async_tx_master_list and serialize
+ *	rebalance operations
+ */
+static spinlock_t async_tx_lock;
+
+static struct list_head
+async_tx_master_list = LIST_HEAD_INIT(async_tx_master_list);
+
+/* async_tx_issue_pending_all - start all transactions on all channels */
+void async_tx_issue_pending_all(void)
+{
+	struct dma_chan_ref *ref;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+		ref->chan->device->device_issue_pending(ref->chan);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(async_tx_issue_pending_all);
+
+/* dma_wait_for_async_tx - spin wait for a transcation to complete
+ * @tx: transaction to wait on
+ */
+enum dma_status
+dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
+{
+	enum dma_status status;
+	struct dma_async_tx_descriptor *iter;
+
+	if (!tx)
+		return DMA_SUCCESS;
+
+	/* poll through the dependency chain, return when tx is complete */
+	do {
+		iter = tx;
+		while (iter->cookie == -EBUSY)
+			iter = iter->parent;
+
+		status = dma_sync_wait(iter->chan, iter->cookie);
+	} while (status == DMA_IN_PROGRESS || (iter != tx));
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
+
+/* async_tx_run_dependencies - helper routine for dma drivers to process
+ *	(start) dependent operations on their target channel
+ * @tx: transaction with dependencies
+ */
+void
+async_tx_run_dependencies(struct dma_async_tx_descriptor *tx)
+{
+	struct dma_async_tx_descriptor *dep_tx, *_dep_tx;
+	struct dma_device *dev;
+	struct dma_chan *chan;
+
+	list_for_each_entry_safe(dep_tx, _dep_tx, &tx->depend_list,
+		depend_node) {
+		chan = dep_tx->chan;
+		dev = chan->device;
+		/* we can't depend on ourselves */
+		BUG_ON(chan == tx->chan);
+		list_del(&dep_tx->depend_node);
+		tx->tx_submit(dep_tx);
+
+		/* we need to poke the engine as client code does not
+		 * know about dependency submission events
+		 */
+		dev->device_issue_pending(chan);
+	}
+}
+EXPORT_SYMBOL_GPL(async_tx_run_dependencies);
+
+static void
+free_dma_chan_ref(struct rcu_head *rcu)
+{
+	struct dma_chan_ref *ref;
+	ref = container_of(rcu, struct dma_chan_ref, rcu);
+	kfree(ref);
+}
+
+static void
+init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan)
+{
+	INIT_LIST_HEAD(&ref->node);
+	INIT_RCU_HEAD(&ref->rcu);
+	ref->chan = chan;
+	atomic_set(&ref->count, 0);
+}
+
+/**
+ * get_chan_ref_by_cap - returns the nth channel of the given capability
+ * 	defaults to returning the channel with the desired capability and the
+ * 	lowest reference count if the index can not be satisfied
+ * @cap: capability to match
+ * @index: nth channel desired, passing -1 has the effect of forcing the
+ *  default return value
+ */
+static struct dma_chan_ref *
+get_chan_ref_by_cap(enum dma_transaction_type cap, int index)
+{
+	struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+		if (dma_has_cap(cap, ref->chan->device->cap_mask)) {
+			if (!min_ref)
+				min_ref = ref;
+			else if (atomic_read(&ref->count) <
+				atomic_read(&min_ref->count))
+				min_ref = ref;
+
+			if (index-- == 0) {
+				ret_ref = ref;
+				break;
+			}
+		}
+	rcu_read_unlock();
+
+	if (!ret_ref)
+		ret_ref = min_ref;
+
+	if (ret_ref)
+		atomic_inc(&ret_ref->count);
+
+	return ret_ref;
+}
+
+/**
+ * async_tx_rebalance - redistribute the available channels, optimize
+ * for cpu isolation in the SMP case, and opertaion isolation in the
+ * uniprocessor case
+ */
+static void async_tx_rebalance(void)
+{
+	int cpu, cap, cpu_idx = 0;
+	unsigned long flags;
+
+	if (!channel_table_initialized)
+		return;
+
+	spin_lock_irqsave(&async_tx_lock, flags);
+
+	/* undo the last distribution */
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		for_each_possible_cpu(cpu) {
+			struct dma_chan_ref *ref =
+				per_cpu_ptr(channel_table[cap], cpu)->ref;
+			if (ref) {
+				atomic_set(&ref->count, 0);
+				per_cpu_ptr(channel_table[cap], cpu)->ref =
+									NULL;
+			}
+		}
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		for_each_online_cpu(cpu) {
+			struct dma_chan_ref *new;
+			if (NR_CPUS > 1)
+				new = get_chan_ref_by_cap(cap, cpu_idx++);
+			else
+				new = get_chan_ref_by_cap(cap, -1);
+
+			per_cpu_ptr(channel_table[cap], cpu)->ref = new;
+		}
+
+	spin_unlock_irqrestore(&async_tx_lock, flags);
+}
+
+static enum dma_state_client
+dma_channel_add_remove(struct dma_client *client,
+	struct dma_chan *chan, enum dma_state state)
+{
+	unsigned long found, flags;
+	struct dma_chan_ref *master_ref, *ref;
+	enum dma_state_client ack = DMA_DUP; /* default: take no action */
+
+	switch (state) {
+	case DMA_RESOURCE_AVAILABLE:
+		found = 0;
+		rcu_read_lock();
+		list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+			if (ref->chan == chan) {
+				found = 1;
+				break;
+			}
+		rcu_read_unlock();
+
+		pr_debug("async_tx: dma resource available [%s]\n",
+			found ? "old" : "new");
+
+		if (!found)
+			ack = DMA_ACK;
+		else
+			break;
+
+		/* add the channel to the generic management list */
+		master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL);
+		if (master_ref) {
+			/* keep a reference until async_tx is unloaded */
+			dma_chan_get(chan);
+			init_dma_chan_ref(master_ref, chan);
+			spin_lock_irqsave(&async_tx_lock, flags);
+			list_add_tail_rcu(&master_ref->node,
+				&async_tx_master_list);
+			spin_unlock_irqrestore(&async_tx_lock,
+				flags);
+		} else {
+			printk(KERN_WARNING "async_tx: unable to create"
+				" new master entry in response to"
+				" a DMA_RESOURCE_ADDED event"
+				" (-ENOMEM)\n");
+			return 0;
+		}
+
+		async_tx_rebalance();
+		break;
+	case DMA_RESOURCE_REMOVED:
+		found = 0;
+		spin_lock_irqsave(&async_tx_lock, flags);
+		list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+			if (ref->chan == chan) {
+				/* permit backing devices to go away */
+				dma_chan_put(ref->chan);
+				list_del_rcu(&ref->node);
+				call_rcu(&ref->rcu, free_dma_chan_ref);
+				found = 1;
+				break;
+			}
+		spin_unlock_irqrestore(&async_tx_lock, flags);
+
+		pr_debug("async_tx: dma resource removed [%s]\n",
+			found ? "ours" : "not ours");
+
+		if (found)
+			ack = DMA_ACK;
+		else
+			break;
+
+		async_tx_rebalance();
+		break;
+	case DMA_RESOURCE_SUSPEND:
+	case DMA_RESOURCE_RESUME:
+		printk(KERN_WARNING "async_tx: does not support dma channel"
+			" suspend/resume\n");
+		break;
+	default:
+		BUG();
+	}
+
+	return ack;
+}
+
+static int __init
+async_tx_init(void)
+{
+	enum dma_transaction_type cap;
+
+	spin_lock_init(&async_tx_lock);
+	bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END);
+
+	/* an interrupt will never be an explicit operation type.
+	 * clearing this bit prevents allocation to a slot in 'channel_table'
+	 */
+	clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits);
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all) {
+		channel_table[cap] = alloc_percpu(struct chan_ref_percpu);
+		if (!channel_table[cap])
+			goto err;
+	}
+
+	channel_table_initialized = 1;
+	dma_async_client_register(&async_tx_dma);
+	dma_async_client_chan_request(&async_tx_dma);
+
+	printk(KERN_INFO "async_tx: api initialized (async)\n");
+
+	return 0;
+err:
+	printk(KERN_ERR "async_tx: initialization failure\n");
+
+	while (--cap >= 0)
+		free_percpu(channel_table[cap]);
+
+	return 1;
+}
+
+static void __exit async_tx_exit(void)
+{
+	enum dma_transaction_type cap;
+
+	channel_table_initialized = 0;
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		if (channel_table[cap])
+			free_percpu(channel_table[cap]);
+
+	dma_async_client_unregister(&async_tx_dma);
+}
+
+/**
+ * async_tx_find_channel - find a channel to carry out the operation or let
+ *	the transaction execute synchronously
+ * @depend_tx: transaction dependency
+ * @tx_type: transaction type
+ */
+struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type)
+{
+	/* see if we can keep the chain on one channel */
+	if (depend_tx &&
+		dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
+		return depend_tx->chan;
+	else if (likely(channel_table_initialized)) {
+		struct dma_chan_ref *ref;
+		int cpu = get_cpu();
+		ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref;
+		put_cpu();
+		return ref ? ref->chan : NULL;
+	} else
+		return NULL;
+}
+EXPORT_SYMBOL_GPL(async_tx_find_channel);
+#else
+static int __init async_tx_init(void)
+{
+	printk(KERN_INFO "async_tx: api initialized (sync-only)\n");
+	return 0;
+}
+
+static void __exit async_tx_exit(void)
+{
+	do { } while (0);
+}
+#endif
+
+void
+async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	tx->callback = cb_fn;
+	tx->callback_param = cb_param;
+
+	/* set this new tx to run after depend_tx if:
+	 * 1/ a dependency exists (depend_tx is !NULL)
+	 * 2/ the tx can not be submitted to the current channel
+	 */
+	if (depend_tx && depend_tx->chan != chan) {
+		/* if ack is already set then we cannot be sure
+		 * we are referring to the correct operation
+		 */
+		BUG_ON(depend_tx->ack);
+
+		tx->parent = depend_tx;
+		spin_lock_bh(&depend_tx->lock);
+		list_add_tail(&tx->depend_node, &depend_tx->depend_list);
+		if (depend_tx->cookie == 0) {
+			struct dma_chan *dep_chan = depend_tx->chan;
+			struct dma_device *dep_dev = dep_chan->device;
+			dep_dev->device_dependency_added(dep_chan);
+		}
+		spin_unlock_bh(&depend_tx->lock);
+
+		/* schedule an interrupt to trigger the channel switch */
+		async_trigger_callback(ASYNC_TX_ACK, depend_tx, NULL, NULL);
+	} else {
+		tx->parent = NULL;
+		tx->tx_submit(tx);
+	}
+
+	if (flags & ASYNC_TX_ACK)
+		async_tx_ack(tx);
+
+	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+		async_tx_ack(depend_tx);
+}
+EXPORT_SYMBOL_GPL(async_tx_submit);
+
+/**
+ * async_trigger_callback - schedules the callback function to be run after
+ * any dependent operations have been completed.
+ * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: 'callback' requires the completion of this transaction
+ * @cb_fn: function to call after depend_tx completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_trigger_callback(enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan;
+	struct dma_device *device;
+	struct dma_async_tx_descriptor *tx;
+
+	if (depend_tx) {
+		chan = depend_tx->chan;
+		device = chan->device;
+
+		/* see if we can schedule an interrupt
+		 * otherwise poll for completion
+		 */
+		if (device && !dma_has_cap(DMA_INTERRUPT, device->cap_mask))
+			device = NULL;
+
+		tx = device ? device->device_prep_dma_interrupt(chan) : NULL;
+	} else
+		tx = NULL;
+
+	if (tx) {
+		pr_debug("%s: (async)\n", __FUNCTION__);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		pr_debug("%s: (sync)\n", __FUNCTION__);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_trigger_callback);
+
+module_init(async_tx_init);
+module_exit(async_tx_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
new file mode 100644
index 000000000000..2575f674dcd5
--- /dev/null
+++ b/crypto/async_tx/async_xor.c
@@ -0,0 +1,327 @@
+/*
+ * xor offload engine api
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/xor.h>
+#include <linux/async_tx.h>
+
+static void
+do_async_xor(struct dma_async_tx_descriptor *tx, struct dma_device *device,
+	struct dma_chan *chan, struct page *dest, struct page **src_list,
+	unsigned int offset, unsigned int src_cnt, size_t len,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	dma_addr_t dma_addr;
+	enum dma_data_direction dir;
+	int i;
+
+	pr_debug("%s: len: %zu\n", __FUNCTION__, len);
+
+	dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+		DMA_NONE : DMA_FROM_DEVICE;
+
+	dma_addr = dma_map_page(device->dev, dest, offset, len, dir);
+	tx->tx_set_dest(dma_addr, tx, 0);
+
+	dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+		DMA_NONE : DMA_TO_DEVICE;
+
+	for (i = 0; i < src_cnt; i++) {
+		dma_addr = dma_map_page(device->dev, src_list[i],
+			offset, len, dir);
+		tx->tx_set_src(dma_addr, tx, i);
+	}
+
+	async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+}
+
+static void
+do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	unsigned int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	void *_dest;
+	int i;
+
+	pr_debug("%s: len: %zu\n", __FUNCTION__, len);
+
+	/* reuse the 'src_list' array to convert to buffer pointers */
+	for (i = 0; i < src_cnt; i++)
+		src_list[i] = (struct page *)
+			(page_address(src_list[i]) + offset);
+
+	/* set destination address */
+	_dest = page_address(dest) + offset;
+
+	if (flags & ASYNC_TX_XOR_ZERO_DST)
+		memset(_dest, 0, len);
+
+	xor_blocks(src_cnt, len, _dest,
+		(void **) src_list);
+
+	async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+}
+
+/**
+ * async_xor - attempt to xor a set of blocks with a dma engine.
+ *	xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
+ *	flag must be set to not include dest data in the calculation.  The
+ *	assumption with dma eninges is that they only use the destination
+ *	buffer as a source when it is explicity specified in the source list.
+ * @dest: destination page
+ * @src_list: array of source pages (if the dest is also a source it must be
+ *	at index zero).  The contents of this array may be overwritten.
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST,
+ *	ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: xor depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+	dma_async_tx_callback _cb_fn;
+	void *_cb_param;
+	unsigned long local_flags;
+	int xor_src_cnt;
+	int i = 0, src_off = 0, int_en;
+
+	BUG_ON(src_cnt <= 1);
+
+	while (src_cnt) {
+		local_flags = flags;
+		if (device) { /* run the xor asynchronously */
+			xor_src_cnt = min(src_cnt, device->max_xor);
+			/* if we are submitting additional xors
+			 * only set the callback on the last transaction
+			 */
+			if (src_cnt > xor_src_cnt) {
+				local_flags &= ~ASYNC_TX_ACK;
+				_cb_fn = NULL;
+				_cb_param = NULL;
+			} else {
+				_cb_fn = cb_fn;
+				_cb_param = cb_param;
+			}
+
+			int_en = _cb_fn ? 1 : 0;
+
+			tx = device->device_prep_dma_xor(
+				chan, xor_src_cnt, len, int_en);
+
+			if (tx) {
+				do_async_xor(tx, device, chan, dest,
+				&src_list[src_off], offset, xor_src_cnt, len,
+				local_flags, depend_tx, _cb_fn,
+				_cb_param);
+			} else /* fall through */
+				goto xor_sync;
+		} else { /* run the xor synchronously */
+xor_sync:
+			/* in the sync case the dest is an implied source
+			 * (assumes the dest is at the src_off index)
+			 */
+			if (flags & ASYNC_TX_XOR_DROP_DST) {
+				src_cnt--;
+				src_off++;
+			}
+
+			/* process up to 'MAX_XOR_BLOCKS' sources */
+			xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+
+			/* if we are submitting additional xors
+			 * only set the callback on the last transaction
+			 */
+			if (src_cnt > xor_src_cnt) {
+				local_flags &= ~ASYNC_TX_ACK;
+				_cb_fn = NULL;
+				_cb_param = NULL;
+			} else {
+				_cb_fn = cb_fn;
+				_cb_param = cb_param;
+			}
+
+			/* wait for any prerequisite operations */
+			if (depend_tx) {
+				/* if ack is already set then we cannot be sure
+				 * we are referring to the correct operation
+				 */
+				BUG_ON(depend_tx->ack);
+				if (dma_wait_for_async_tx(depend_tx) ==
+					DMA_ERROR)
+					panic("%s: DMA_ERROR waiting for "
+						"depend_tx\n",
+						__FUNCTION__);
+			}
+
+			do_sync_xor(dest, &src_list[src_off], offset,
+				xor_src_cnt, len, local_flags, depend_tx,
+				_cb_fn, _cb_param);
+		}
+
+		/* the previous tx is hidden from the client,
+		 * so ack it
+		 */
+		if (i && depend_tx)
+			async_tx_ack(depend_tx);
+
+		depend_tx = tx;
+
+		if (src_cnt > xor_src_cnt) {
+			/* drop completed sources */
+			src_cnt -= xor_src_cnt;
+			src_off += xor_src_cnt;
+
+			/* unconditionally preserve the destination */
+			flags &= ~ASYNC_TX_XOR_ZERO_DST;
+
+			/* use the intermediate result a source, but remember
+			 * it's dropped, because it's implied, in the sync case
+			 */
+			src_list[--src_off] = dest;
+			src_cnt++;
+			flags |= ASYNC_TX_XOR_DROP_DST;
+		} else
+			src_cnt = 0;
+		i++;
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_xor);
+
+static int page_is_zero(struct page *p, unsigned int offset, size_t len)
+{
+	char *a = page_address(p) + offset;
+	return ((*(u32 *) a) == 0 &&
+		memcmp(a, a + 4, len - 4) == 0);
+}
+
+/**
+ * async_xor_zero_sum - attempt a xor parity check with a dma engine.
+ * @dest: destination page used if the xor is performed synchronously
+ * @src_list: array of source pages.  The dest page must be listed as a source
+ * 	at index zero.  The contents of this array may be overwritten.
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @result: 0 if sum == 0 else non-zero
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: xor depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_xor_zero_sum(struct page *dest, struct page **src_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *result, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_zero_sum(chan, src_cnt, len, result,
+			int_en) : NULL;
+	int i;
+
+	BUG_ON(src_cnt <= 1);
+
+	if (tx) {
+		dma_addr_t dma_addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_TO_DEVICE;
+
+		for (i = 0; i < src_cnt; i++) {
+			dma_addr = dma_map_page(device->dev, src_list[i],
+				offset, len, dir);
+			tx->tx_set_src(dma_addr, tx, i);
+		}
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		unsigned long xor_flags = flags;
+
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		xor_flags |= ASYNC_TX_XOR_DROP_DST;
+		xor_flags &= ~ASYNC_TX_ACK;
+
+		tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags,
+			depend_tx, NULL, NULL);
+
+		if (tx) {
+			if (dma_wait_for_async_tx(tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for tx\n",
+					__FUNCTION__);
+			async_tx_ack(tx);
+		}
+
+		*result = page_is_zero(dest, offset, len) ? 0 : 1;
+
+		tx = NULL;
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_xor_zero_sum);
+
+static int __init async_xor_init(void)
+{
+	return 0;
+}
+
+static void __exit async_xor_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_xor_init);
+module_exit(async_xor_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/xor.c b/crypto/xor.c
index 8281ac5e68a8..b2e6db075e49 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -26,31 +26,30 @@
 static struct xor_block_template *active_template;
 
 void
-xor_blocks(unsigned int count, unsigned int bytes, void **ptr)
+xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 {
-	unsigned long *p0, *p1, *p2, *p3, *p4;
+	unsigned long *p1, *p2, *p3, *p4;
 
-	p0 = (unsigned long *) ptr[0];
-	p1 = (unsigned long *) ptr[1];
-	if (count == 2) {
-		active_template->do_2(bytes, p0, p1);
+	p1 = (unsigned long *) srcs[0];
+	if (src_count == 1) {
+		active_template->do_2(bytes, dest, p1);
 		return;
 	}
 
-	p2 = (unsigned long *) ptr[2];
-	if (count == 3) {
-		active_template->do_3(bytes, p0, p1, p2);
+	p2 = (unsigned long *) srcs[1];
+	if (src_count == 2) {
+		active_template->do_3(bytes, dest, p1, p2);
 		return;
 	}
 
-	p3 = (unsigned long *) ptr[3];
-	if (count == 4) {
-		active_template->do_4(bytes, p0, p1, p2, p3);
+	p3 = (unsigned long *) srcs[2];
+	if (src_count == 3) {
+		active_template->do_4(bytes, dest, p1, p2, p3);
 		return;
 	}
 
-	p4 = (unsigned long *) ptr[4];
-	active_template->do_5(bytes, p0, p1, p2, p3, p4);
+	p4 = (unsigned long *) srcs[3];
+	active_template->do_5(bytes, dest, p1, p2, p3, p4);
 }
 EXPORT_SYMBOL(xor_blocks);
 
@@ -128,7 +127,7 @@ calibrate_xor_blocks(void)
 			fastest->name);
 		xor_speed(fastest);
 	} else {
-		printk(KERN_INFO "xor: measuring checksumming speed\n");
+		printk(KERN_INFO "xor: measuring software checksum speed\n");
 		XOR_TRY_TEMPLATES;
 		fastest = template_list;
 		for (f = fastest; f; f = f->next)
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 72be6c63edfc..492aa080562d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -8,8 +8,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
 	bool "Support for DMA engines"
 	---help---
-	  DMA engines offload copy operations from the CPU to dedicated
-	  hardware, allowing the copies to happen asynchronously.
+          DMA engines offload bulk memory operations from the CPU to dedicated
+          hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -31,5 +31,4 @@ config INTEL_IOATDMA
 	default m
 	---help---
 	  Enable support for the Intel(R) I/OAT DMA engine.
-
 endmenu
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 24d93d02a1f3..bfd9b9c6252c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,7 +109,8 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
-	select XOR_BLOCKS
+	select ASYNC_MEMCPY
+	select ASYNC_XOR
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5adbe0b22684..4f51dfa8e487 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -916,25 +916,25 @@ static void copy_data(int frombio, struct bio *bio,
 	}
 }
 
-#define check_xor() 	do { 						\
-			   if (count == MAX_XOR_BLOCKS) {		\
-				xor_blocks(count, STRIPE_SIZE, ptr);	\
-				count = 1;				\
-			   }						\
+#define check_xor()	do {						  \
+				if (count == MAX_XOR_BLOCKS) {		  \
+				xor_blocks(count, STRIPE_SIZE, dest, ptr);\
+				count = 0;				  \
+			   }						  \
 			} while(0)
 
 
 static void compute_block(struct stripe_head *sh, int dd_idx)
 {
 	int i, count, disks = sh->disks;
-	void *ptr[MAX_XOR_BLOCKS], *p;
+	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
 
 	PRINTK("compute_block, stripe %llu, idx %d\n", 
 		(unsigned long long)sh->sector, dd_idx);
 
-	ptr[0] = page_address(sh->dev[dd_idx].page);
-	memset(ptr[0], 0, STRIPE_SIZE);
-	count = 1;
+	dest = page_address(sh->dev[dd_idx].page);
+	memset(dest, 0, STRIPE_SIZE);
+	count = 0;
 	for (i = disks ; i--; ) {
 		if (i == dd_idx)
 			continue;
@@ -948,8 +948,8 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 
 		check_xor();
 	}
-	if (count != 1)
-		xor_blocks(count, STRIPE_SIZE, ptr);
+	if (count)
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
 	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 }
 
@@ -957,14 +957,14 @@ static void compute_parity5(struct stripe_head *sh, int method)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-	void *ptr[MAX_XOR_BLOCKS];
+	void *ptr[MAX_XOR_BLOCKS], *dest;
 	struct bio *chosen;
 
 	PRINTK("compute_parity5, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
 
-	count = 1;
-	ptr[0] = page_address(sh->dev[pd_idx].page);
+	count = 0;
+	dest = page_address(sh->dev[pd_idx].page);
 	switch(method) {
 	case READ_MODIFY_WRITE:
 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
@@ -987,7 +987,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 		}
 		break;
 	case RECONSTRUCT_WRITE:
-		memset(ptr[0], 0, STRIPE_SIZE);
+		memset(dest, 0, STRIPE_SIZE);
 		for (i= disks; i-- ;)
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
@@ -1003,9 +1003,9 @@ static void compute_parity5(struct stripe_head *sh, int method)
 	case CHECK_PARITY:
 		break;
 	}
-	if (count>1) {
-		xor_blocks(count, STRIPE_SIZE, ptr);
-		count = 1;
+	if (count) {
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
+		count = 0;
 	}
 	
 	for (i = disks; i--;)
@@ -1037,9 +1037,9 @@ static void compute_parity5(struct stripe_head *sh, int method)
 				check_xor();
 			}
 	}
-	if (count != 1)
-		xor_blocks(count, STRIPE_SIZE, ptr);
-	
+	if (count)
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
+
 	if (method != CHECK_PARITY) {
 		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
@@ -1132,7 +1132,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
 static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	int i, count, disks = sh->disks;
-	void *ptr[MAX_XOR_BLOCKS], *p;
+	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
 	int pd_idx = sh->pd_idx;
 	int qd_idx = raid6_next_disk(pd_idx, disks);
 
@@ -1143,9 +1143,9 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 		/* We're actually computing the Q drive */
 		compute_parity6(sh, UPDATE_PARITY);
 	} else {
-		ptr[0] = page_address(sh->dev[dd_idx].page);
-		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
-		count = 1;
+		dest = page_address(sh->dev[dd_idx].page);
+		if (!nozero) memset(dest, 0, STRIPE_SIZE);
+		count = 0;
 		for (i = disks ; i--; ) {
 			if (i == dd_idx || i == qd_idx)
 				continue;
@@ -1159,8 +1159,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 
 			check_xor();
 		}
-		if (count != 1)
-			xor_blocks(count, STRIPE_SIZE, ptr);
+		if (count)
+			xor_blocks(count, STRIPE_SIZE, dest, ptr);
 		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
new file mode 100644
index 000000000000..ff1255079fa1
--- /dev/null
+++ b/include/linux/async_tx.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef _ASYNC_TX_H_
+#define _ASYNC_TX_H_
+#include <linux/dmaengine.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+/**
+ * dma_chan_ref - object used to manage dma channels received from the
+ *   dmaengine core.
+ * @chan - the channel being tracked
+ * @node - node for the channel to be placed on async_tx_master_list
+ * @rcu - for list_del_rcu
+ * @count - number of times this channel is listed in the pool
+ *	(for channels with multiple capabiities)
+ */
+struct dma_chan_ref {
+	struct dma_chan *chan;
+	struct list_head node;
+	struct rcu_head rcu;
+	atomic_t count;
+};
+
+/**
+ * async_tx_flags - modifiers for the async_* calls
+ * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where the
+ * the destination address is not a source.  The asynchronous case handles this
+ * implicitly, the synchronous case needs to zero the destination block.
+ * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
+ * also one of the source addresses.  In the synchronous case the destination
+ * address is an implied source, whereas the asynchronous case it must be listed
+ * as a source.  The destination address must be the first address in the source
+ * array.
+ * @ASYNC_TX_ASSUME_COHERENT: skip cache maintenance operations
+ * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
+ * dependency chain
+ * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
+ * @ASYNC_TX_KMAP_SRC: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the source page(s)
+ * @ASYNC_TX_KMAP_DST: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the dest page(s)
+ */
+enum async_tx_flags {
+	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
+	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
+	ASYNC_TX_ASSUME_COHERENT = (1 << 2),
+	ASYNC_TX_ACK		 = (1 << 3),
+	ASYNC_TX_DEP_ACK	 = (1 << 4),
+	ASYNC_TX_KMAP_SRC	 = (1 << 5),
+	ASYNC_TX_KMAP_DST	 = (1 << 6),
+};
+
+#ifdef CONFIG_DMA_ENGINE
+void async_tx_issue_pending_all(void);
+enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
+void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx);
+struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type);
+#else
+static inline void async_tx_issue_pending_all(void)
+{
+	do { } while (0);
+}
+
+static inline enum dma_status
+dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
+{
+	return DMA_SUCCESS;
+}
+
+static inline void
+async_tx_run_dependencies(struct dma_async_tx_descriptor *tx,
+	struct dma_chan *host_chan)
+{
+	do { } while (0);
+}
+
+static inline struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * async_tx_sync_epilog - actions to take if an operation is run synchronously
+ * @flags: async_tx flags
+ * @depend_tx: transaction depends on depend_tx
+ * @cb_fn: function to call when the transaction completes
+ * @cb_fn_param: parameter to pass to the callback routine
+ */
+static inline void
+async_tx_sync_epilog(unsigned long flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param)
+{
+	if (cb_fn)
+		cb_fn(cb_fn_param);
+
+	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+		async_tx_ack(depend_tx);
+}
+
+void
+async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor_zero_sum(struct page *dest, struct page **src_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *result, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
+	unsigned int src_offset, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memset(struct page *dest, int val, unsigned int offset,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_trigger_callback(enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+#endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index 7d6c20b654fa..3e120587eada 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -3,9 +3,10 @@
 
 #include <linux/raid/md.h>
 
-#define MAX_XOR_BLOCKS 5
+#define MAX_XOR_BLOCKS 4
 
-extern void xor_blocks(unsigned int count, unsigned int bytes, void **ptr);
+extern void xor_blocks(unsigned int count, unsigned int bytes,
+	void *dest, void **srcs);
 
 struct xor_block_template {
         struct xor_block_template *next;
-- 
cgit v1.2.3


From a445685647e825c713175d180ffc8dd54d90589b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:43 -0700
Subject: raid5: refactor handle_stripe5 and handle_stripe6 (v3)

handle_stripe5 and handle_stripe6 have very deep logic paths handling the
various states of a stripe_head.  By introducing the 'stripe_head_state'
and 'r6_state' objects, large portions of the logic can be moved to
sub-routines.

'struct stripe_head_state' consumes all of the automatic variables that previously
stood alone in handle_stripe5,6.  'struct r6_state' contains the handle_stripe6
specific variables like p_failed and q_failed.

One of the nice side effects of the 'stripe_head_state' change is that it
allows for further reductions in code duplication between raid5 and raid6.
The following new routines are shared between raid5 and raid6:

	handle_completed_write_requests
	handle_requests_to_failed_array
	handle_stripe_expansion

Changes:
* v2: fixed 'conf->raid_disk-1' for the raid6 'handle_stripe_expansion' path
* v3: removed the unused 'dirty' field from struct stripe_head_state
* v3: coalesced open coded bi_end_io routines into return_io()

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 1526 +++++++++++++++++++++-----------------------
 include/linux/raid/raid5.h |   16 +
 2 files changed, 756 insertions(+), 786 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4f51dfa8e487..38232fa111a4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -104,6 +104,23 @@ static inline int raid6_next_disk(int disk, int raid_disks)
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }
+
+static void return_io(struct bio *return_bi)
+{
+	struct bio *bi = return_bi;
+	while (bi) {
+		int bytes = bi->bi_size;
+
+		return_bi = bi->bi_next;
+		bi->bi_next = NULL;
+		bi->bi_size = 0;
+		bi->bi_end_io(bi, bytes,
+			      test_bit(BIO_UPTODATE, &bi->bi_flags)
+			        ? 0 : -EIO);
+		bi = return_bi;
+	}
+}
+
 static void print_raid5_conf (raid5_conf_t *conf);
 
 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -1326,6 +1343,608 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 	return pd_idx;
 }
 
+static void
+handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s, int disks,
+				struct bio **return_bi)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct bio *bi;
+		int bitmap_end = 0;
+
+		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			mdk_rdev_t *rdev;
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->disks[i].rdev);
+			if (rdev && test_bit(In_sync, &rdev->flags))
+				/* multiple read failures in one stripe */
+				md_error(conf->mddev, rdev);
+			rcu_read_unlock();
+		}
+		spin_lock_irq(&conf->device_lock);
+		/* fail all writes first */
+		bi = sh->dev[i].towrite;
+		sh->dev[i].towrite = NULL;
+		if (bi) {
+			s->to_write--;
+			bitmap_end = 1;
+		}
+
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+			wake_up(&conf->wait_for_overlap);
+
+		while (bi && bi->bi_sector <
+			sh->dev[i].sector + STRIPE_SECTORS) {
+			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			if (--bi->bi_phys_segments == 0) {
+				md_write_end(conf->mddev);
+				bi->bi_next = *return_bi;
+				*return_bi = bi;
+			}
+			bi = nextbi;
+		}
+		/* and fail all 'written' */
+		bi = sh->dev[i].written;
+		sh->dev[i].written = NULL;
+		if (bi) bitmap_end = 1;
+		while (bi && bi->bi_sector <
+		       sh->dev[i].sector + STRIPE_SECTORS) {
+			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			if (--bi->bi_phys_segments == 0) {
+				md_write_end(conf->mddev);
+				bi->bi_next = *return_bi;
+				*return_bi = bi;
+			}
+			bi = bi2;
+		}
+
+		/* fail any reads if this device is non-operational */
+		if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+		    test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			bi = sh->dev[i].toread;
+			sh->dev[i].toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+			if (bi) s->to_read--;
+			while (bi && bi->bi_sector <
+			       sh->dev[i].sector + STRIPE_SECTORS) {
+				struct bio *nextbi =
+					r5_next_bio(bi, sh->dev[i].sector);
+				clear_bit(BIO_UPTODATE, &bi->bi_flags);
+				if (--bi->bi_phys_segments == 0) {
+					bi->bi_next = *return_bi;
+					*return_bi = bi;
+				}
+				bi = nextbi;
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+		if (bitmap_end)
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS, 0, 0);
+	}
+
+}
+
+static void handle_issuing_new_read_requests5(struct stripe_head *sh,
+			struct stripe_head_state *s, int disks)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (!test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags) &&
+		    (dev->toread ||
+		     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+		     s->syncing || s->expanding ||
+		     (s->failed && (sh->dev[s->failed_num].toread ||
+			(sh->dev[s->failed_num].towrite &&
+			!test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags))
+		      )))) {
+			/* we would like to get this block, possibly
+			 * by computing it, but we might not be able to
+			 */
+			if (s->uptodate == disks-1) {
+				PRINTK("Computing block %d\n", i);
+				compute_block(sh, i);
+				s->uptodate++;
+			} else if (test_bit(R5_Insync, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				s->locked++;
+				PRINTK("Reading block %d (sync=%d)\n",
+					i, s->syncing);
+			}
+		}
+	}
+	set_bit(STRIPE_HANDLE, &sh->state);
+}
+
+static void handle_issuing_new_read_requests6(struct stripe_head *sh,
+			struct stripe_head_state *s, struct r6_state *r6s,
+			int disks)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (!test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags) &&
+		    (dev->toread || (dev->towrite &&
+		     !test_bit(R5_OVERWRITE, &dev->flags)) ||
+		     s->syncing || s->expanding ||
+		     (s->failed >= 1 &&
+		      (sh->dev[r6s->failed_num[0]].toread ||
+		       s->to_write)) ||
+		     (s->failed >= 2 &&
+		      (sh->dev[r6s->failed_num[1]].toread ||
+		       s->to_write)))) {
+			/* we would like to get this block, possibly
+			 * by computing it, but we might not be able to
+			 */
+			if (s->uptodate == disks-1) {
+				PRINTK("Computing stripe %llu block %d\n",
+				       (unsigned long long)sh->sector, i);
+				compute_block_1(sh, i, 0);
+				s->uptodate++;
+			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
+				/* Computing 2-failure is *very* expensive; only
+				 * do it if failed >= 2
+				 */
+				int other;
+				for (other = disks; other--; ) {
+					if (other == i)
+						continue;
+					if (!test_bit(R5_UPTODATE,
+					      &sh->dev[other].flags))
+						break;
+				}
+				BUG_ON(other < 0);
+				PRINTK("Computing stripe %llu blocks %d,%d\n",
+				       (unsigned long long)sh->sector,
+				       i, other);
+				compute_block_2(sh, i, other);
+				s->uptodate += 2;
+			} else if (test_bit(R5_Insync, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				s->locked++;
+				PRINTK("Reading block %d (sync=%d)\n",
+					i, s->syncing);
+			}
+		}
+	}
+	set_bit(STRIPE_HANDLE, &sh->state);
+}
+
+
+/* handle_completed_write_requests
+ * any written block on an uptodate or failed drive can be returned.
+ * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
+ * never LOCKED, so we don't need to test 'failed' directly.
+ */
+static void handle_completed_write_requests(raid5_conf_t *conf,
+	struct stripe_head *sh, int disks, struct bio **return_bi)
+{
+	int i;
+	struct r5dev *dev;
+
+	for (i = disks; i--; )
+		if (sh->dev[i].written) {
+			dev = &sh->dev[i];
+			if (!test_bit(R5_LOCKED, &dev->flags) &&
+				test_bit(R5_UPTODATE, &dev->flags)) {
+				/* We can return any write requests */
+				struct bio *wbi, *wbi2;
+				int bitmap_end = 0;
+				PRINTK("Return write for disc %d\n", i);
+				spin_lock_irq(&conf->device_lock);
+				wbi = dev->written;
+				dev->written = NULL;
+				while (wbi && wbi->bi_sector <
+					dev->sector + STRIPE_SECTORS) {
+					wbi2 = r5_next_bio(wbi, dev->sector);
+					if (--wbi->bi_phys_segments == 0) {
+						md_write_end(conf->mddev);
+						wbi->bi_next = *return_bi;
+						*return_bi = wbi;
+					}
+					wbi = wbi2;
+				}
+				if (dev->towrite == NULL)
+					bitmap_end = 1;
+				spin_unlock_irq(&conf->device_lock);
+				if (bitmap_end)
+					bitmap_endwrite(conf->mddev->bitmap,
+							sh->sector,
+							STRIPE_SECTORS,
+					 !test_bit(STRIPE_DEGRADED, &sh->state),
+							0);
+			}
+		}
+}
+
+static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
+		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
+{
+	int rmw = 0, rcw = 0, i;
+	for (i = disks; i--; ) {
+		/* would I have to read this buffer for read_modify_write */
+		struct r5dev *dev = &sh->dev[i];
+		if ((dev->towrite || i == sh->pd_idx) &&
+		    !test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags))
+				rmw++;
+			else
+				rmw += 2*disks;  /* cannot read it */
+		}
+		/* Would I have to read this buffer for reconstruct_write */
+		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		    !test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags))
+				rcw++;
+			else
+				rcw += 2*disks;
+		}
+	}
+	PRINTK("for sector %llu, rmw=%d rcw=%d\n",
+		(unsigned long long)sh->sector, rmw, rcw);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	if (rmw < rcw && rmw > 0)
+		/* prefer read-modify-write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if ((dev->towrite || i == sh->pd_idx) &&
+			    !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old block "
+						"%d for r-m-w\n", i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	if (rcw <= rmw && rcw > 0)
+		/* want reconstruct write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+			    i != sh->pd_idx &&
+			    !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old block "
+						"%d for Reconstruct\n", i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	/* now if nothing is locked, and if we have enough data,
+	 * we can start a write request
+	 */
+	if (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		PRINTK("Computing parity...\n");
+		compute_parity5(sh, rcw == 0 ?
+			RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+		/* now every locked buffer is ready to be written */
+		for (i = disks; i--; )
+			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+				PRINTK("Writing block %d\n", i);
+				s->locked++;
+				set_bit(R5_Wantwrite, &sh->dev[i].flags);
+				if (!test_bit(R5_Insync, &sh->dev[i].flags)
+				    || (i == sh->pd_idx && s->failed == 0))
+					set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&conf->preread_active_stripes);
+			if (atomic_read(&conf->preread_active_stripes) <
+			    IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+	}
+}
+
+static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
+		struct stripe_head *sh,	struct stripe_head_state *s,
+		struct r6_state *r6s, int disks)
+{
+	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
+	int qd_idx = r6s->qd_idx;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* Would I have to read this buffer for reconstruct_write */
+		if (!test_bit(R5_OVERWRITE, &dev->flags)
+		    && i != pd_idx && i != qd_idx
+		    && (!test_bit(R5_LOCKED, &dev->flags)
+			    ) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags)) rcw++;
+			else {
+				PRINTK("raid6: must_compute: "
+					"disk %d flags=%#lx\n", i, dev->flags);
+				must_compute++;
+			}
+		}
+	}
+	PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
+	       (unsigned long long)sh->sector, rcw, must_compute);
+	set_bit(STRIPE_HANDLE, &sh->state);
+
+	if (rcw > 0)
+		/* want reconstruct write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (!test_bit(R5_OVERWRITE, &dev->flags)
+			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
+			    && !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old stripe %llu "
+						"block %d for Reconstruct\n",
+					     (unsigned long long)sh->sector, i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					PRINTK("Request delayed stripe %llu "
+						"block %d for Reconstruct\n",
+					     (unsigned long long)sh->sector, i);
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	/* now if nothing is locked, and if we have enough data, we can start a
+	 * write request
+	 */
+	if (s->locked == 0 && rcw == 0 &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		if (must_compute > 0) {
+			/* We have failed blocks and need to compute them */
+			switch (s->failed) {
+			case 0:
+				BUG();
+			case 1:
+				compute_block_1(sh, r6s->failed_num[0], 0);
+				break;
+			case 2:
+				compute_block_2(sh, r6s->failed_num[0],
+						r6s->failed_num[1]);
+				break;
+			default: /* This request should have been failed? */
+				BUG();
+			}
+		}
+
+		PRINTK("Computing parity for stripe %llu\n",
+			(unsigned long long)sh->sector);
+		compute_parity6(sh, RECONSTRUCT_WRITE);
+		/* now every locked buffer is ready to be written */
+		for (i = disks; i--; )
+			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+				PRINTK("Writing stripe %llu block %d\n",
+				       (unsigned long long)sh->sector, i);
+				s->locked++;
+				set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			}
+		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+		set_bit(STRIPE_INSYNC, &sh->state);
+
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&conf->preread_active_stripes);
+			if (atomic_read(&conf->preread_active_stripes) <
+			    IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+	}
+}
+
+static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s, int disks)
+{
+	set_bit(STRIPE_HANDLE, &sh->state);
+	if (s->failed == 0) {
+		BUG_ON(s->uptodate != disks);
+		compute_parity5(sh, CHECK_PARITY);
+		s->uptodate--;
+		if (page_is_zero(sh->dev[sh->pd_idx].page)) {
+			/* parity is correct (on disc, not in buffer any more)
+			 */
+			set_bit(STRIPE_INSYNC, &sh->state);
+		} else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				compute_block(sh, sh->pd_idx);
+				s->uptodate++;
+			}
+		}
+	}
+	if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+		struct r5dev *dev;
+		/* either failed parity check, or recovery is happening */
+		if (s->failed == 0)
+			s->failed_num = sh->pd_idx;
+		dev = &sh->dev[s->failed_num];
+		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+		BUG_ON(s->uptodate != disks);
+
+		set_bit(R5_LOCKED, &dev->flags);
+		set_bit(R5_Wantwrite, &dev->flags);
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+		s->locked++;
+		set_bit(STRIPE_INSYNC, &sh->state);
+	}
+}
+
+
+static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s,
+				struct r6_state *r6s, struct page *tmp_page,
+				int disks)
+{
+	int update_p = 0, update_q = 0;
+	struct r5dev *dev;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = r6s->qd_idx;
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+
+	BUG_ON(s->failed > 2);
+	BUG_ON(s->uptodate < disks);
+	/* Want to check and possibly repair P and Q.
+	 * However there could be one 'failed' device, in which
+	 * case we can only check one of them, possibly using the
+	 * other to generate missing data
+	 */
+
+	/* If !tmp_page, we cannot do the calculations,
+	 * but as we have set STRIPE_HANDLE, we will soon be called
+	 * by stripe_handle with a tmp_page - just wait until then.
+	 */
+	if (tmp_page) {
+		if (s->failed == r6s->q_failed) {
+			/* The only possible failed device holds 'Q', so it
+			 * makes sense to check P (If anything else were failed,
+			 * we would have used P to recreate it).
+			 */
+			compute_block_1(sh, pd_idx, 1);
+			if (!page_is_zero(sh->dev[pd_idx].page)) {
+				compute_block_1(sh, pd_idx, 0);
+				update_p = 1;
+			}
+		}
+		if (!r6s->q_failed && s->failed < 2) {
+			/* q is not failed, and we didn't use it to generate
+			 * anything, so it makes sense to check it
+			 */
+			memcpy(page_address(tmp_page),
+			       page_address(sh->dev[qd_idx].page),
+			       STRIPE_SIZE);
+			compute_parity6(sh, UPDATE_PARITY);
+			if (memcmp(page_address(tmp_page),
+				   page_address(sh->dev[qd_idx].page),
+				   STRIPE_SIZE) != 0) {
+				clear_bit(STRIPE_INSYNC, &sh->state);
+				update_q = 1;
+			}
+		}
+		if (update_p || update_q) {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				update_p = update_q = 0;
+		}
+
+		/* now write out any block on a failed drive,
+		 * or P or Q if they need it
+		 */
+
+		if (s->failed == 2) {
+			dev = &sh->dev[r6s->failed_num[1]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (s->failed >= 1) {
+			dev = &sh->dev[r6s->failed_num[0]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+
+		if (update_p) {
+			dev = &sh->dev[pd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (update_q) {
+			dev = &sh->dev[qd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+
+		set_bit(STRIPE_INSYNC, &sh->state);
+	}
+}
+
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
+				struct r6_state *r6s)
+{
+	int i;
+
+	/* We have read all the blocks in this stripe and now we need to
+	 * copy some of them into a target stripe for expand.
+	 */
+	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	for (i = 0; i < sh->disks; i++)
+		if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
+			int dd_idx, pd_idx, j;
+			struct stripe_head *sh2;
+
+			sector_t bn = compute_blocknr(sh, i);
+			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+						conf->raid_disks -
+						conf->max_degraded, &dd_idx,
+						&pd_idx, conf);
+			sh2 = get_active_stripe(conf, s, conf->raid_disks,
+						pd_idx, 1);
+			if (sh2 == NULL)
+				/* so far only the early blocks of this stripe
+				 * have been requested.  When later blocks
+				 * get requested, we will try again
+				 */
+				continue;
+			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+				/* must have already done this block */
+				release_stripe(sh2);
+				continue;
+			}
+			memcpy(page_address(sh2->dev[dd_idx].page),
+			       page_address(sh->dev[i].page),
+			       STRIPE_SIZE);
+			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+			for (j = 0; j < conf->raid_disks; j++)
+				if (j != sh2->pd_idx &&
+				    (r6s && j != r6s->qd_idx) &&
+				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
+					break;
+			if (j == conf->raid_disks) {
+				set_bit(STRIPE_EXPAND_READY, &sh2->state);
+				set_bit(STRIPE_HANDLE, &sh2->state);
+			}
+			release_stripe(sh2);
+		}
+}
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1344,20 +1963,16 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
  * get BH_Lock set before the stripe lock is released.
  *
  */
- 
+
 static void handle_stripe5(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = sh->disks;
-	struct bio *return_bi= NULL;
-	struct bio *bi;
-	int i;
-	int syncing, expanding, expanded;
-	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite = 0;
-	int failed_num=0;
+	int disks = sh->disks, i;
+	struct bio *return_bi = NULL;
+	struct stripe_head_state s;
 	struct r5dev *dev;
 
+	memset(&s, 0, sizeof(s));
 	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
 		(unsigned long long)sh->sector, atomic_read(&sh->count),
 		sh->pd_idx);
@@ -1366,15 +1981,15 @@ static void handle_stripe5(struct stripe_head *sh)
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
-	syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
-		dev = &sh->dev[i];
+		struct r5dev *dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
@@ -1403,17 +2018,18 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 
-		
-		if (dev->toread) to_read++;
+		if (dev->toread)
+			s.to_read++;
 		if (dev->towrite) {
-			to_write++;
+			s.to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				non_overwrite++;
+				s.non_overwrite++;
 		}
-		if (dev->written) written++;
+		if (dev->written)
+			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -1422,306 +2038,59 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)
 		    || test_bit(R5_ReadError, &dev->flags)) {
-			failed++;
-			failed_num = i;
+			s.failed++;
+			s.failed_num = i;
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
-		locked, uptodate, to_read, to_write, failed, failed_num);
+		s.locked, s.uptodate, s.to_read, s.to_write,
+		s.failed, s.failed_num);
 	/* check if the array has lost two devices and, if so, some requests might
 	 * need to be failed
 	 */
-	if (failed > 1 && to_read+to_write+written) {
-		for (i=disks; i--; ) {
-			int bitmap_end = 0;
-
-			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev;
-				rcu_read_lock();
-				rdev = rcu_dereference(conf->disks[i].rdev);
-				if (rdev && test_bit(In_sync, &rdev->flags))
-					/* multiple read failures in one stripe */
-					md_error(conf->mddev, rdev);
-				rcu_read_unlock();
-			}
-
-			spin_lock_irq(&conf->device_lock);
-			/* fail all writes first */
-			bi = sh->dev[i].towrite;
-			sh->dev[i].towrite = NULL;
-			if (bi) { to_write--; bitmap_end = 1; }
-
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wake_up(&conf->wait_for_overlap);
-
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = nextbi;
-			}
-			/* and fail all 'written' */
-			bi = sh->dev[i].written;
-			sh->dev[i].written = NULL;
-			if (bi) bitmap_end = 1;
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
-				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = bi2;
-			}
-
-			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				bi = sh->dev[i].toread;
-				sh->dev[i].toread = NULL;
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-				if (bi) to_read--;
-				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-					clear_bit(BIO_UPTODATE, &bi->bi_flags);
-					if (--bi->bi_phys_segments == 0) {
-						bi->bi_next = return_bi;
-						return_bi = bi;
-					}
-					bi = nextbi;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-			if (bitmap_end)
-				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						STRIPE_SECTORS, 0, 0);
-		}
-	}
-	if (failed > 1 && syncing) {
+	if (s.failed > 1 && s.to_read+s.to_write+s.written)
+		handle_requests_to_failed_array(conf, sh, &s, disks,
+						&return_bi);
+	if (s.failed > 1 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		syncing = 0;
+		s.syncing = 0;
 	}
 
 	/* might be able to return some write requests if the parity block
 	 * is safe, or on a failed drive
 	 */
 	dev = &sh->dev[sh->pd_idx];
-	if ( written &&
-	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-		test_bit(R5_UPTODATE, &dev->flags))
-	       || (failed == 1 && failed_num == sh->pd_idx))
-	    ) {
-	    /* any written block on an uptodate or failed drive can be returned.
-	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 
-	     * never LOCKED, so we don't need to test 'failed' directly.
-	     */
-	    for (i=disks; i--; )
-		if (sh->dev[i].written) {
-		    dev = &sh->dev[i];
-		    if (!test_bit(R5_LOCKED, &dev->flags) &&
-			 test_bit(R5_UPTODATE, &dev->flags) ) {
-			/* We can return any write requests */
-			    struct bio *wbi, *wbi2;
-			    int bitmap_end = 0;
-			    PRINTK("Return write for disc %d\n", i);
-			    spin_lock_irq(&conf->device_lock);
-			    wbi = dev->written;
-			    dev->written = NULL;
-			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				    wbi2 = r5_next_bio(wbi, dev->sector);
-				    if (--wbi->bi_phys_segments == 0) {
-					    md_write_end(conf->mddev);
-					    wbi->bi_next = return_bi;
-					    return_bi = wbi;
-				    }
-				    wbi = wbi2;
-			    }
-			    if (dev->towrite == NULL)
-				    bitmap_end = 1;
-			    spin_unlock_irq(&conf->device_lock);
-			    if (bitmap_end)
-				    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						    STRIPE_SECTORS,
-						    !test_bit(STRIPE_DEGRADED, &sh->state), 0);
-		    }
-		}
-	}
+	if ( s.written &&
+	     ((test_bit(R5_Insync, &dev->flags) &&
+	       !test_bit(R5_LOCKED, &dev->flags) &&
+	       test_bit(R5_UPTODATE, &dev->flags)) ||
+	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
+		handle_completed_write_requests(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     expanding ||
-			     (failed && (sh->dev[failed_num].toread ||
-					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
-				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing block %d\n", i);
-					compute_block(sh, i);
-					uptodate++;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n", 
-						i, syncing);
-				}
-			}
-		}
-		set_bit(STRIPE_HANDLE, &sh->state);
-	}
+	if (s.to_read || s.non_overwrite ||
+		(s.syncing && (s.uptodate < disks)) || s.expanding)
+		handle_issuing_new_read_requests5(sh, &s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
-		int rmw=0, rcw=0;
-		for (i=disks ; i--;) {
-			/* would I have to read this buffer for read_modify_write */
-			dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)
-/*				    && !(!mddev->insync && i == sh->pd_idx) */
-					)
-					rmw++;
-				else rmw += 2*disks;  /* cannot read it */
-			}
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else rcw += 2*disks;
-			}
-		}
-		PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
-			(unsigned long long)sh->sector, rmw, rcw);
-		set_bit(STRIPE_HANDLE, &sh->state);
-		if (rmw < rcw && rmw > 0)
-			/* prefer read-modify-write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if ((dev->towrite || i == sh->pd_idx) &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for r-m-w\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		if (rcw <= rmw && rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for Reconstruct\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			PRINTK("Computing parity...\n");
-			compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing block %d\n", i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || (i==sh->pd_idx && failed == 0))
-						set_bit(STRIPE_INSYNC, &sh->state);
-				}
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
-	}
+	if (s.to_write)
+		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough data
 	 * is available
 	 */
-	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state)) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-		if (failed == 0) {
-			BUG_ON(uptodate != disks);
-			compute_parity5(sh, CHECK_PARITY);
-			uptodate--;
-			if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-				/* parity is correct (on disc, not in buffer any more) */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			} else {
-				conf->mddev->resync_mismatches += STRIPE_SECTORS;
-				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					set_bit(STRIPE_INSYNC, &sh->state);
-				else {
-					compute_block(sh, sh->pd_idx);
-					uptodate++;
-				}
-			}
-		}
-		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
-			/* either failed parity check, or recovery is happening */
-			if (failed==0)
-				failed_num = sh->pd_idx;
-			dev = &sh->dev[failed_num];
-			BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
-			BUG_ON(uptodate != disks);
-
-			set_bit(R5_LOCKED, &dev->flags);
-			set_bit(R5_Wantwrite, &dev->flags);
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-			locked++;
-			set_bit(STRIPE_INSYNC, &sh->state);
-		}
-	}
-	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s.syncing && s.locked == 0 &&
+	    !test_bit(STRIPE_INSYNC, &sh->state))
+		handle_parity_checks5(conf, sh, &s, disks);
+	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -1729,99 +2098,50 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* If the failed drive is just a ReadError, then we might need to progress
 	 * the repair/check process
 	 */
-	if (failed == 1 && ! conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[failed_num].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
+	if (s.failed == 1 && !conf->mddev->ro &&
+	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
 		) {
-		dev = &sh->dev[failed_num];
+		dev = &sh->dev[s.failed_num];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			locked++;
+			s.locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			locked++;
+			s.locked++;
 		}
 	}
 
-	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
 		compute_parity5(sh, RECONSTRUCT_WRITE);
-		for (i= conf->raid_disks; i--;) {
+		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			locked++;
+			s.locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 		}
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-	} else if (expanded) {
+	} else if (s.expanded) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (expanding && locked == 0) {
-		/* We have read all the blocks in this stripe and now we need to
-		 * copy some of them into a target stripe for expand.
-		 */
-		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-		for (i=0; i< sh->disks; i++)
-			if (i != sh->pd_idx) {
-				int dd_idx, pd_idx, j;
-				struct stripe_head *sh2;
-
-				sector_t bn = compute_blocknr(sh, i);
-				sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-								  conf->raid_disks-1,
-								  &dd_idx, &pd_idx, conf);
-				sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
-				if (sh2 == NULL)
-					/* so far only the early blocks of this stripe
-					 * have been requested.  When later blocks
-					 * get requested, we will try again
-					 */
-					continue;
-				if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
-				   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
-					/* must have already done this block */
-					release_stripe(sh2);
-					continue;
-				}
-				memcpy(page_address(sh2->dev[dd_idx].page),
-				       page_address(sh->dev[i].page),
-				       STRIPE_SIZE);
-				set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
-				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
-				for (j=0; j<conf->raid_disks; j++)
-					if (j != sh2->pd_idx &&
-					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
-						break;
-				if (j == conf->raid_disks) {
-					set_bit(STRIPE_EXPAND_READY, &sh2->state);
-					set_bit(STRIPE_HANDLE, &sh2->state);
-				}
-				release_stripe(sh2);
-			}
-	}
+	if (s.expanding && s.locked == 0)
+		handle_stripe_expansion(conf, sh, NULL);
 
 	spin_unlock(&sh->lock);
 
-	while ((bi=return_bi)) {
-		int bytes = bi->bi_size;
+	return_io(return_bi);
 
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
-	}
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -1850,7 +2170,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		rcu_read_unlock();
  
 		if (rdev) {
-			if (syncing || expanding || expanded)
+			if (s.syncing || s.expanding || s.expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
@@ -1886,29 +2206,26 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
-	struct bio *return_bi= NULL;
-	struct bio *bi;
-	int i;
-	int syncing, expanding, expanded;
-	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite = 0;
-	int failed_num[2] = {0, 0};
+	struct bio *return_bi = NULL;
+	int i, pd_idx = sh->pd_idx;
+	struct stripe_head_state s;
+	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int p_failed, q_failed;
 
-	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
-	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
-	       pd_idx, qd_idx);
+	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
+	PRINTK("handling stripe %llu, state=%#lx cnt=%d, "
+		"pd_idx=%d, qd_idx=%d\n",
+	       (unsigned long long)sh->sector, sh->state,
+	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
-	syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
@@ -1943,17 +2260,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		}
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 
 
-		if (dev->toread) to_read++;
+		if (dev->toread)
+			s.to_read++;
 		if (dev->towrite) {
-			to_write++;
+			s.to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				non_overwrite++;
+				s.non_overwrite++;
 		}
-		if (dev->written) written++;
+		if (dev->written)
+			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -1962,96 +2281,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)
 		    || test_bit(R5_ReadError, &dev->flags)) {
-			if ( failed < 2 )
-				failed_num[failed] = i;
-			failed++;
+			if (s.failed < 2)
+				r6s.failed_num[s.failed] = i;
+			s.failed++;
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
-	       locked, uptodate, to_read, to_write, failed,
-	       failed_num[0], failed_num[1]);
-	/* check if the array has lost >2 devices and, if so, some requests might
-	 * need to be failed
+	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
+	       r6s.failed_num[0], r6s.failed_num[1]);
+	/* check if the array has lost >2 devices and, if so, some requests
+	 * might need to be failed
 	 */
-	if (failed > 2 && to_read+to_write+written) {
-		for (i=disks; i--; ) {
-			int bitmap_end = 0;
-
-			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev;
-				rcu_read_lock();
-				rdev = rcu_dereference(conf->disks[i].rdev);
-				if (rdev && test_bit(In_sync, &rdev->flags))
-					/* multiple read failures in one stripe */
-					md_error(conf->mddev, rdev);
-				rcu_read_unlock();
-			}
-
-			spin_lock_irq(&conf->device_lock);
-			/* fail all writes first */
-			bi = sh->dev[i].towrite;
-			sh->dev[i].towrite = NULL;
-			if (bi) { to_write--; bitmap_end = 1; }
-
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wake_up(&conf->wait_for_overlap);
-
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = nextbi;
-			}
-			/* and fail all 'written' */
-			bi = sh->dev[i].written;
-			sh->dev[i].written = NULL;
-			if (bi) bitmap_end = 1;
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
-				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = bi2;
-			}
-
-			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				bi = sh->dev[i].toread;
-				sh->dev[i].toread = NULL;
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-				if (bi) to_read--;
-				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-					clear_bit(BIO_UPTODATE, &bi->bi_flags);
-					if (--bi->bi_phys_segments == 0) {
-						bi->bi_next = return_bi;
-						return_bi = bi;
-					}
-					bi = nextbi;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-			if (bitmap_end)
-				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						STRIPE_SECTORS, 0, 0);
-		}
-	}
-	if (failed > 2 && syncing) {
+	if (s.failed > 2 && s.to_read+s.to_write+s.written)
+		handle_requests_to_failed_array(conf, sh, &s, disks,
+						&return_bi);
+	if (s.failed > 2 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		syncing = 0;
+		s.syncing = 0;
 	}
 
 	/*
@@ -2059,279 +2309,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 * are safe, or on a failed drive
 	 */
 	pdev = &sh->dev[pd_idx];
-	p_failed = (failed >= 1 && failed_num[0] == pd_idx)
-		|| (failed >= 2 && failed_num[1] == pd_idx);
-	qdev = &sh->dev[qd_idx];
-	q_failed = (failed >= 1 && failed_num[0] == qd_idx)
-		|| (failed >= 2 && failed_num[1] == qd_idx);
-
-	if ( written &&
-	     ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
+	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
+	qdev = &sh->dev[r6s.qd_idx];
+	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+
+	if ( s.written &&
+	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
-	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
-		/* any written block on an uptodate or failed drive can be
-		 * returned.  Note that if we 'wrote' to a failed drive,
-		 * it will be UPTODATE, but never LOCKED, so we don't need
-		 * to test 'failed' directly.
-		 */
-		for (i=disks; i--; )
-			if (sh->dev[i].written) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_LOCKED, &dev->flags) &&
-				    test_bit(R5_UPTODATE, &dev->flags) ) {
-					/* We can return any write requests */
-					int bitmap_end = 0;
-					struct bio *wbi, *wbi2;
-					PRINTK("Return write for stripe %llu disc %d\n",
-					       (unsigned long long)sh->sector, i);
-					spin_lock_irq(&conf->device_lock);
-					wbi = dev->written;
-					dev->written = NULL;
-					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-						wbi2 = r5_next_bio(wbi, dev->sector);
-						if (--wbi->bi_phys_segments == 0) {
-							md_write_end(conf->mddev);
-							wbi->bi_next = return_bi;
-							return_bi = wbi;
-						}
-						wbi = wbi2;
-					}
-					if (dev->towrite == NULL)
-						bitmap_end = 1;
-					spin_unlock_irq(&conf->device_lock);
-					if (bitmap_end)
-						bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-								STRIPE_SECTORS,
-								!test_bit(STRIPE_DEGRADED, &sh->state), 0);
-				}
-			}
-	}
+			     && test_bit(R5_UPTODATE, &qdev->flags)))))
+		handle_completed_write_requests(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (to_write && failed) ||
-	    (syncing && (uptodate < disks)) || expanding) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     expanding ||
-			     (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
-			     (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
-				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					compute_block_1(sh, i, 0);
-					uptodate++;
-				} else if ( uptodate == disks-2 && failed >= 2 ) {
-					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
-					int other;
-					for (other=disks; other--;) {
-						if ( other == i )
-							continue;
-						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
-							break;
-					}
-					BUG_ON(other < 0);
-					PRINTK("Computing stripe %llu blocks %d,%d\n",
-					       (unsigned long long)sh->sector, i, other);
-					compute_block_2(sh, i, other);
-					uptodate += 2;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n",
-						i, syncing);
-				}
-			}
-		}
-		set_bit(STRIPE_HANDLE, &sh->state);
-	}
+	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
+	    (s.syncing && (s.uptodate < disks)) || s.expanding)
+		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
-		int rcw=0, must_compute=0;
-		for (i=disks ; i--;) {
-			dev = &sh->dev[i];
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags)
-			    && i != pd_idx && i != qd_idx
-			    && (!test_bit(R5_LOCKED, &dev->flags)
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else {
-					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
-					must_compute++;
-				}
-			}
-		}
-		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
-		       (unsigned long long)sh->sector, rcw, must_compute);
-		set_bit(STRIPE_HANDLE, &sh->state);
-
-		if (rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags)
-				    && !(failed == 0 && (i == pd_idx || i == qd_idx))
-				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && rcw == 0 &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			if ( must_compute > 0 ) {
-				/* We have failed blocks and need to compute them */
-				switch ( failed ) {
-				case 0:	BUG();
-				case 1: compute_block_1(sh, failed_num[0], 0); break;
-				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
-				default: BUG();	/* This request should have been failed? */
-				}
-			}
-
-			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
-			compute_parity6(sh, RECONSTRUCT_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-				}
-			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-			set_bit(STRIPE_INSYNC, &sh->state);
-
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
-	}
+	if (s.to_write)
+		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough data
-	 * is available
+	 * Any reads will already have been scheduled, so we just see if enough
+	 * data is available
 	 */
-	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
-		int update_p = 0, update_q = 0;
-		struct r5dev *dev;
-
-		set_bit(STRIPE_HANDLE, &sh->state);
+	if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
+		handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
 
-		BUG_ON(failed>2);
-		BUG_ON(uptodate < disks);
-		/* Want to check and possibly repair P and Q.
-		 * However there could be one 'failed' device, in which
-		 * case we can only check one of them, possibly using the
-		 * other to generate missing data
-		 */
-
-		/* If !tmp_page, we cannot do the calculations,
-		 * but as we have set STRIPE_HANDLE, we will soon be called
-		 * by stripe_handle with a tmp_page - just wait until then.
-		 */
-		if (tmp_page) {
-			if (failed == q_failed) {
-				/* The only possible failed device holds 'Q', so it makes
-				 * sense to check P (If anything else were failed, we would
-				 * have used P to recreate it).
-				 */
-				compute_block_1(sh, pd_idx, 1);
-				if (!page_is_zero(sh->dev[pd_idx].page)) {
-					compute_block_1(sh,pd_idx,0);
-					update_p = 1;
-				}
-			}
-			if (!q_failed && failed < 2) {
-				/* q is not failed, and we didn't use it to generate
-				 * anything, so it makes sense to check it
-				 */
-				memcpy(page_address(tmp_page),
-				       page_address(sh->dev[qd_idx].page),
-				       STRIPE_SIZE);
-				compute_parity6(sh, UPDATE_PARITY);
-				if (memcmp(page_address(tmp_page),
-					   page_address(sh->dev[qd_idx].page),
-					   STRIPE_SIZE)!= 0) {
-					clear_bit(STRIPE_INSYNC, &sh->state);
-					update_q = 1;
-				}
-			}
-			if (update_p || update_q) {
-				conf->mddev->resync_mismatches += STRIPE_SECTORS;
-				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					update_p = update_q = 0;
-			}
-
-			/* now write out any block on a failed drive,
-			 * or P or Q if they need it
-			 */
-
-			if (failed == 2) {
-				dev = &sh->dev[failed_num[1]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (failed >= 1) {
-				dev = &sh->dev[failed_num[0]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-
-			if (update_p) {
-				dev = &sh->dev[pd_idx];
-				locked ++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (update_q) {
-				dev = &sh->dev[qd_idx];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-
-			set_bit(STRIPE_INSYNC, &sh->state);
-		}
-	}
-
-	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -2339,9 +2351,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	/* If the failed drives are just a ReadError, then we might need
 	 * to progress the repair/check process
 	 */
-	if (failed <= 2 && ! conf->mddev->ro)
-		for (i=0; i<failed;i++) {
-			dev = &sh->dev[failed_num[i]];
+	if (s.failed <= 2 && !conf->mddev->ro)
+		for (i = 0; i < s.failed; i++) {
+			dev = &sh->dev[r6s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
 			    && !test_bit(R5_LOCKED, &dev->flags)
 			    && test_bit(R5_UPTODATE, &dev->flags)
@@ -2358,7 +2370,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 			}
 		}
 
-	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
@@ -2366,82 +2378,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ;  ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			locked++;
+			s.locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 		}
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-	} else if (expanded) {
+	} else if (s.expanded) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (expanding && locked == 0) {
-		/* We have read all the blocks in this stripe and now we need to
-		 * copy some of them into a target stripe for expand.
-		 */
-		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-		for (i = 0; i < sh->disks ; i++)
-			if (i != pd_idx && i != qd_idx) {
-				int dd_idx2, pd_idx2, j;
-				struct stripe_head *sh2;
-
-				sector_t bn = compute_blocknr(sh, i);
-				sector_t s = raid5_compute_sector(
-					bn, conf->raid_disks,
-					conf->raid_disks - conf->max_degraded,
-					&dd_idx2, &pd_idx2, conf);
-				sh2 = get_active_stripe(conf, s,
-							conf->raid_disks,
-						       pd_idx2, 1);
-				if (sh2 == NULL)
-					/* so for only the early blocks of
-					 * this stripe have been requests.
-					 * When later blocks get requests, we
-					 * will try again
-					 */
-					continue;
-				if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
-				    test_bit(R5_Expanded,
-					     &sh2->dev[dd_idx2].flags)) {
-					/* must have already done this block */
-					release_stripe(sh2);
-					continue;
-				}
-				memcpy(page_address(sh2->dev[dd_idx2].page),
-				       page_address(sh->dev[i].page),
-				       STRIPE_SIZE);
-				set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
-				set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
-				for (j = 0 ; j < conf->raid_disks ; j++)
-					if (j != sh2->pd_idx &&
-					    j != raid6_next_disk(sh2->pd_idx,
-							   sh2->disks) &&
-					    !test_bit(R5_Expanded,
-						      &sh2->dev[j].flags))
-						break;
-				if (j == conf->raid_disks) {
-					set_bit(STRIPE_EXPAND_READY,
-						&sh2->state);
-					set_bit(STRIPE_HANDLE, &sh2->state);
-				}
-				release_stripe(sh2);
-			}
-	}
+	if (s.expanding && s.locked == 0)
+		handle_stripe_expansion(conf, sh, &r6s);
 
 	spin_unlock(&sh->lock);
 
-	while ((bi=return_bi)) {
-		int bytes = bi->bi_size;
+	return_io(return_bi);
 
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
-	}
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -2470,7 +2424,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (syncing || expanding || expanded)
+			if (s.syncing || s.expanding || s.expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index d8286db60b96..b99d354f6128 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -145,6 +145,22 @@ struct stripe_head {
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ */
+struct stripe_head_state {
+	int syncing, expanding, expanded;
+	int locked, uptodate, to_read, to_write, failed, written;
+	int non_overwrite;
+	int failed_num;
+};
+
+/* r6_state - extra state data only relevant to r6 */
+struct r6_state {
+	int p_failed, q_failed, qd_idx, failed_num[2];
+};
+
 /* Flags */
 #define	R5_UPTODATE	0	/* page contains current data */
 #define	R5_LOCKED	1	/* IO has been submitted on "req" */
-- 
cgit v1.2.3


From 91c00924846a0034020451c280c76baa4299f9dc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:30 -0700
Subject: md: raid5_run_ops - run stripe operations outside sh->lock

When the raid acceleration work was proposed, Neil laid out the following
attack plan:

1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api

The raid5_run_ops routine uses the asynchronous offload api (async_tx) and
the stripe_operations member of a stripe_head to carry out xor+copy
operations asynchronously, outside the lock.

To perform operations outside the lock a new set of state flags is needed
to track new requests, in-flight requests, and completed requests.  In this
new model handle_stripe is tasked with scanning the stripe_head for work,
updating the stripe_operations structure, and finally dropping the lock and
calling raid5_run_ops for processing.  The following flags outline the
requests that handle_stripe can make of raid5_run_ops:

STRIPE_OP_BIOFILL
 - copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
 - generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR
 - subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN
 - copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR
 - recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
 - verify that the parity is correct
STRIPE_OP_IO
 - submit i/o to the member disks (note this was already performed outside
   the stripe lock, but it made sense to add it as an operation type

The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the
   operation to the async_tx api
3/ async_tx triggers the completion callback routine to set
   sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit
   new operations that were previously blocked

Note this patch just defines raid5_run_ops, subsequent commits (one per
major operation type) modify handle_stripe to take advantage of this
routine.

Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
  ops_complete_write.
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to
  handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching, this feature
  was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 536 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/raid/raid5.h |  81 ++++++-
 2 files changed, 614 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e372e57687ee..0b7002479655 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
 #include "raid6.h"
 
 #include <linux/raid/bitmap.h>
+#include <linux/async_tx.h>
 
 /*
  * Stripe cache
@@ -341,6 +342,541 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 	return sh;
 }
 
+static int
+raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
+static int
+raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+
+static void ops_run_io(struct stripe_head *sh)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, disks = sh->disks;
+
+	might_sleep();
+
+	for (i = disks; i--; ) {
+		int rw;
+		struct bio *bi;
+		mdk_rdev_t *rdev;
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+			rw = WRITE;
+		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+			rw = READ;
+		else
+			continue;
+
+		bi = &sh->dev[i].req;
+
+		bi->bi_rw = rw;
+		if (rw == WRITE)
+			bi->bi_end_io = raid5_end_write_request;
+		else
+			bi->bi_end_io = raid5_end_read_request;
+
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && test_bit(Faulty, &rdev->flags))
+			rdev = NULL;
+		if (rdev)
+			atomic_inc(&rdev->nr_pending);
+		rcu_read_unlock();
+
+		if (rdev) {
+			if (test_bit(STRIPE_SYNCING, &sh->state) ||
+				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
+				test_bit(STRIPE_EXPAND_READY, &sh->state))
+				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+			bi->bi_bdev = rdev->bdev;
+			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
+				__FUNCTION__, (unsigned long long)sh->sector,
+				bi->bi_rw, i);
+			atomic_inc(&sh->count);
+			bi->bi_sector = sh->sector + rdev->data_offset;
+			bi->bi_flags = 1 << BIO_UPTODATE;
+			bi->bi_vcnt = 1;
+			bi->bi_max_vecs = 1;
+			bi->bi_idx = 0;
+			bi->bi_io_vec = &sh->dev[i].vec;
+			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+			bi->bi_io_vec[0].bv_offset = 0;
+			bi->bi_size = STRIPE_SIZE;
+			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS,
+					&rdev->corrected_errors);
+			generic_make_request(bi);
+		} else {
+			if (rw == WRITE)
+				set_bit(STRIPE_DEGRADED, &sh->state);
+			pr_debug("skip op %ld on disc %d for sector %llu\n",
+				bi->bi_rw, i, (unsigned long long)sh->sector);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			set_bit(STRIPE_HANDLE, &sh->state);
+		}
+	}
+}
+
+static struct dma_async_tx_descriptor *
+async_copy_data(int frombio, struct bio *bio, struct page *page,
+	sector_t sector, struct dma_async_tx_descriptor *tx)
+{
+	struct bio_vec *bvl;
+	struct page *bio_page;
+	int i;
+	int page_offset;
+
+	if (bio->bi_sector >= sector)
+		page_offset = (signed)(bio->bi_sector - sector) * 512;
+	else
+		page_offset = (signed)(sector - bio->bi_sector) * -512;
+	bio_for_each_segment(bvl, bio, i) {
+		int len = bio_iovec_idx(bio, i)->bv_len;
+		int clen;
+		int b_offset = 0;
+
+		if (page_offset < 0) {
+			b_offset = -page_offset;
+			page_offset += b_offset;
+			len -= b_offset;
+		}
+
+		if (len > 0 && page_offset + len > STRIPE_SIZE)
+			clen = STRIPE_SIZE - page_offset;
+		else
+			clen = len;
+
+		if (clen > 0) {
+			b_offset += bio_iovec_idx(bio, i)->bv_offset;
+			bio_page = bio_iovec_idx(bio, i)->bv_page;
+			if (frombio)
+				tx = async_memcpy(page, bio_page, page_offset,
+					b_offset, clen,
+					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
+					tx, NULL, NULL);
+			else
+				tx = async_memcpy(bio_page, page, b_offset,
+					page_offset, clen,
+					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
+					tx, NULL, NULL);
+		}
+		if (clen < len) /* hit end of page */
+			break;
+		page_offset +=  len;
+	}
+
+	return tx;
+}
+
+static void ops_complete_biofill(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	struct bio *return_bi = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, more_to_read = 0;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	/* clear completed biofills */
+	for (i = sh->disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* check if this stripe has new incoming reads */
+		if (dev->toread)
+			more_to_read++;
+
+		/* acknowledge completion of a biofill operation */
+		/* and check if we need to reply to a read request
+		*/
+		if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
+			struct bio *rbi, *rbi2;
+			clear_bit(R5_Wantfill, &dev->flags);
+
+			/* The access to dev->read is outside of the
+			 * spin_lock_irq(&conf->device_lock), but is protected
+			 * by the STRIPE_OP_BIOFILL pending bit
+			 */
+			BUG_ON(!dev->read);
+			rbi = dev->read;
+			dev->read = NULL;
+			while (rbi && rbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				rbi2 = r5_next_bio(rbi, dev->sector);
+				spin_lock_irq(&conf->device_lock);
+				if (--rbi->bi_phys_segments == 0) {
+					rbi->bi_next = return_bi;
+					return_bi = rbi;
+				}
+				spin_unlock_irq(&conf->device_lock);
+				rbi = rbi2;
+			}
+		}
+	}
+	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+
+	return_io(return_bi);
+
+	if (more_to_read)
+		set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_run_biofill(struct stripe_head *sh)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = sh->disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (test_bit(R5_Wantfill, &dev->flags)) {
+			struct bio *rbi;
+			spin_lock_irq(&conf->device_lock);
+			dev->read = rbi = dev->toread;
+			dev->toread = NULL;
+			spin_unlock_irq(&conf->device_lock);
+			while (rbi && rbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				tx = async_copy_data(0, rbi, dev->page,
+					dev->sector, tx);
+				rbi = r5_next_bio(rbi, dev->sector);
+			}
+		}
+	}
+
+	atomic_inc(&sh->count);
+	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
+		ops_complete_biofill, sh);
+}
+
+static void ops_complete_compute5(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int target = sh->ops.target;
+	struct r5dev *tgt = &sh->dev[target];
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(R5_UPTODATE, &tgt->flags);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	clear_bit(R5_Wantcompute, &tgt->flags);
+	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	int target = sh->ops.target;
+	struct r5dev *tgt = &sh->dev[target];
+	struct page *xor_dest = tgt->page;
+	int count = 0;
+	struct dma_async_tx_descriptor *tx;
+	int i;
+
+	pr_debug("%s: stripe %llu block: %d\n",
+		__FUNCTION__, (unsigned long long)sh->sector, target);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+
+	for (i = disks; i--; )
+		if (i != target)
+			xor_srcs[count++] = sh->dev[i].page;
+
+	atomic_inc(&sh->count);
+
+	if (unlikely(count == 1))
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
+			0, NULL, ops_complete_compute5, sh);
+	else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+			ASYNC_TX_XOR_ZERO_DST, NULL,
+			ops_complete_compute5, sh);
+
+	/* ack now if postxor is not set to be run */
+	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
+		async_tx_ack(tx);
+
+	return tx;
+}
+
+static void ops_complete_prexor(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	int count = 0, pd_idx = sh->pd_idx, i;
+
+	/* existing parity data subtracted */
+	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* Only process blocks that are known to be uptodate */
+		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
+			xor_srcs[count++] = dev->page;
+	}
+
+	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
+		ops_complete_prexor, sh);
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	int pd_idx = sh->pd_idx, i;
+
+	/* check if prexor is active which means only process blocks
+	 * that are part of a read-modify-write (Wantprexor)
+	 */
+	int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *chosen;
+		int towrite;
+
+		towrite = 0;
+		if (prexor) { /* rmw */
+			if (dev->towrite &&
+			    test_bit(R5_Wantprexor, &dev->flags))
+				towrite = 1;
+		} else { /* rcw */
+			if (i != pd_idx && dev->towrite &&
+				test_bit(R5_LOCKED, &dev->flags))
+				towrite = 1;
+		}
+
+		if (towrite) {
+			struct bio *wbi;
+
+			spin_lock(&sh->lock);
+			chosen = dev->towrite;
+			dev->towrite = NULL;
+			BUG_ON(dev->written);
+			wbi = dev->written = chosen;
+			spin_unlock(&sh->lock);
+
+			while (wbi && wbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				tx = async_copy_data(1, wbi, dev->page,
+					dev->sector, tx);
+				wbi = r5_next_bio(wbi, dev->sector);
+			}
+		}
+	}
+
+	return tx;
+}
+
+static void ops_complete_postxor(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_complete_write(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int disks = sh->disks, i, pd_idx = sh->pd_idx;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (dev->written || i == pd_idx)
+			set_bit(R5_UPTODATE, &dev->flags);
+	}
+
+	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct page *xor_dest;
+	int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+	unsigned long flags;
+	dma_async_tx_callback callback;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	/* check if prexor is active which means only process blocks
+	 * that are part of a read-modify-write (written)
+	 */
+	if (prexor) {
+		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (dev->written)
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		xor_dest = sh->dev[pd_idx].page;
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (i != pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	}
+
+	/* check whether this postxor is part of a write */
+	callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
+		ops_complete_write : ops_complete_postxor;
+
+	/* 1/ if we prexor'd then the dest is reused as a source
+	 * 2/ if we did not prexor then we are redoing the parity
+	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
+	 * for the synchronous xor case
+	 */
+	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
+		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+	atomic_inc(&sh->count);
+
+	if (unlikely(count == 1)) {
+		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
+			flags, tx, callback, sh);
+	} else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+			flags, tx, callback, sh);
+}
+
+static void ops_complete_check(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int pd_idx = sh->pd_idx;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
+		sh->ops.zero_sum_result == 0)
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_run_check(struct stripe_head *sh)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	struct dma_async_tx_descriptor *tx;
+
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (i != pd_idx)
+			xor_srcs[count++] = dev->page;
+	}
+
+	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
+
+	if (tx)
+		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
+	else
+		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
+
+	atomic_inc(&sh->count);
+	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
+		ops_complete_check, sh);
+}
+
+static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+{
+	int overlap_clear = 0, i, disks = sh->disks;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
+		ops_run_biofill(sh);
+		overlap_clear++;
+	}
+
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
+		tx = ops_run_compute5(sh, pending);
+
+	if (test_bit(STRIPE_OP_PREXOR, &pending))
+		tx = ops_run_prexor(sh, tx);
+
+	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
+		tx = ops_run_biodrain(sh, tx);
+		overlap_clear++;
+	}
+
+	if (test_bit(STRIPE_OP_POSTXOR, &pending))
+		ops_run_postxor(sh, tx);
+
+	if (test_bit(STRIPE_OP_CHECK, &pending))
+		ops_run_check(sh);
+
+	if (test_bit(STRIPE_OP_IO, &pending))
+		ops_run_io(sh);
+
+	if (overlap_clear)
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&sh->raid_conf->wait_for_overlap);
+		}
+}
+
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index b99d354f6128..6fb9d94e6f2e 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -116,13 +116,46 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ *     lockstripe clrSTRIPE_HANDLE ...
+ *		(lockdev check-buffers unlockdev) ..
+ *		change-state ..
+ *		record io/ops needed unlockstripe schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
  *
  * The refcount counts each thread that have activated the stripe,
  * plus raid5d if it is handling it, plus one for each active request
- * on a cached buffer.
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ *  read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count.  raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ *    so we prevent parity dependent operations like writes and compute_blocks
+ *    from starting while a check is in progress.  Some dma engines can perform
+ *    the check without damaging the parity block, in these cases the parity
+ *    block is re-marked up to date (assuming the check was successful) and is
+ *    not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ *    blocks, and mark them as not up to date.  This causes new read requests
+ *    to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ *    that block as if it is up to date.  raid5_run_ops guaruntees that any
+ *    operation that is dependent on the compute block result is initiated after
+ *    the compute block completes.
  */
 
 struct stripe_head {
@@ -136,11 +169,26 @@ struct stripe_head {
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
+	/* stripe_operations
+	 * @pending - pending ops flags (set for request->issue->complete)
+	 * @ack - submitted ops flags (set for issue->complete)
+	 * @complete - completed ops flags (set for complete)
+	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 * @count - raid5_runs_ops is set to run when this is non-zero
+	 */
+	struct stripe_operations {
+		unsigned long	   pending;
+		unsigned long	   ack;
+		unsigned long	   complete;
+		int		   target;
+		int		   count;
+		u32		   zero_sum_result;
+	} ops;
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
 		struct page	*page;
-		struct bio	*toread, *towrite, *written;
+		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
@@ -174,6 +222,15 @@ struct r6_state {
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Wantcompute	11 /* compute_block in progress treat as
+				    * uptodate
+				    */
+#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
+				    * filling
+				    */
+#define	R5_Wantprexor	13 /* distinguish blocks ready for rmw from
+				    * other "towrites"
+				    */
 /*
  * Write method
  */
@@ -195,6 +252,24 @@ struct r6_state {
 #define	STRIPE_EXPANDING	9
 #define	STRIPE_EXPAND_SOURCE	10
 #define	STRIPE_EXPAND_READY	11
+/*
+ * Operations flags (in issue order)
+ */
+#define STRIPE_OP_BIOFILL	0
+#define STRIPE_OP_COMPUTE_BLK	1
+#define STRIPE_OP_PREXOR	2
+#define STRIPE_OP_BIODRAIN	3
+#define STRIPE_OP_POSTXOR	4
+#define STRIPE_OP_CHECK	5
+#define STRIPE_OP_IO		6
+
+/* modifiers to the base operations
+ * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
+ * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
+ */
+#define STRIPE_OP_MOD_REPAIR_PD 7
+#define STRIPE_OP_MOD_DMA_CHECK 8
+
 /*
  * Plugging:
  *
-- 
cgit v1.2.3


From f38e12199a94ca458e4f03c5a2c984fb80adadc5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:30 -0700
Subject: md: handle_stripe5 - add request/completion logic for async compute
 ops

handle_stripe will compute a block when a backing disk has failed, or when
it determines it can save a disk read by computing the block from all the
other up-to-date blocks.

Previously a block would be computed under the lock and subsequent logic in
handle_stripe could use the newly up-to-date block.  With the raid5_run_ops
implementation the compute operation is carried out a later time outside
the lock.  To preserve the old functionality we take advantage of the
dependency chain feature of async_tx to flag the block as R5_Wantcompute
and then let other parts of handle_stripe operate on the block as if it
were up-to-date.  raid5_run_ops guarantees that the block will be ready
before it is used in another operation.

However, this only works in cases where the compute and the dependent
operation are scheduled at the same time.  If a previous call to
handle_stripe sets the R5_Wantcompute flag there is no facility to pass the
async_tx dependency chain across successive calls to raid5_run_ops.  The
req_compute variable protects against this case.

Changelog:
* remove the req_compute BUG_ON

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 149 ++++++++++++++++++++++++++++++++++-----------
 include/linux/raid/raid5.h |   2 +-
 2 files changed, 115 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d9521aa69461..42439a4c1c51 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2077,36 +2077,101 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 
 }
 
+/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
+ * to process
+ */
+static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
+			struct stripe_head_state *s, int disk_idx, int disks)
+{
+	struct r5dev *dev = &sh->dev[disk_idx];
+	struct r5dev *failed_dev = &sh->dev[s->failed_num];
+
+	/* don't schedule compute operations or reads on the parity block while
+	 * a check is in flight
+	 */
+	if ((disk_idx == sh->pd_idx) &&
+	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+		return ~0;
+
+	/* is the data in this block needed, and can we get it? */
+	if (!test_bit(R5_LOCKED, &dev->flags) &&
+	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
+	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+	     s->syncing || s->expanding || (s->failed &&
+	     (failed_dev->toread || (failed_dev->towrite &&
+	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
+	     ))))) {
+		/* 1/ We would like to get this block, possibly by computing it,
+		 * but we might not be able to.
+		 *
+		 * 2/ Since parity check operations potentially make the parity
+		 * block !uptodate it will need to be refreshed before any
+		 * compute operations on data disks are scheduled.
+		 *
+		 * 3/ We hold off parity block re-reads until check operations
+		 * have quiesced.
+		 */
+		if ((s->uptodate == disks - 1) &&
+		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+			set_bit(R5_Wantcompute, &dev->flags);
+			sh->ops.target = disk_idx;
+			s->req_compute = 1;
+			sh->ops.count++;
+			/* Careful: from this point on 'uptodate' is in the eye
+			 * of raid5_run_ops which services 'compute' operations
+			 * before writes. R5_Wantcompute flags a block that will
+			 * be R5_UPTODATE by the time it is needed for a
+			 * subsequent operation.
+			 */
+			s->uptodate++;
+			return 0; /* uptodate + compute == disks */
+		} else if ((s->uptodate < disks - 1) &&
+			test_bit(R5_Insync, &dev->flags)) {
+			/* Note: we hold off compute operations while checks are
+			 * in flight, but we still prefer 'compute' over 'read'
+			 * hence we only read if (uptodate < * disks-1)
+			 */
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantread, &dev->flags);
+			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+				sh->ops.count++;
+			s->locked++;
+			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
+				s->syncing);
+		}
+	}
+
+	return ~0;
+}
+
 static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 			struct stripe_head_state *s, int disks)
 {
 	int i;
-	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		if (!test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags) &&
-		    (dev->toread ||
-		     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-		     s->syncing || s->expanding ||
-		     (s->failed && (sh->dev[s->failed_num].toread ||
-			(sh->dev[s->failed_num].towrite &&
-			!test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags))
-		      )))) {
-			/* we would like to get this block, possibly
-			 * by computing it, but we might not be able to
-			 */
-			if (s->uptodate == disks-1) {
-				pr_debug("Computing block %d\n", i);
-				compute_block(sh, i);
-				s->uptodate++;
-			} else if (test_bit(R5_Insync, &dev->flags)) {
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantread, &dev->flags);
-				s->locked++;
-				pr_debug("Reading block %d (sync=%d)\n",
-					i, s->syncing);
-			}
-		}
+
+	/* Clear completed compute operations.  Parity recovery
+	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
+	 * later on in this routine
+	 */
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+	}
+
+	/* look for blocks to read/compute, skip this if a compute
+	 * is already in flight, or if the stripe contents are in the
+	 * midst of changing due to a write
+	 */
+	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+		for (i = disks; i--; )
+			if (__handle_issuing_new_read_requests5(
+				sh, s, i, disks) == 0)
+				break;
 	}
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
@@ -2223,7 +2288,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 		struct r5dev *dev = &sh->dev[i];
 		if ((dev->towrite || i == sh->pd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags)) {
+		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
 			else
@@ -2232,9 +2298,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 		/* Would I have to read this buffer for reconstruct_write */
 		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags)) {
-			if (test_bit(R5_Insync, &dev->flags))
-				rcw++;
+		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    test_bit(R5_Wantcompute, &dev->flags))) {
+			if (test_bit(R5_Insync, &dev->flags)) rcw++;
 			else
 				rcw += 2*disks;
 		}
@@ -2248,7 +2314,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2270,7 +2337,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
 			    i != sh->pd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2288,8 +2356,17 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	/* now if nothing is locked, and if we have enough data,
 	 * we can start a write request
 	 */
-	if (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state))
+	/* since handle_stripe can be called at any time we need to handle the
+	 * case where a compute block operation has been submitted and then a
+	 * subsequent call wants to start a write request.  raid5_run_ops only
+	 * handles the case where compute block and postxor are requested
+	 * simultaneously.  If this is not the case then new writes need to be
+	 * held off until the compute completes.
+	 */
+	if ((s->req_compute ||
+	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
+		(s->locked == 0 && (rcw == 0 || rmw == 0) &&
+		!test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		s->locked += handle_write_operations5(sh, rcw == 0, 0);
 }
 
@@ -2650,6 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
 
 		if (dev->toread)
 			s.to_read++;
@@ -2706,7 +2784,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite ||
-		(s.syncing && (s.uptodate < disks)) || s.expanding)
+	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
+	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
 		handle_issuing_new_read_requests5(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 6fb9d94e6f2e..2293015de1d5 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -200,7 +200,7 @@ struct stripe_head {
 struct stripe_head_state {
 	int syncing, expanding, expanded;
 	int locked, uptodate, to_read, to_write, failed, written;
-	int non_overwrite;
+	int compute, req_compute, non_overwrite;
 	int failed_num;
 };
 
-- 
cgit v1.2.3


From b5e98d65d34a1c11a2135ea8a9b2619dbc7216c8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:31 -0700
Subject: md: handle_stripe5 - add request/completion logic for async read ops

When a read bio is attached to the stripe and the corresponding block is
marked R5_UPTODATE, then a read (biofill) operation is scheduled to copy
the data from the stripe cache to the bio buffer.  handle_stripe flags the
blocks to be operated on with the R5_Wantfill flag.  If new read requests
arrive while raid5_run_ops is running they will not be handled until
handle_stripe is scheduled to run again.

Changelog:
* cleanup to_read and to_fill accounting
* do not fail reads that have reached the cache

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 53 ++++++++++++++++++++++------------------------
 include/linux/raid/raid5.h |  2 +-
 2 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 810cf831edda..a33dac7c2e2f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2049,9 +2049,12 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 			bi = bi2;
 		}
 
-		/* fail any reads if this device is non-operational */
-		if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-		    test_bit(R5_ReadError, &sh->dev[i].flags)) {
+		/* fail any reads if this device is non-operational and
+		 * the data has not reached the cache yet.
+		 */
+		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
 			bi = sh->dev[i].toread;
 			sh->dev[i].toread = NULL;
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -2740,37 +2743,27 @@ static void handle_stripe5(struct stripe_head *sh)
 		struct r5dev *dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
-		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
-			i, dev->flags, dev->toread, dev->towrite, dev->written);
-		/* maybe we can reply to a read */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-			struct bio *rbi, *rbi2;
-			pr_debug("Return read for disc %d\n", i);
-			spin_lock_irq(&conf->device_lock);
-			rbi = dev->toread;
-			dev->toread = NULL;
-			if (test_and_clear_bit(R5_Overlap, &dev->flags))
-				wake_up(&conf->wait_for_overlap);
-			spin_unlock_irq(&conf->device_lock);
-			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				copy_data(0, rbi, dev->page, dev->sector);
-				rbi2 = r5_next_bio(rbi, dev->sector);
-				spin_lock_irq(&conf->device_lock);
-				if (--rbi->bi_phys_segments == 0) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
-				spin_unlock_irq(&conf->device_lock);
-				rbi = rbi2;
-			}
-		}
+		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
+			"written %p\n",	i, dev->flags, dev->toread, dev->read,
+			dev->towrite, dev->written);
+
+		/* maybe we can request a biofill operation
+		 *
+		 * new wantfill requests are only permitted while
+		 * STRIPE_OP_BIOFILL is clear
+		 */
+		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
 
-		if (dev->toread)
+		if (test_bit(R5_Wantfill, &dev->flags))
+			s.to_fill++;
+		else if (dev->toread)
 			s.to_read++;
 		if (dev->towrite) {
 			s.to_write++;
@@ -2793,6 +2786,10 @@ static void handle_stripe5(struct stripe_head *sh)
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
+
+	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+		sh->ops.count++;
+
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
 		s.locked, s.uptodate, s.to_read, s.to_write,
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 2293015de1d5..93678f57ccbe 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -200,7 +200,7 @@ struct stripe_head {
 struct stripe_head_state {
 	int syncing, expanding, expanded;
 	int locked, uptodate, to_read, to_write, failed, written;
-	int compute, req_compute, non_overwrite;
+	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num;
 };
 
-- 
cgit v1.2.3


From 3039f0735a280b54c7364fbfe6a9287f7f0b510a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 13 Jul 2007 08:06:19 -0700
Subject: ioatdma: add the unisys "i/oat" pci vendor/device id

Cc: John Magolan <john.magolan@unisys.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioatdma.c   | 2 ++
 include/linux/pci_ids.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 81810b3042f1..5fbe56b5cea0 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -517,6 +517,8 @@ static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
 
 static struct pci_device_id ioat_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_UNISYS,
+		     PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
 	{ 0, }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5b1c9994f89a..0275f6917c8e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -475,6 +475,9 @@
 #define PCI_DEVICE_ID_IBM_ICOM_V2_ONE_PORT_RVX_ONE_PORT_MDM_PCIE 0x0361
 #define PCI_DEVICE_ID_IBM_ICOM_FOUR_PORT_MODEL	0x252
 
+#define PCI_VENDOR_ID_UNISYS		0x1018
+#define PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR 0x001C
+
 #define PCI_VENDOR_ID_COMPEX2		0x101a /* pci.ids says "AT&T GIS (NCR)" */
 #define PCI_DEVICE_ID_COMPEX2_100VG	0x0005
 
-- 
cgit v1.2.3