diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/dma/Kconfig | 4 | ||||
-rw-r--r-- | drivers/dma/dmaengine.c | 93 | ||||
-rw-r--r-- | drivers/dma/dmatest.c | 40 | ||||
-rw-r--r-- | drivers/dma/ioat/Makefile | 2 | ||||
-rw-r--r-- | drivers/dma/ioat/dma.c | 136 | ||||
-rw-r--r-- | drivers/dma/ioat/dma.h | 34 | ||||
-rw-r--r-- | drivers/dma/ioat/dma_v2.c | 129 | ||||
-rw-r--r-- | drivers/dma/ioat/dma_v2.h | 45 | ||||
-rw-r--r-- | drivers/dma/ioat/dma_v3.c | 1220 | ||||
-rw-r--r-- | drivers/dma/ioat/hw.h | 142 | ||||
-rw-r--r-- | drivers/dma/ioat/pci.c | 45 | ||||
-rw-r--r-- | drivers/dma/ioat/registers.h | 20 | ||||
-rw-r--r-- | drivers/dma/iop-adma.c | 482 | ||||
-rw-r--r-- | drivers/md/Kconfig | 26 | ||||
-rw-r--r-- | drivers/md/raid5.c | 1494 | ||||
-rw-r--r-- | drivers/md/raid5.h | 28 |
16 files changed, 3146 insertions, 794 deletions
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 81e1020fb514..fe1f3717b1ff 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -17,11 +17,15 @@ if DMADEVICES comment "DMA Devices" +config ASYNC_TX_DISABLE_CHANNEL_SWITCH + bool + config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86 select DMA_ENGINE select DCA + select ASYNC_TX_DISABLE_CHANNEL_SWITCH help Enable support for the Intel(R) I/OAT DMA engine present in recent Intel Xeon chipsets. diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 562d182eae66..bd0b248de2cf 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -608,6 +608,40 @@ void dmaengine_put(void) } EXPORT_SYMBOL(dmaengine_put); +static bool device_has_all_tx_types(struct dma_device *device) +{ + /* A device that satisfies this test has channels that will never cause + * an async_tx channel switch event as all possible operation types can + * be handled. + */ + #ifdef CONFIG_ASYNC_TX_DMA + if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask)) + return false; + #endif + + #if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE) + if (!dma_has_cap(DMA_MEMCPY, device->cap_mask)) + return false; + #endif + + #if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE) + if (!dma_has_cap(DMA_MEMSET, device->cap_mask)) + return false; + #endif + + #if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE) + if (!dma_has_cap(DMA_XOR, device->cap_mask)) + return false; + #endif + + #if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE) + if (!dma_has_cap(DMA_PQ, device->cap_mask)) + return false; + #endif + + return true; +} + static int get_dma_id(struct dma_device *device) { int rc; @@ -644,8 +678,12 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_memcpy); BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor); - BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) && - !device->device_prep_dma_zero_sum); + BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) && + !device->device_prep_dma_xor_val); + BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) && + !device->device_prep_dma_pq); + BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) && + !device->device_prep_dma_pq_val); BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && !device->device_prep_dma_memset); BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && @@ -661,6 +699,12 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); + /* note: this only matters in the + * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case + */ + if (device_has_all_tx_types(device)) + dma_cap_set(DMA_ASYNC_TX, device->cap_mask); + idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); if (!idr_ref) return -ENOMEM; @@ -938,49 +982,24 @@ EXPORT_SYMBOL(dma_async_tx_descriptor_init); /* dma_wait_for_async_tx - spin wait for a transaction to complete * @tx: in-flight transaction to wait on - * - * This routine assumes that tx was obtained from a call to async_memcpy, - * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped - * and submitted). Walking the parent chain is only meant to cover for DMA - * drivers that do not implement the DMA_INTERRUPT capability and may race with - * the driver's descriptor cleanup routine. */ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { - enum dma_status status; - struct dma_async_tx_descriptor *iter; - struct dma_async_tx_descriptor *parent; + unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); if (!tx) return DMA_SUCCESS; - WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" - " %s\n", __func__, dma_chan_name(tx->chan)); - - /* poll through the dependency chain, return when tx is complete */ - do { - iter = tx; - - /* find the root of the unsubmitted dependency chain */ - do { - parent = iter->parent; - if (!parent) - break; - else - iter = parent; - } while (parent); - - /* there is a small window for ->parent == NULL and - * ->cookie == -EBUSY - */ - while (iter->cookie == -EBUSY) - cpu_relax(); - - status = dma_sync_wait(iter->chan, iter->cookie); - } while (status == DMA_IN_PROGRESS || (iter != tx)); - - return status; + while (tx->cookie == -EBUSY) { + if (time_after_eq(jiffies, dma_sync_wait_timeout)) { + pr_err("%s timeout waiting for descriptor submission\n", + __func__); + return DMA_ERROR; + } + cpu_relax(); + } + return dma_sync_wait(tx->chan, tx->cookie); } EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index d93017fc7872..a32a4cf7b1e0 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -48,6 +48,11 @@ module_param(xor_sources, uint, S_IRUGO); MODULE_PARM_DESC(xor_sources, "Number of xor source buffers (default: 3)"); +static unsigned int pq_sources = 3; +module_param(pq_sources, uint, S_IRUGO); +MODULE_PARM_DESC(pq_sources, + "Number of p+q source buffers (default: 3)"); + /* * Initialization patterns. All bytes in the source buffer has bit 7 * set, all bytes in the destination buffer has bit 7 cleared. @@ -232,6 +237,7 @@ static int dmatest_func(void *data) dma_cookie_t cookie; enum dma_status status; enum dma_ctrl_flags flags; + u8 pq_coefs[pq_sources]; int ret; int src_cnt; int dst_cnt; @@ -248,6 +254,11 @@ static int dmatest_func(void *data) else if (thread->type == DMA_XOR) { src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ dst_cnt = 1; + } else if (thread->type == DMA_PQ) { + src_cnt = pq_sources | 1; /* force odd to ensure dst = src */ + dst_cnt = 2; + for (i = 0; i < pq_sources; i++) + pq_coefs[i] = 1; } else goto err_srcs; @@ -283,6 +294,7 @@ static int dmatest_func(void *data) dma_addr_t dma_dsts[dst_cnt]; struct completion cmp; unsigned long tmo = msecs_to_jiffies(3000); + u8 align = 0; total_tests++; @@ -290,6 +302,18 @@ static int dmatest_func(void *data) src_off = dmatest_random() % (test_buf_size - len + 1); dst_off = dmatest_random() % (test_buf_size - len + 1); + /* honor alignment restrictions */ + if (thread->type == DMA_MEMCPY) + align = dev->copy_align; + else if (thread->type == DMA_XOR) + align = dev->xor_align; + else if (thread->type == DMA_PQ) + align = dev->pq_align; + + len = (len >> align) << align; + src_off = (src_off >> align) << align; + dst_off = (dst_off >> align) << align; + dmatest_init_srcs(thread->srcs, src_off, len); dmatest_init_dsts(thread->dsts, dst_off, len); @@ -306,6 +330,7 @@ static int dmatest_func(void *data) DMA_BIDIRECTIONAL); } + if (thread->type == DMA_MEMCPY) tx = dev->device_prep_dma_memcpy(chan, dma_dsts[0] + dst_off, @@ -316,6 +341,15 @@ static int dmatest_func(void *data) dma_dsts[0] + dst_off, dma_srcs, xor_sources, len, flags); + else if (thread->type == DMA_PQ) { + dma_addr_t dma_pq[dst_cnt]; + + for (i = 0; i < dst_cnt; i++) + dma_pq[i] = dma_dsts[i] + dst_off; + tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs, + pq_sources, pq_coefs, + len, flags); + } if (!tx) { for (i = 0; i < src_cnt; i++) @@ -459,6 +493,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty op = "copy"; else if (type == DMA_XOR) op = "xor"; + else if (type == DMA_PQ) + op = "pq"; else return -EINVAL; @@ -514,6 +550,10 @@ static int dmatest_add_channel(struct dma_chan *chan) cnt = dmatest_add_threads(dtc, DMA_XOR); thread_count += cnt > 0 ? cnt : 0; } + if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { + cnt = dmatest_add_threads(dtc, DMA_PQ); + thread_count += cnt > 0 ?: 0; + } pr_info("dmatest: Started %u threads using %s\n", thread_count, dma_chan_name(chan)); diff --git a/drivers/dma/ioat/Makefile b/drivers/dma/ioat/Makefile index 205a639e84df..8997d3fb9051 100644 --- a/drivers/dma/ioat/Makefile +++ b/drivers/dma/ioat/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o -ioatdma-objs := pci.o dma.o dma_v2.o dca.o +ioatdma-objs := pci.o dma.o dma_v2.o dma_v3.o dca.o diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 21527b89590c..c524d36d3c2e 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -263,6 +263,7 @@ static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx) if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state)) mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); + ioat->active += desc->hw->tx_cnt; ioat->pending += desc->hw->tx_cnt; if (ioat->pending >= ioat_pending_level) __ioat1_dma_memcpy_issue_pending(ioat); @@ -539,17 +540,6 @@ static void ioat1_cleanup_tasklet(unsigned long data) writew(IOAT_CHANCTRL_RUN, chan->base.reg_base + IOAT_CHANCTRL_OFFSET); } -static void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len, - int direction, enum dma_ctrl_flags flags, bool dst) -{ - if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) || - (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE))) - pci_unmap_single(pdev, addr, len, direction); - else - pci_unmap_page(pdev, addr, len, direction); -} - - void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, size_t len, struct ioat_dma_descriptor *hw) { @@ -623,6 +613,7 @@ static void __cleanup(struct ioat_dma_chan *ioat, unsigned long phys_complete) chan->completed_cookie = tx->cookie; tx->cookie = 0; ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw); + ioat->active -= desc->hw->tx_cnt; if (tx->callback) { tx->callback(tx->callback_param); tx->callback = NULL; @@ -809,7 +800,7 @@ static void __devinit ioat_dma_test_callback(void *dma_async_param) * ioat_dma_self_test - Perform a IOAT transaction to verify the HW works. * @device: device to be tested */ -static int __devinit ioat_dma_self_test(struct ioatdma_device *device) +int __devinit ioat_dma_self_test(struct ioatdma_device *device) { int i; u8 *src; @@ -1040,13 +1031,8 @@ int __devinit ioat_probe(struct ioatdma_device *device) dma_cap_set(DMA_MEMCPY, dma->cap_mask); dma->dev = &pdev->dev; - dev_err(dev, "Intel(R) I/OAT DMA Engine found," - " %d channels, device version 0x%02x, driver version %s\n", - dma->chancnt, device->version, IOAT_DMA_VERSION); - if (!dma->chancnt) { - dev_err(dev, "Intel(R) I/OAT DMA Engine problem found: " - "zero channels detected\n"); + dev_err(dev, "zero channels detected\n"); goto err_setup_interrupts; } @@ -1054,7 +1040,7 @@ int __devinit ioat_probe(struct ioatdma_device *device) if (err) goto err_setup_interrupts; - err = ioat_dma_self_test(device); + err = device->self_test(device); if (err) goto err_self_test; @@ -1097,6 +1083,113 @@ static void ioat1_intr_quirk(struct ioatdma_device *device) pci_write_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, dmactrl); } +static ssize_t ring_size_show(struct dma_chan *c, char *page) +{ + struct ioat_dma_chan *ioat = to_ioat_chan(c); + + return sprintf(page, "%d\n", ioat->desccount); +} +static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size); + +static ssize_t ring_active_show(struct dma_chan *c, char *page) +{ + struct ioat_dma_chan *ioat = to_ioat_chan(c); + + return sprintf(page, "%d\n", ioat->active); +} +static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active); + +static ssize_t cap_show(struct dma_chan *c, char *page) +{ + struct dma_device *dma = c->device; + + return sprintf(page, "copy%s%s%s%s%s%s\n", + dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "", + dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "", + dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "", + dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "", + dma_has_cap(DMA_MEMSET, dma->cap_mask) ? " fill" : "", + dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : ""); + +} +struct ioat_sysfs_entry ioat_cap_attr = __ATTR_RO(cap); + +static ssize_t version_show(struct dma_chan *c, char *page) +{ + struct dma_device *dma = c->device; + struct ioatdma_device *device = to_ioatdma_device(dma); + + return sprintf(page, "%d.%d\n", + device->version >> 4, device->version & 0xf); +} +struct ioat_sysfs_entry ioat_version_attr = __ATTR_RO(version); + +static struct attribute *ioat1_attrs[] = { + &ring_size_attr.attr, + &ring_active_attr.attr, + &ioat_cap_attr.attr, + &ioat_version_attr.attr, + NULL, +}; + +static ssize_t +ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct ioat_sysfs_entry *entry; + struct ioat_chan_common *chan; + + entry = container_of(attr, struct ioat_sysfs_entry, attr); + chan = container_of(kobj, struct ioat_chan_common, kobj); + + if (!entry->show) + return -EIO; + return entry->show(&chan->common, page); +} + +struct sysfs_ops ioat_sysfs_ops = { + .show = ioat_attr_show, +}; + +static struct kobj_type ioat1_ktype = { + .sysfs_ops = &ioat_sysfs_ops, + .default_attrs = ioat1_attrs, +}; + +void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type) +{ + struct dma_device *dma = &device->common; + struct dma_chan *c; + + list_for_each_entry(c, &dma->channels, device_node) { + struct ioat_chan_common *chan = to_chan_common(c); + struct kobject *parent = &c->dev->device.kobj; + int err; + + err = kobject_init_and_add(&chan->kobj, type, parent, "quickdata"); + if (err) { + dev_warn(to_dev(chan), + "sysfs init error (%d), continuing...\n", err); + kobject_put(&chan->kobj); + set_bit(IOAT_KOBJ_INIT_FAIL, &chan->state); + } + } +} + +void ioat_kobject_del(struct ioatdma_device *device) +{ + struct dma_device *dma = &device->common; + struct dma_chan *c; + + list_for_each_entry(c, &dma->channels, device_node) { + struct ioat_chan_common *chan = to_chan_common(c); + + if (!test_bit(IOAT_KOBJ_INIT_FAIL, &chan->state)) { + kobject_del(&chan->kobj); + kobject_put(&chan->kobj); + } + } +} + int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca) { struct pci_dev *pdev = device->pdev; @@ -1105,6 +1198,7 @@ int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca) device->intr_quirk = ioat1_intr_quirk; device->enumerate_channels = ioat1_enumerate_channels; + device->self_test = ioat_dma_self_test; dma = &device->common; dma->device_prep_dma_memcpy = ioat1_dma_prep_memcpy; dma->device_issue_pending = ioat1_dma_memcpy_issue_pending; @@ -1119,6 +1213,8 @@ int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca) err = ioat_register(device); if (err) return err; + ioat_kobject_add(device, &ioat1_ktype); + if (dca) device->dca = ioat_dca_init(pdev, device->reg_base); @@ -1131,6 +1227,8 @@ void __devexit ioat_dma_remove(struct ioatdma_device *device) ioat_disable_interrupts(device); + ioat_kobject_del(device); + dma_async_device_unregister(dma); pci_pool_destroy(device->dma_pool); diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index 8966fa5453a7..6a675a2a2d1c 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -60,8 +60,12 @@ * @dca: direct cache access context * @intr_quirk: interrupt setup quirk (for ioat_v1 devices) * @enumerate_channels: hw version specific channel enumeration + * @cleanup_tasklet: select between the v2 and v3 cleanup routines + * @timer_fn: select between the v2 and v3 timer watchdog routines + * @self_test: hardware version specific self test for each supported op type + * + * Note: the v3 cleanup routine supports raid operations */ - struct ioatdma_device { struct pci_dev *pdev; void __iomem *reg_base; @@ -74,6 +78,9 @@ struct ioatdma_device { struct dca_provider *dca; void (*intr_quirk)(struct ioatdma_device *device); int (*enumerate_channels)(struct ioatdma_device *device); + void (*cleanup_tasklet)(unsigned long data); + void (*timer_fn)(unsigned long data); + int (*self_test)(struct ioatdma_device *device); }; struct ioat_chan_common { @@ -86,6 +93,7 @@ struct ioat_chan_common { #define IOAT_COMPLETION_PENDING 0 #define IOAT_COMPLETION_ACK 1 #define IOAT_RESET_PENDING 2 + #define IOAT_KOBJ_INIT_FAIL 3 struct timer_list timer; #define COMPLETION_TIMEOUT msecs_to_jiffies(100) #define IDLE_TIMEOUT msecs_to_jiffies(2000) @@ -94,8 +102,13 @@ struct ioat_chan_common { dma_addr_t completion_dma; u64 *completion; struct tasklet_struct cleanup_task; + struct kobject kobj; }; +struct ioat_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct dma_chan *, char *); +}; /** * struct ioat_dma_chan - internal representation of a DMA channel @@ -111,6 +124,7 @@ struct ioat_dma_chan { int pending; u16 desccount; + u16 active; }; static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c) @@ -155,7 +169,7 @@ ioat_is_complete(struct dma_chan *c, dma_cookie_t cookie, /** * struct ioat_desc_sw - wrapper around hardware descriptor - * @hw: hardware DMA descriptor + * @hw: hardware DMA descriptor (for memcpy) * @node: this descriptor will either be on the free list, * or attached to a transaction list (tx_list) * @txd: the generic software descriptor for all engines @@ -288,9 +302,20 @@ static inline bool is_ioat_bug(unsigned long err) IOAT_CHANERR_LENGTH_ERR)); } +static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len, + int direction, enum dma_ctrl_flags flags, bool dst) +{ + if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) || + (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE))) + pci_unmap_single(pdev, addr, len, direction); + else + pci_unmap_page(pdev, addr, len, direction); +} + int __devinit ioat_probe(struct ioatdma_device *device); int __devinit ioat_register(struct ioatdma_device *device); int __devinit ioat1_dma_probe(struct ioatdma_device *dev, int dca); +int __devinit ioat_dma_self_test(struct ioatdma_device *device); void __devexit ioat_dma_remove(struct ioatdma_device *device); struct dca_provider * __devinit ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase); @@ -304,4 +329,9 @@ void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, size_t len, struct ioat_dma_descriptor *hw); bool ioat_cleanup_preamble(struct ioat_chan_common *chan, unsigned long *phys_complete); +void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type); +void ioat_kobject_del(struct ioatdma_device *device); +extern struct sysfs_ops ioat_sysfs_ops; +extern struct ioat_sysfs_entry ioat_version_attr; +extern struct ioat_sysfs_entry ioat_cap_attr; #endif /* IOATDMA_H */ diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index fa3d6db6624c..5d6ac49e0d32 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -39,7 +39,7 @@ #include "registers.h" #include "hw.h" -static int ioat_ring_alloc_order = 8; +int ioat_ring_alloc_order = 8; module_param(ioat_ring_alloc_order, int, 0644); MODULE_PARM_DESC(ioat_ring_alloc_order, "ioat2+: allocate 2^n descriptors per channel (default: n=8)"); @@ -48,7 +48,7 @@ module_param(ioat_ring_max_alloc_order, int, 0644); MODULE_PARM_DESC(ioat_ring_max_alloc_order, "ioat2+: upper limit for dynamic ring resizing (default: n=16)"); -static void __ioat2_issue_pending(struct ioat2_dma_chan *ioat) +void __ioat2_issue_pending(struct ioat2_dma_chan *ioat) { void * __iomem reg_base = ioat->base.reg_base; @@ -63,7 +63,7 @@ static void __ioat2_issue_pending(struct ioat2_dma_chan *ioat) __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount); } -static void ioat2_issue_pending(struct dma_chan *chan) +void ioat2_issue_pending(struct dma_chan *chan) { struct ioat2_dma_chan *ioat = to_ioat2_chan(chan); @@ -206,7 +206,7 @@ static void ioat2_cleanup(struct ioat2_dma_chan *ioat) spin_unlock_bh(&chan->cleanup_lock); } -static void ioat2_cleanup_tasklet(unsigned long data) +void ioat2_cleanup_tasklet(unsigned long data) { struct ioat2_dma_chan *ioat = (void *) data; @@ -214,7 +214,7 @@ static void ioat2_cleanup_tasklet(unsigned long data) writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); } -static void __restart_chan(struct ioat2_dma_chan *ioat) +void __ioat2_restart_chan(struct ioat2_dma_chan *ioat) { struct ioat_chan_common *chan = &ioat->base; @@ -255,12 +255,10 @@ static void ioat2_restart_channel(struct ioat2_dma_chan *ioat) if (ioat_cleanup_preamble(chan, &phys_complete)) __cleanup(ioat, phys_complete); - __restart_chan(ioat); + __ioat2_restart_chan(ioat); } -static bool reshape_ring(struct ioat2_dma_chan *ioat, int order); - -static void ioat2_timer_event(unsigned long data) +void ioat2_timer_event(unsigned long data) { struct ioat2_dma_chan *ioat = (void *) data; struct ioat_chan_common *chan = &ioat->base; @@ -321,7 +319,7 @@ static void ioat2_timer_event(unsigned long data) * ioat2_enumerate_channels - find and initialize the device's channels * @device: the device to be enumerated */ -static int ioat2_enumerate_channels(struct ioatdma_device *device) +int ioat2_enumerate_channels(struct ioatdma_device *device) { struct ioat2_dma_chan *ioat; struct device *dev = &device->pdev->dev; @@ -354,8 +352,8 @@ static int ioat2_enumerate_channels(struct ioatdma_device *device) break; ioat_init_channel(device, &ioat->base, i, - ioat2_timer_event, - ioat2_cleanup_tasklet, + device->timer_fn, + device->cleanup_tasklet, (unsigned long) ioat); ioat->xfercap_log = xfercap_log; spin_lock_init(&ioat->ring_lock); @@ -461,7 +459,7 @@ static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gf /* ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring * @chan: channel to be initialized */ -static int ioat2_alloc_chan_resources(struct dma_chan *c) +int ioat2_alloc_chan_resources(struct dma_chan *c) { struct ioat2_dma_chan *ioat = to_ioat2_chan(c); struct ioat_chan_common *chan = &ioat->base; @@ -515,7 +513,7 @@ static int ioat2_alloc_chan_resources(struct dma_chan *c) return 1 << ioat->alloc_order; } -static bool reshape_ring(struct ioat2_dma_chan *ioat, int order) +bool reshape_ring(struct ioat2_dma_chan *ioat, int order) { /* reshape differs from normal ring allocation in that we want * to allocate a new software ring while only @@ -628,7 +626,7 @@ static bool reshape_ring(struct ioat2_dma_chan *ioat, int order) * @ioat: ioat2,3 channel (ring) to operate on * @num_descs: allocation length */ -static int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs) +int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs) { struct ioat_chan_common *chan = &ioat->base; @@ -656,9 +654,11 @@ static int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_d spin_lock_bh(&chan->cleanup_lock); if (jiffies > chan->timer.expires && timer_pending(&chan->timer)) { + struct ioatdma_device *device = chan->device; + mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); spin_unlock_bh(&chan->cleanup_lock); - ioat2_timer_event((unsigned long) ioat); + device->timer_fn((unsigned long) ioat); } else spin_unlock_bh(&chan->cleanup_lock); return -ENOMEM; @@ -671,7 +671,7 @@ static int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_d return 0; /* with ioat->ring_lock held */ } -static struct dma_async_tx_descriptor * +struct dma_async_tx_descriptor * ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, dma_addr_t dma_src, size_t len, unsigned long flags) { @@ -711,6 +711,7 @@ ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, desc->txd.flags = flags; desc->len = total_len; hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); + hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE); hw->ctl_f.compl_write = 1; dump_desc_dbg(ioat, desc); /* we leave the channel locked to ensure in order submission */ @@ -722,11 +723,11 @@ ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, * ioat2_free_chan_resources - release all the descriptors * @chan: the channel to be cleaned */ -static void ioat2_free_chan_resources(struct dma_chan *c) +void ioat2_free_chan_resources(struct dma_chan *c) { struct ioat2_dma_chan *ioat = to_ioat2_chan(c); struct ioat_chan_common *chan = &ioat->base; - struct ioatdma_device *ioatdma_device = chan->device; + struct ioatdma_device *device = chan->device; struct ioat_ring_ent *desc; const u16 total_descs = 1 << ioat->alloc_order; int descs; @@ -740,7 +741,7 @@ static void ioat2_free_chan_resources(struct dma_chan *c) tasklet_disable(&chan->cleanup_task); del_timer_sync(&chan->timer); - ioat2_cleanup(ioat); + device->cleanup_tasklet((unsigned long) ioat); /* Delay 100ms after reset to allow internal DMA logic to quiesce * before removing DMA descriptor resources. @@ -770,8 +771,7 @@ static void ioat2_free_chan_resources(struct dma_chan *c) kfree(ioat->ring); ioat->ring = NULL; ioat->alloc_order = 0; - pci_pool_free(ioatdma_device->completion_pool, - chan->completion, + pci_pool_free(device->completion_pool, chan->completion, chan->completion_dma); spin_unlock_bh(&ioat->ring_lock); @@ -781,66 +781,63 @@ static void ioat2_free_chan_resources(struct dma_chan *c) ioat->dmacount = 0; } -static enum dma_status +enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie, dma_cookie_t *done, dma_cookie_t *used) { struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + struct ioatdma_device *device = ioat->base.device; if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) return DMA_SUCCESS; - ioat2_cleanup(ioat); + device->cleanup_tasklet((unsigned long) ioat); return ioat_is_complete(c, cookie, done, used); } -int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca) +static ssize_t ring_size_show(struct dma_chan *c, char *page) { - struct pci_dev *pdev = device->pdev; - struct dma_device *dma; - struct dma_chan *c; - struct ioat_chan_common *chan; - int err; + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - device->enumerate_channels = ioat2_enumerate_channels; - dma = &device->common; - dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock; - dma->device_issue_pending = ioat2_issue_pending; - dma->device_alloc_chan_resources = ioat2_alloc_chan_resources; - dma->device_free_chan_resources = ioat2_free_chan_resources; - dma->device_is_tx_complete = ioat2_is_complete; + return sprintf(page, "%d\n", (1 << ioat->alloc_order) & ~1); +} +static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size); - err = ioat_probe(device); - if (err) - return err; - ioat_set_tcp_copy_break(2048); +static ssize_t ring_active_show(struct dma_chan *c, char *page) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); - list_for_each_entry(c, &dma->channels, device_node) { - chan = to_chan_common(c); - writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU, - chan->reg_base + IOAT_DCACTRL_OFFSET); - } + /* ...taken outside the lock, no need to be precise */ + return sprintf(page, "%d\n", ioat2_ring_active(ioat)); +} +static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active); - err = ioat_register(device); - if (err) - return err; - if (dca) - device->dca = ioat2_dca_init(pdev, device->reg_base); +static struct attribute *ioat2_attrs[] = { + &ring_size_attr.attr, + &ring_active_attr.attr, + &ioat_cap_attr.attr, + &ioat_version_attr.attr, + NULL, +}; - return err; -} +struct kobj_type ioat2_ktype = { + .sysfs_ops = &ioat_sysfs_ops, + .default_attrs = ioat2_attrs, +}; -int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca) +int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca) { struct pci_dev *pdev = device->pdev; struct dma_device *dma; struct dma_chan *c; struct ioat_chan_common *chan; int err; - u16 dev_id; device->enumerate_channels = ioat2_enumerate_channels; + device->cleanup_tasklet = ioat2_cleanup_tasklet; + device->timer_fn = ioat2_timer_event; + device->self_test = ioat_dma_self_test; dma = &device->common; dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock; dma->device_issue_pending = ioat2_issue_pending; @@ -848,35 +845,25 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca) dma->device_free_chan_resources = ioat2_free_chan_resources; dma->device_is_tx_complete = ioat2_is_complete; - /* -= IOAT ver.3 workarounds =- */ - /* Write CHANERRMSK_INT with 3E07h to mask out the errors - * that can cause stability issues for IOAT ver.3 - */ - pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07); - - /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit - * (workaround for spurious config parity error after restart) - */ - pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id); - if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) - pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10); - err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(262144); + ioat_set_tcp_copy_break(2048); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); - writel(IOAT_DMA_DCA_ANY_CPU, + writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU, chan->reg_base + IOAT_DCACTRL_OFFSET); } err = ioat_register(device); if (err) return err; + + ioat_kobject_add(device, &ioat2_ktype); + if (dca) - device->dca = ioat3_dca_init(pdev, device->reg_base); + device->dca = ioat2_dca_init(pdev, device->reg_base); return err; } diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h index ac00adc81974..1d849ef74d5f 100644 --- a/drivers/dma/ioat/dma_v2.h +++ b/drivers/dma/ioat/dma_v2.h @@ -27,6 +27,7 @@ extern int ioat_pending_level; +extern int ioat_ring_alloc_order; /* * workaround for IOAT ver.3.0 null descriptor issue @@ -114,10 +115,36 @@ static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len return num_descs; } +/** + * struct ioat_ring_ent - wrapper around hardware descriptor + * @hw: hardware DMA descriptor (for memcpy) + * @fill: hardware fill descriptor + * @xor: hardware xor descriptor + * @xor_ex: hardware xor extension descriptor + * @pq: hardware pq descriptor + * @pq_ex: hardware pq extension descriptor + * @pqu: hardware pq update descriptor + * @raw: hardware raw (un-typed) descriptor + * @txd: the generic software descriptor for all engines + * @len: total transaction length for unmap + * @result: asynchronous result of validate operations + * @id: identifier for debug + */ + struct ioat_ring_ent { - struct ioat_dma_descriptor *hw; + union { + struct ioat_dma_descriptor *hw; + struct ioat_fill_descriptor *fill; + struct ioat_xor_descriptor *xor; + struct ioat_xor_ext_descriptor *xor_ex; + struct ioat_pq_descriptor *pq; + struct ioat_pq_ext_descriptor *pq_ex; + struct ioat_pq_update_descriptor *pqu; + struct ioat_raw_descriptor *raw; + }; size_t len; struct dma_async_tx_descriptor txd; + enum sum_check_flags *result; #ifdef DEBUG int id; #endif @@ -143,5 +170,21 @@ int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca); int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca); struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase); struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase); +int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs); +int ioat2_enumerate_channels(struct ioatdma_device *device); +struct dma_async_tx_descriptor * +ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, + dma_addr_t dma_src, size_t len, unsigned long flags); +void ioat2_issue_pending(struct dma_chan *chan); +int ioat2_alloc_chan_resources(struct dma_chan *c); +void ioat2_free_chan_resources(struct dma_chan *c); +enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie, + dma_cookie_t *done, dma_cookie_t *used); +void __ioat2_restart_chan(struct ioat2_dma_chan *ioat); +bool reshape_ring(struct ioat2_dma_chan *ioat, int order); +void __ioat2_issue_pending(struct ioat2_dma_chan *ioat); +void ioat2_cleanup_tasklet(unsigned long data); +void ioat2_timer_event(unsigned long data); +extern struct kobj_type ioat2_ktype; extern struct kmem_cache *ioat2_cache; #endif /* IOATDMA_V2_H */ diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c new file mode 100644 index 000000000000..3686dddf6bff --- /dev/null +++ b/drivers/dma/ioat/dma_v3.c @@ -0,0 +1,1220 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * BSD LICENSE + * + * Copyright(c) 2004-2009 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Support routines for v3+ hardware + */ + +#include <linux/pci.h> +#include <linux/dmaengine.h> +#include <linux/dma-mapping.h> +#include "registers.h" +#include "hw.h" +#include "dma.h" +#include "dma_v2.h" + +/* ioat hardware assumes at least two sources for raid operations */ +#define src_cnt_to_sw(x) ((x) + 2) +#define src_cnt_to_hw(x) ((x) - 2) + +/* provide a lookup table for setting the source address in the base or + * extended descriptor of an xor or pq descriptor + */ +static const u8 xor_idx_to_desc __read_mostly = 0xd0; +static const u8 xor_idx_to_field[] __read_mostly = { 1, 4, 5, 6, 7, 0, 1, 2 }; +static const u8 pq_idx_to_desc __read_mostly = 0xf8; +static const u8 pq_idx_to_field[] __read_mostly = { 1, 4, 5, 0, 1, 2, 4, 5 }; + +static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx) +{ + struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1]; + + return raw->field[xor_idx_to_field[idx]]; +} + +static void xor_set_src(struct ioat_raw_descriptor *descs[2], + dma_addr_t addr, u32 offset, int idx) +{ + struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1]; + + raw->field[xor_idx_to_field[idx]] = addr + offset; +} + +static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx) +{ + struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1]; + + return raw->field[pq_idx_to_field[idx]]; +} + +static void pq_set_src(struct ioat_raw_descriptor *descs[2], + dma_addr_t addr, u32 offset, u8 coef, int idx) +{ + struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *) descs[0]; + struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1]; + + raw->field[pq_idx_to_field[idx]] = addr + offset; + pq->coef[idx] = coef; +} + +static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat, + struct ioat_ring_ent *desc, int idx) +{ + struct ioat_chan_common *chan = &ioat->base; + struct pci_dev *pdev = chan->device->pdev; + size_t len = desc->len; + size_t offset = len - desc->hw->size; + struct dma_async_tx_descriptor *tx = &desc->txd; + enum dma_ctrl_flags flags = tx->flags; + + switch (desc->hw->ctl_f.op) { + case IOAT_OP_COPY: + if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */ + ioat_dma_unmap(chan, flags, len, desc->hw); + break; + case IOAT_OP_FILL: { + struct ioat_fill_descriptor *hw = desc->fill; + + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) + ioat_unmap(pdev, hw->dst_addr - offset, len, + PCI_DMA_FROMDEVICE, flags, 1); + break; + } + case IOAT_OP_XOR_VAL: + case IOAT_OP_XOR: { + struct ioat_xor_descriptor *xor = desc->xor; + struct ioat_ring_ent *ext; + struct ioat_xor_ext_descriptor *xor_ex = NULL; + int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt); + struct ioat_raw_descriptor *descs[2]; + int i; + + if (src_cnt > 5) { + ext = ioat2_get_ring_ent(ioat, idx + 1); + xor_ex = ext->xor_ex; + } + + if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + descs[0] = (struct ioat_raw_descriptor *) xor; + descs[1] = (struct ioat_raw_descriptor *) xor_ex; + for (i = 0; i < src_cnt; i++) { + dma_addr_t src = xor_get_src(descs, i); + + ioat_unmap(pdev, src - offset, len, + PCI_DMA_TODEVICE, flags, 0); + } + + /* dest is a source in xor validate operations */ + if (xor->ctl_f.op == IOAT_OP_XOR_VAL) { + ioat_unmap(pdev, xor->dst_addr - offset, len, + PCI_DMA_TODEVICE, flags, 1); + break; + } + } + + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) + ioat_unmap(pdev, xor->dst_addr - offset, len, + PCI_DMA_FROMDEVICE, flags, 1); + break; + } + case IOAT_OP_PQ_VAL: + case IOAT_OP_PQ: { + struct ioat_pq_descriptor *pq = desc->pq; + struct ioat_ring_ent *ext; + struct ioat_pq_ext_descriptor *pq_ex = NULL; + int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt); + struct ioat_raw_descriptor *descs[2]; + int i; + + if (src_cnt > 3) { + ext = ioat2_get_ring_ent(ioat, idx + 1); + pq_ex = ext->pq_ex; + } + + /* in the 'continue' case don't unmap the dests as sources */ + if (dmaf_p_disabled_continue(flags)) + src_cnt--; + else if (dmaf_continue(flags)) + src_cnt -= 3; + + if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + descs[0] = (struct ioat_raw_descriptor *) pq; + descs[1] = (struct ioat_raw_descriptor *) pq_ex; + for (i = 0; i < src_cnt; i++) { + dma_addr_t src = pq_get_src(descs, i); + + ioat_unmap(pdev, src - offset, len, + PCI_DMA_TODEVICE, flags, 0); + } + + /* the dests are sources in pq validate operations */ + if (pq->ctl_f.op == IOAT_OP_XOR_VAL) { + if (!(flags & DMA_PREP_PQ_DISABLE_P)) + ioat_unmap(pdev, pq->p_addr - offset, + len, PCI_DMA_TODEVICE, flags, 0); + if (!(flags & DMA_PREP_PQ_DISABLE_Q)) + ioat_unmap(pdev, pq->q_addr - offset, + len, PCI_DMA_TODEVICE, flags, 0); + break; + } + } + + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (!(flags & DMA_PREP_PQ_DISABLE_P)) + ioat_unmap(pdev, pq->p_addr - offset, len, + PCI_DMA_BIDIRECTIONAL, flags, 1); + if (!(flags & DMA_PREP_PQ_DISABLE_Q)) + ioat_unmap(pdev, pq->q_addr - offset, len, + PCI_DMA_BIDIRECTIONAL, flags, 1); + } + break; + } + default: + dev_err(&pdev->dev, "%s: unknown op type: %#x\n", + __func__, desc->hw->ctl_f.op); + } +} + +static bool desc_has_ext(struct ioat_ring_ent *desc) +{ + struct ioat_dma_descriptor *hw = desc->hw; + + if (hw->ctl_f.op == IOAT_OP_XOR || + hw->ctl_f.op == IOAT_OP_XOR_VAL) { + struct ioat_xor_descriptor *xor = desc->xor; + + if (src_cnt_to_sw(xor->ctl_f.src_cnt) > 5) + return true; + } else if (hw->ctl_f.op == IOAT_OP_PQ || + hw->ctl_f.op == IOAT_OP_PQ_VAL) { + struct ioat_pq_descriptor *pq = desc->pq; + + if (src_cnt_to_sw(pq->ctl_f.src_cnt) > 3) + return true; + } + + return false; +} + +/** + * __cleanup - reclaim used descriptors + * @ioat: channel (ring) to clean + * + * The difference from the dma_v2.c __cleanup() is that this routine + * handles extended descriptors and dma-unmapping raid operations. + */ +static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) +{ + struct ioat_chan_common *chan = &ioat->base; + struct ioat_ring_ent *desc; + bool seen_current = false; + u16 active; + int i; + + dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n", + __func__, ioat->head, ioat->tail, ioat->issued); + + active = ioat2_ring_active(ioat); + for (i = 0; i < active && !seen_current; i++) { + struct dma_async_tx_descriptor *tx; + + prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1)); + desc = ioat2_get_ring_ent(ioat, ioat->tail + i); + dump_desc_dbg(ioat, desc); + tx = &desc->txd; + if (tx->cookie) { + chan->completed_cookie = tx->cookie; + ioat3_dma_unmap(ioat, desc, ioat->tail + i); + tx->cookie = 0; + if (tx->callback) { + tx->callback(tx->callback_param); + tx->callback = NULL; + } + } + + if (tx->phys == phys_complete) + seen_current = true; + + /* skip extended descriptors */ + if (desc_has_ext(desc)) { + BUG_ON(i + 1 >= active); + i++; + } + } + ioat->tail += i; + BUG_ON(!seen_current); /* no active descs have written a completion? */ + chan->last_completion = phys_complete; + if (ioat->head == ioat->tail) { + dev_dbg(to_dev(chan), "%s: cancel completion timeout\n", + __func__); + clear_bit(IOAT_COMPLETION_PENDING, &chan->state); + mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); + } +} + +static void ioat3_cleanup(struct ioat2_dma_chan *ioat) +{ + struct ioat_chan_common *chan = &ioat->base; + unsigned long phys_complete; + + prefetch(chan->completion); + + if (!spin_trylock_bh(&chan->cleanup_lock)) + return; + + if (!ioat_cleanup_preamble(chan, &phys_complete)) { + spin_unlock_bh(&chan->cleanup_lock); + return; + } + + if (!spin_trylock_bh(&ioat->ring_lock)) { + spin_unlock_bh(&chan->cleanup_lock); + return; + } + + __cleanup(ioat, phys_complete); + + spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&chan->cleanup_lock); +} + +static void ioat3_cleanup_tasklet(unsigned long data) +{ + struct ioat2_dma_chan *ioat = (void *) data; + + ioat3_cleanup(ioat); + writew(IOAT_CHANCTRL_RUN | IOAT3_CHANCTRL_COMPL_DCA_EN, + ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); +} + +static void ioat3_restart_channel(struct ioat2_dma_chan *ioat) +{ + struct ioat_chan_common *chan = &ioat->base; + unsigned long phys_complete; + u32 status; + + status = ioat_chansts(chan); + if (is_ioat_active(status) || is_ioat_idle(status)) + ioat_suspend(chan); + while (is_ioat_active(status) || is_ioat_idle(status)) { + status = ioat_chansts(chan); + cpu_relax(); + } + + if (ioat_cleanup_preamble(chan, &phys_complete)) + __cleanup(ioat, phys_complete); + + __ioat2_restart_chan(ioat); +} + +static void ioat3_timer_event(unsigned long data) +{ + struct ioat2_dma_chan *ioat = (void *) data; + struct ioat_chan_common *chan = &ioat->base; + + spin_lock_bh(&chan->cleanup_lock); + if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { + unsigned long phys_complete; + u64 status; + + spin_lock_bh(&ioat->ring_lock); + status = ioat_chansts(chan); + + /* when halted due to errors check for channel + * programming errors before advancing the completion state + */ + if (is_ioat_halted(status)) { + u32 chanerr; + + chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET); + BUG_ON(is_ioat_bug(chanerr)); + } + + /* if we haven't made progress and we have already + * acknowledged a pending completion once, then be more + * forceful with a restart + */ + if (ioat_cleanup_preamble(chan, &phys_complete)) + __cleanup(ioat, phys_complete); + else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) + ioat3_restart_channel(ioat); + else { + set_bit(IOAT_COMPLETION_ACK, &chan->state); + mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); + } + spin_unlock_bh(&ioat->ring_lock); + } else { + u16 active; + + /* if the ring is idle, empty, and oversized try to step + * down the size + */ + spin_lock_bh(&ioat->ring_lock); + active = ioat2_ring_active(ioat); + if (active == 0 && ioat->alloc_order > ioat_get_alloc_order()) + reshape_ring(ioat, ioat->alloc_order-1); + spin_unlock_bh(&ioat->ring_lock); + + /* keep shrinking until we get back to our minimum + * default size + */ + if (ioat->alloc_order > ioat_get_alloc_order()) + mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); + } + spin_unlock_bh(&chan->cleanup_lock); +} + +static enum dma_status +ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie, + dma_cookie_t *done, dma_cookie_t *used) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + + if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) + return DMA_SUCCESS; + + ioat3_cleanup(ioat); + + return ioat_is_complete(c, cookie, done, used); +} + +static struct dma_async_tx_descriptor * +ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value, + size_t len, unsigned long flags) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + struct ioat_ring_ent *desc; + size_t total_len = len; + struct ioat_fill_descriptor *fill; + int num_descs; + u64 src_data = (0x0101010101010101ULL) * (value & 0xff); + u16 idx; + int i; + + num_descs = ioat2_xferlen_to_descs(ioat, len); + if (likely(num_descs) && + ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0) + /* pass */; + else + return NULL; + for (i = 0; i < num_descs; i++) { + size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); + + desc = ioat2_get_ring_ent(ioat, idx + i); + fill = desc->fill; + + fill->size = xfer_size; + fill->src_data = src_data; + fill->dst_addr = dest; + fill->ctl = 0; + fill->ctl_f.op = IOAT_OP_FILL; + + len -= xfer_size; + dest += xfer_size; + dump_desc_dbg(ioat, desc); + } + + desc->txd.flags = flags; + desc->len = total_len; + fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); + fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE); + fill->ctl_f.compl_write = 1; + dump_desc_dbg(ioat, desc); + + /* we leave the channel locked to ensure in order submission */ + return &desc->txd; +} + +static struct dma_async_tx_descriptor * +__ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result, + dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt, + size_t len, unsigned long flags) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + struct ioat_ring_ent *compl_desc; + struct ioat_ring_ent *desc; + struct ioat_ring_ent *ext; + size_t total_len = len; + struct ioat_xor_descriptor *xor; + struct ioat_xor_ext_descriptor *xor_ex = NULL; + struct ioat_dma_descriptor *hw; + u32 offset = 0; + int num_descs; + int with_ext; + int i; + u16 idx; + u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR; + + BUG_ON(src_cnt < 2); + + num_descs = ioat2_xferlen_to_descs(ioat, len); + /* we need 2x the number of descriptors to cover greater than 5 + * sources + */ + if (src_cnt > 5) { + with_ext = 1; + num_descs *= 2; + } else + with_ext = 0; + + /* completion writes from the raid engine may pass completion + * writes from the legacy engine, so we need one extra null + * (legacy) descriptor to ensure all completion writes arrive in + * order. + */ + if (likely(num_descs) && + ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) + /* pass */; + else + return NULL; + for (i = 0; i < num_descs; i += 1 + with_ext) { + struct ioat_raw_descriptor *descs[2]; + size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); + int s; + + desc = ioat2_get_ring_ent(ioat, idx + i); + xor = desc->xor; + + /* save a branch by unconditionally retrieving the + * extended descriptor xor_set_src() knows to not write + * to it in the single descriptor case + */ + ext = ioat2_get_ring_ent(ioat, idx + i + 1); + xor_ex = ext->xor_ex; + + descs[0] = (struct ioat_raw_descriptor *) xor; + descs[1] = (struct ioat_raw_descriptor *) xor_ex; + for (s = 0; s < src_cnt; s++) + xor_set_src(descs, src[s], offset, s); + xor->size = xfer_size; + xor->dst_addr = dest + offset; + xor->ctl = 0; + xor->ctl_f.op = op; + xor->ctl_f.src_cnt = src_cnt_to_hw(src_cnt); + + len -= xfer_size; + offset += xfer_size; + dump_desc_dbg(ioat, desc); + } + + /* last xor descriptor carries the unmap parameters and fence bit */ + desc->txd.flags = flags; + desc->len = total_len; + if (result) + desc->result = result; + xor->ctl_f.fence = !!(flags & DMA_PREP_FENCE); + + /* completion descriptor carries interrupt bit */ + compl_desc = ioat2_get_ring_ent(ioat, idx + i); + compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT; + hw = compl_desc->hw; + hw->ctl = 0; + hw->ctl_f.null = 1; + hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); + hw->ctl_f.compl_write = 1; + hw->size = NULL_DESC_BUFFER_SIZE; + dump_desc_dbg(ioat, compl_desc); + + /* we leave the channel locked to ensure in order submission */ + return &desc->txd; +} + +static struct dma_async_tx_descriptor * +ioat3_prep_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, + unsigned int src_cnt, size_t len, unsigned long flags) +{ + return __ioat3_prep_xor_lock(chan, NULL, dest, src, src_cnt, len, flags); +} + +struct dma_async_tx_descriptor * +ioat3_prep_xor_val(struct dma_chan *chan, dma_addr_t *src, + unsigned int src_cnt, size_t len, + enum sum_check_flags *result, unsigned long flags) +{ + /* the cleanup routine only sets bits on validate failure, it + * does not clear bits on validate success... so clear it here + */ + *result = 0; + + return __ioat3_prep_xor_lock(chan, result, src[0], &src[1], + src_cnt - 1, len, flags); +} + +static void +dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct ioat_ring_ent *ext) +{ + struct device *dev = to_dev(&ioat->base); + struct ioat_pq_descriptor *pq = desc->pq; + struct ioat_pq_ext_descriptor *pq_ex = ext ? ext->pq_ex : NULL; + struct ioat_raw_descriptor *descs[] = { (void *) pq, (void *) pq_ex }; + int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt); + int i; + + dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x" + " sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n", + desc_id(desc), (unsigned long long) desc->txd.phys, + (unsigned long long) (pq_ex ? pq_ex->next : pq->next), + desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en, + pq->ctl_f.compl_write, + pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? "" : "q", + pq->ctl_f.src_cnt); + for (i = 0; i < src_cnt; i++) + dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i, + (unsigned long long) pq_get_src(descs, i), pq->coef[i]); + dev_dbg(dev, "\tP: %#llx\n", pq->p_addr); + dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr); +} + +static struct dma_async_tx_descriptor * +__ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result, + const dma_addr_t *dst, const dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, + size_t len, unsigned long flags) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + struct ioat_chan_common *chan = &ioat->base; + struct ioat_ring_ent *compl_desc; + struct ioat_ring_ent *desc; + struct ioat_ring_ent *ext; + size_t total_len = len; + struct ioat_pq_descriptor *pq; + struct ioat_pq_ext_descriptor *pq_ex = NULL; + struct ioat_dma_descriptor *hw; + u32 offset = 0; + int num_descs; + int with_ext; + int i, s; + u16 idx; + u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ; + + dev_dbg(to_dev(chan), "%s\n", __func__); + /* the engine requires at least two sources (we provide + * at least 1 implied source in the DMA_PREP_CONTINUE case) + */ + BUG_ON(src_cnt + dmaf_continue(flags) < 2); + + num_descs = ioat2_xferlen_to_descs(ioat, len); + /* we need 2x the number of descriptors to cover greater than 3 + * sources + */ + if (src_cnt > 3 || flags & DMA_PREP_CONTINUE) { + with_ext = 1; + num_descs *= 2; + } else + with_ext = 0; + + /* completion writes from the raid engine may pass completion + * writes from the legacy engine, so we need one extra null + * (legacy) descriptor to ensure all completion writes arrive in + * order. + */ + if (likely(num_descs) && + ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) + /* pass */; + else + return NULL; + for (i = 0; i < num_descs; i += 1 + with_ext) { + struct ioat_raw_descriptor *descs[2]; + size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log); + + desc = ioat2_get_ring_ent(ioat, idx + i); + pq = desc->pq; + + /* save a branch by unconditionally retrieving the + * extended descriptor pq_set_src() knows to not write + * to it in the single descriptor case + */ + ext = ioat2_get_ring_ent(ioat, idx + i + with_ext); + pq_ex = ext->pq_ex; + + descs[0] = (struct ioat_raw_descriptor *) pq; + descs[1] = (struct ioat_raw_descriptor *) pq_ex; + + for (s = 0; s < src_cnt; s++) + pq_set_src(descs, src[s], offset, scf[s], s); + + /* see the comment for dma_maxpq in include/linux/dmaengine.h */ + if (dmaf_p_disabled_continue(flags)) + pq_set_src(descs, dst[1], offset, 1, s++); + else if (dmaf_continue(flags)) { + pq_set_src(descs, dst[0], offset, 0, s++); + pq_set_src(descs, dst[1], offset, 1, s++); + pq_set_src(descs, dst[1], offset, 0, s++); + } + pq->size = xfer_size; + pq->p_addr = dst[0] + offset; + pq->q_addr = dst[1] + offset; + pq->ctl = 0; + pq->ctl_f.op = op; + pq->ctl_f.src_cnt = src_cnt_to_hw(s); + pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P); + pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q); + + len -= xfer_size; + offset += xfer_size; + } + + /* last pq descriptor carries the unmap parameters and fence bit */ + desc->txd.flags = flags; + desc->len = total_len; + if (result) + desc->result = result; + pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE); + dump_pq_desc_dbg(ioat, desc, ext); + + /* completion descriptor carries interrupt bit */ + compl_desc = ioat2_get_ring_ent(ioat, idx + i); + compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT; + hw = compl_desc->hw; + hw->ctl = 0; + hw->ctl_f.null = 1; + hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT); + hw->ctl_f.compl_write = 1; + hw->size = NULL_DESC_BUFFER_SIZE; + dump_desc_dbg(ioat, compl_desc); + + /* we leave the channel locked to ensure in order submission */ + return &desc->txd; +} + +static struct dma_async_tx_descriptor * +ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, size_t len, + unsigned long flags) +{ + /* handle the single source multiply case from the raid6 + * recovery path + */ + if (unlikely((flags & DMA_PREP_PQ_DISABLE_P) && src_cnt == 1)) { + dma_addr_t single_source[2]; + unsigned char single_source_coef[2]; + + BUG_ON(flags & DMA_PREP_PQ_DISABLE_Q); + single_source[0] = src[0]; + single_source[1] = src[0]; + single_source_coef[0] = scf[0]; + single_source_coef[1] = 0; + + return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2, + single_source_coef, len, flags); + } else + return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf, + len, flags); +} + +struct dma_async_tx_descriptor * +ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, size_t len, + enum sum_check_flags *pqres, unsigned long flags) +{ + /* the cleanup routine only sets bits on validate failure, it + * does not clear bits on validate success... so clear it here + */ + *pqres = 0; + + return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len, + flags); +} + +static struct dma_async_tx_descriptor * +ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, + unsigned int src_cnt, size_t len, unsigned long flags) +{ + unsigned char scf[src_cnt]; + dma_addr_t pq[2]; + + memset(scf, 0, src_cnt); + flags |= DMA_PREP_PQ_DISABLE_Q; + pq[0] = dst; + pq[1] = ~0; + + return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len, + flags); +} + +struct dma_async_tx_descriptor * +ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src, + unsigned int src_cnt, size_t len, + enum sum_check_flags *result, unsigned long flags) +{ + unsigned char scf[src_cnt]; + dma_addr_t pq[2]; + + /* the cleanup routine only sets bits on validate failure, it + * does not clear bits on validate success... so clear it here + */ + *result = 0; + + memset(scf, 0, src_cnt); + flags |= DMA_PREP_PQ_DISABLE_Q; + pq[0] = src[0]; + pq[1] = ~0; + + return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf, + len, flags); +} + +static struct dma_async_tx_descriptor * +ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags) +{ + struct ioat2_dma_chan *ioat = to_ioat2_chan(c); + struct ioat_ring_ent *desc; + struct ioat_dma_descriptor *hw; + u16 idx; + + if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0) + desc = ioat2_get_ring_ent(ioat, idx); + else + return NULL; + + hw = desc->hw; + hw->ctl = 0; + hw->ctl_f.null = 1; + hw->ctl_f.int_en = 1; + hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE); + hw->ctl_f.compl_write = 1; + hw->size = NULL_DESC_BUFFER_SIZE; + hw->src_addr = 0; + hw->dst_addr = 0; + + desc->txd.flags = flags; + desc->len = 1; + + dump_desc_dbg(ioat, desc); + + /* we leave the channel locked to ensure in order submission */ + return &desc->txd; +} + +static void __devinit ioat3_dma_test_callback(void *dma_async_param) +{ + struct completion *cmp = dma_async_param; + + complete(cmp); +} + +#define IOAT_NUM_SRC_TEST 6 /* must be <= 8 */ +static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device) +{ + int i, src_idx; + struct page *dest; + struct page *xor_srcs[IOAT_NUM_SRC_TEST]; + struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1]; + dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1]; + dma_addr_t dma_addr, dest_dma; + struct dma_async_tx_descriptor *tx; + struct dma_chan *dma_chan; + dma_cookie_t cookie; + u8 cmp_byte = 0; + u32 cmp_word; + u32 xor_val_result; + int err = 0; + struct completion cmp; + unsigned long tmo; + struct device *dev = &device->pdev->dev; + struct dma_device *dma = &device->common; + + dev_dbg(dev, "%s\n", __func__); + + if (!dma_has_cap(DMA_XOR, dma->cap_mask)) + return 0; + + for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) { + xor_srcs[src_idx] = alloc_page(GFP_KERNEL); + if (!xor_srcs[src_idx]) { + while (src_idx--) + __free_page(xor_srcs[src_idx]); + return -ENOMEM; + } + } + + dest = alloc_page(GFP_KERNEL); + if (!dest) { + while (src_idx--) + __free_page(xor_srcs[src_idx]); + return -ENOMEM; + } + + /* Fill in src buffers */ + for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) { + u8 *ptr = page_address(xor_srcs[src_idx]); + for (i = 0; i < PAGE_SIZE; i++) + ptr[i] = (1 << src_idx); + } + + for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) + cmp_byte ^= (u8) (1 << src_idx); + + cmp_word = (cmp_byte << 24) | (cmp_byte << 16) | + (cmp_byte << 8) | cmp_byte; + + memset(page_address(dest), 0, PAGE_SIZE); + + dma_chan = container_of(dma->channels.next, struct dma_chan, + device_node); + if (dma->device_alloc_chan_resources(dma_chan) < 1) { + err = -ENODEV; + goto out; + } + + /* test xor */ + dest_dma = dma_map_page(dev, dest, 0, PAGE_SIZE, DMA_FROM_DEVICE); + for (i = 0; i < IOAT_NUM_SRC_TEST; i++) + dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs, + IOAT_NUM_SRC_TEST, PAGE_SIZE, + DMA_PREP_INTERRUPT); + + if (!tx) { + dev_err(dev, "Self-test xor prep failed\n"); + err = -ENODEV; + goto free_resources; + } + + async_tx_ack(tx); + init_completion(&cmp); + tx->callback = ioat3_dma_test_callback; + tx->callback_param = &cmp; + cookie = tx->tx_submit(tx); + if (cookie < 0) { + dev_err(dev, "Self-test xor setup failed\n"); + err = -ENODEV; + goto free_resources; + } + dma->device_issue_pending(dma_chan); + + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + + if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_err(dev, "Self-test xor timed out\n"); + err = -ENODEV; + goto free_resources; + } + + dma_sync_single_for_cpu(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE); + for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) { + u32 *ptr = page_address(dest); + if (ptr[i] != cmp_word) { + dev_err(dev, "Self-test xor failed compare\n"); + err = -ENODEV; + goto free_resources; + } + } + dma_sync_single_for_device(dev, dest_dma, PAGE_SIZE, DMA_TO_DEVICE); + + /* skip validate if the capability is not present */ + if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask)) + goto free_resources; + + /* validate the sources with the destintation page */ + for (i = 0; i < IOAT_NUM_SRC_TEST; i++) + xor_val_srcs[i] = xor_srcs[i]; + xor_val_srcs[i] = dest; + + xor_val_result = 1; + + for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) + dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, + IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, + &xor_val_result, DMA_PREP_INTERRUPT); + if (!tx) { + dev_err(dev, "Self-test zero prep failed\n"); + err = -ENODEV; + goto free_resources; + } + + async_tx_ack(tx); + init_completion(&cmp); + tx->callback = ioat3_dma_test_callback; + tx->callback_param = &cmp; + cookie = tx->tx_submit(tx); + if (cookie < 0) { + dev_err(dev, "Self-test zero setup failed\n"); + err = -ENODEV; + goto free_resources; + } + dma->device_issue_pending(dma_chan); + + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + + if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_err(dev, "Self-test validate timed out\n"); + err = -ENODEV; + goto free_resources; + } + + if (xor_val_result != 0) { + dev_err(dev, "Self-test validate failed compare\n"); + err = -ENODEV; + goto free_resources; + } + + /* skip memset if the capability is not present */ + if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask)) + goto free_resources; + + /* test memset */ + dma_addr = dma_map_page(dev, dest, 0, + PAGE_SIZE, DMA_FROM_DEVICE); + tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE, + DMA_PREP_INTERRUPT); + if (!tx) { + dev_err(dev, "Self-test memset prep failed\n"); + err = -ENODEV; + goto free_resources; + } + + async_tx_ack(tx); + init_completion(&cmp); + tx->callback = ioat3_dma_test_callback; + tx->callback_param = &cmp; + cookie = tx->tx_submit(tx); + if (cookie < 0) { + dev_err(dev, "Self-test memset setup failed\n"); + err = -ENODEV; + goto free_resources; + } + dma->device_issue_pending(dma_chan); + + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + + if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_err(dev, "Self-test memset timed out\n"); + err = -ENODEV; + goto free_resources; + } + + for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) { + u32 *ptr = page_address(dest); + if (ptr[i]) { + dev_err(dev, "Self-test memset failed compare\n"); + err = -ENODEV; + goto free_resources; + } + } + + /* test for non-zero parity sum */ + xor_val_result = 0; + for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++) + dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, + IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, + &xor_val_result, DMA_PREP_INTERRUPT); + if (!tx) { + dev_err(dev, "Self-test 2nd zero prep failed\n"); + err = -ENODEV; + goto free_resources; + } + + async_tx_ack(tx); + init_completion(&cmp); + tx->callback = ioat3_dma_test_callback; + tx->callback_param = &cmp; + cookie = tx->tx_submit(tx); + if (cookie < 0) { + dev_err(dev, "Self-test 2nd zero setup failed\n"); + err = -ENODEV; + goto free_resources; + } + dma->device_issue_pending(dma_chan); + + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + + if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_err(dev, "Self-test 2nd validate timed out\n"); + err = -ENODEV; + goto free_resources; + } + + if (xor_val_result != SUM_CHECK_P_RESULT) { + dev_err(dev, "Self-test validate failed compare\n"); + err = -ENODEV; + goto free_resources; + } + +free_resources: + dma->device_free_chan_resources(dma_chan); +out: + src_idx = IOAT_NUM_SRC_TEST; + while (src_idx--) + __free_page(xor_srcs[src_idx]); + __free_page(dest); + return err; +} + +static int __devinit ioat3_dma_self_test(struct ioatdma_device *device) +{ + int rc = ioat_dma_self_test(device); + + if (rc) + return rc; + + rc = ioat_xor_val_self_test(device); + if (rc) + return rc; + + return 0; +} + +int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca) +{ + struct pci_dev *pdev = device->pdev; + struct dma_device *dma; + struct dma_chan *c; + struct ioat_chan_common *chan; + bool is_raid_device = false; + int err; + u16 dev_id; + u32 cap; + + device->enumerate_channels = ioat2_enumerate_channels; + device->self_test = ioat3_dma_self_test; + dma = &device->common; + dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock; + dma->device_issue_pending = ioat2_issue_pending; + dma->device_alloc_chan_resources = ioat2_alloc_chan_resources; + dma->device_free_chan_resources = ioat2_free_chan_resources; + + dma_cap_set(DMA_INTERRUPT, dma->cap_mask); + dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock; + + cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET); + if (cap & IOAT_CAP_XOR) { + is_raid_device = true; + dma->max_xor = 8; + dma->xor_align = 2; + + dma_cap_set(DMA_XOR, dma->cap_mask); + dma->device_prep_dma_xor = ioat3_prep_xor; + + dma_cap_set(DMA_XOR_VAL, dma->cap_mask); + dma->device_prep_dma_xor_val = ioat3_prep_xor_val; + } + if (cap & IOAT_CAP_PQ) { + is_raid_device = true; + dma_set_maxpq(dma, 8, 0); + dma->pq_align = 2; + + dma_cap_set(DMA_PQ, dma->cap_mask); + dma->device_prep_dma_pq = ioat3_prep_pq; + + dma_cap_set(DMA_PQ_VAL, dma->cap_mask); + dma->device_prep_dma_pq_val = ioat3_prep_pq_val; + + if (!(cap & IOAT_CAP_XOR)) { + dma->max_xor = 8; + dma->xor_align = 2; + + dma_cap_set(DMA_XOR, dma->cap_mask); + dma->device_prep_dma_xor = ioat3_prep_pqxor; + + dma_cap_set(DMA_XOR_VAL, dma->cap_mask); + dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val; + } + } + if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) { + dma_cap_set(DMA_MEMSET, dma->cap_mask); + dma->device_prep_dma_memset = ioat3_prep_memset_lock; + } + + + if (is_raid_device) { + dma->device_is_tx_complete = ioat3_is_complete; + device->cleanup_tasklet = ioat3_cleanup_tasklet; + device->timer_fn = ioat3_timer_event; + } else { + dma->device_is_tx_complete = ioat2_is_complete; + device->cleanup_tasklet = ioat2_cleanup_tasklet; + device->timer_fn = ioat2_timer_event; + } + + /* -= IOAT ver.3 workarounds =- */ + /* Write CHANERRMSK_INT with 3E07h to mask out the errors + * that can cause stability issues for IOAT ver.3 + */ + pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07); + + /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit + * (workaround for spurious config parity error after restart) + */ + pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id); + if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) + pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10); + + err = ioat_probe(device); + if (err) + return err; + ioat_set_tcp_copy_break(262144); + + list_for_each_entry(c, &dma->channels, device_node) { + chan = to_chan_common(c); + writel(IOAT_DMA_DCA_ANY_CPU, + chan->reg_base + IOAT_DCACTRL_OFFSET); + } + + err = ioat_register(device); + if (err) + return err; + + ioat_kobject_add(device, &ioat2_ktype); + + if (dca) + device->dca = ioat3_dca_init(pdev, device->reg_base); + + return 0; +} diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h index 7481fb13ce00..99afb12bd409 100644 --- a/drivers/dma/ioat/hw.h +++ b/drivers/dma/ioat/hw.h @@ -37,6 +37,7 @@ #define IOAT_VER_1_2 0x12 /* Version 1.2 */ #define IOAT_VER_2_0 0x20 /* Version 2.0 */ #define IOAT_VER_3_0 0x30 /* Version 3.0 */ +#define IOAT_VER_3_2 0x32 /* Version 3.2 */ struct ioat_dma_descriptor { uint32_t size; @@ -55,6 +56,7 @@ struct ioat_dma_descriptor { unsigned int dest_dca:1; unsigned int hint:1; unsigned int rsvd2:13; + #define IOAT_OP_COPY 0x00 unsigned int op:8; } ctl_f; }; @@ -70,4 +72,144 @@ struct ioat_dma_descriptor { }; uint64_t user2; }; + +struct ioat_fill_descriptor { + uint32_t size; + union { + uint32_t ctl; + struct { + unsigned int int_en:1; + unsigned int rsvd:1; + unsigned int dest_snoop_dis:1; + unsigned int compl_write:1; + unsigned int fence:1; + unsigned int rsvd2:2; + unsigned int dest_brk:1; + unsigned int bundle:1; + unsigned int rsvd4:15; + #define IOAT_OP_FILL 0x01 + unsigned int op:8; + } ctl_f; + }; + uint64_t src_data; + uint64_t dst_addr; + uint64_t next; + uint64_t rsv1; + uint64_t next_dst_addr; + uint64_t user1; + uint64_t user2; +}; + +struct ioat_xor_descriptor { + uint32_t size; + union { + uint32_t ctl; + struct { + unsigned int int_en:1; + unsigned int src_snoop_dis:1; + unsigned int dest_snoop_dis:1; + unsigned int compl_write:1; + unsigned int fence:1; + unsigned int src_cnt:3; + unsigned int bundle:1; + unsigned int dest_dca:1; + unsigned int hint:1; + unsigned int rsvd:13; + #define IOAT_OP_XOR 0x87 + #define IOAT_OP_XOR_VAL 0x88 + unsigned int op:8; + } ctl_f; + }; + uint64_t src_addr; + uint64_t dst_addr; + uint64_t next; + uint64_t src_addr2; + uint64_t src_addr3; + uint64_t src_addr4; + uint64_t src_addr5; +}; + +struct ioat_xor_ext_descriptor { + uint64_t src_addr6; + uint64_t src_addr7; + uint64_t src_addr8; + uint64_t next; + uint64_t rsvd[4]; +}; + +struct ioat_pq_descriptor { + uint32_t size; + union { + uint32_t ctl; + struct { + unsigned int int_en:1; + unsigned int src_snoop_dis:1; + unsigned int dest_snoop_dis:1; + unsigned int compl_write:1; + unsigned int fence:1; + unsigned int src_cnt:3; + unsigned int bundle:1; + unsigned int dest_dca:1; + unsigned int hint:1; + unsigned int p_disable:1; + unsigned int q_disable:1; + unsigned int rsvd:11; + #define IOAT_OP_PQ 0x89 + #define IOAT_OP_PQ_VAL 0x8a + unsigned int op:8; + } ctl_f; + }; + uint64_t src_addr; + uint64_t p_addr; + uint64_t next; + uint64_t src_addr2; + uint64_t src_addr3; + uint8_t coef[8]; + uint64_t q_addr; +}; + +struct ioat_pq_ext_descriptor { + uint64_t src_addr4; + uint64_t src_addr5; + uint64_t src_addr6; + uint64_t next; + uint64_t src_addr7; + uint64_t src_addr8; + uint64_t rsvd[2]; +}; + +struct ioat_pq_update_descriptor { + uint32_t size; + union { + uint32_t ctl; + struct { + unsigned int int_en:1; + unsigned int src_snoop_dis:1; + unsigned int dest_snoop_dis:1; + unsigned int compl_write:1; + unsigned int fence:1; + unsigned int src_cnt:3; + unsigned int bundle:1; + unsigned int dest_dca:1; + unsigned int hint:1; + unsigned int p_disable:1; + unsigned int q_disable:1; + unsigned int rsvd:3; + unsigned int coef:8; + #define IOAT_OP_PQ_UP 0x8b + unsigned int op:8; + } ctl_f; + }; + uint64_t src_addr; + uint64_t p_addr; + uint64_t next; + uint64_t src_addr2; + uint64_t p_src; + uint64_t q_src; + uint64_t q_addr; +}; + +struct ioat_raw_descriptor { + uint64_t field[8]; +}; #endif diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c index 61086c6bbf42..c788fa266470 100644 --- a/drivers/dma/ioat/pci.c +++ b/drivers/dma/ioat/pci.c @@ -36,30 +36,44 @@ #include "hw.h" MODULE_VERSION(IOAT_DMA_VERSION); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel Corporation"); static struct pci_device_id ioat_pci_tbl[] = { /* I/OAT v1 platforms */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) }, - { PCI_DEVICE(PCI_VENDOR_ID_UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) }, + { PCI_VDEVICE(UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) }, /* I/OAT v2 platforms */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) }, /* I/OAT v3 platforms */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) }, + + /* I/OAT v3.2 platforms */ + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) }, + { 0, } }; +MODULE_DEVICE_TABLE(pci, ioat_pci_tbl); static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id); @@ -172,6 +186,9 @@ static int __init ioat_init_module(void) { int err; + pr_info("%s: Intel(R) QuickData Technology Driver %s\n", + DRV_NAME, IOAT_DMA_VERSION); + ioat2_cache = kmem_cache_create("ioat2", sizeof(struct ioat_ring_ent), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ioat2_cache) diff --git a/drivers/dma/ioat/registers.h b/drivers/dma/ioat/registers.h index e4334a195380..63038e18ab03 100644 --- a/drivers/dma/ioat/registers.h +++ b/drivers/dma/ioat/registers.h @@ -64,12 +64,27 @@ #define IOAT_DEVICE_STATUS_OFFSET 0x0E /* 16-bit */ #define IOAT_DEVICE_STATUS_DEGRADED_MODE 0x0001 +#define IOAT_DEVICE_MMIO_RESTRICTED 0x0002 +#define IOAT_DEVICE_MEMORY_BYPASS 0x0004 +#define IOAT_DEVICE_ADDRESS_REMAPPING 0x0008 + +#define IOAT_DMA_CAP_OFFSET 0x10 /* 32-bit */ +#define IOAT_CAP_PAGE_BREAK 0x00000001 +#define IOAT_CAP_CRC 0x00000002 +#define IOAT_CAP_SKIP_MARKER 0x00000004 +#define IOAT_CAP_DCA 0x00000010 +#define IOAT_CAP_CRC_MOVE 0x00000020 +#define IOAT_CAP_FILL_BLOCK 0x00000040 +#define IOAT_CAP_APIC 0x00000080 +#define IOAT_CAP_XOR 0x00000100 +#define IOAT_CAP_PQ 0x00000200 #define IOAT_CHANNEL_MMIO_SIZE 0x80 /* Each Channel MMIO space is this size */ /* DMA Channel Registers */ #define IOAT_CHANCTRL_OFFSET 0x00 /* 16-bit Channel Control Register */ #define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK 0xF000 +#define IOAT3_CHANCTRL_COMPL_DCA_EN 0x0200 #define IOAT_CHANCTRL_CHANNEL_IN_USE 0x0100 #define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL 0x0020 #define IOAT_CHANCTRL_ERR_INT_EN 0x0010 @@ -224,6 +239,11 @@ #define IOAT_CHANERR_INT_CONFIGURATION_ERR 0x2000 #define IOAT_CHANERR_SOFT_ERR 0x4000 #define IOAT_CHANERR_UNAFFILIATED_ERR 0x8000 +#define IOAT_CHANERR_XOR_P_OR_CRC_ERR 0x10000 +#define IOAT_CHANERR_XOR_Q_ERR 0x20000 +#define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR 0x40000 + +#define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR) #define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */ diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 9f6c16f8e2be..645ca8d54ec4 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -31,6 +31,7 @@ #include <linux/platform_device.h> #include <linux/memory.h> #include <linux/ioport.h> +#include <linux/raid/pq.h> #include <mach/adma.h> @@ -57,65 +58,110 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot) } } +static void +iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) +{ + struct dma_async_tx_descriptor *tx = &desc->async_tx; + struct iop_adma_desc_slot *unmap = desc->group_head; + struct device *dev = &iop_chan->device->pdev->dev; + u32 len = unmap->unmap_len; + enum dma_ctrl_flags flags = tx->flags; + u32 src_cnt; + dma_addr_t addr; + dma_addr_t dest; + + src_cnt = unmap->unmap_src_cnt; + dest = iop_desc_get_dest_addr(unmap, iop_chan); + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + enum dma_data_direction dir; + + if (src_cnt > 1) /* is xor? */ + dir = DMA_BIDIRECTIONAL; + else + dir = DMA_FROM_DEVICE; + + dma_unmap_page(dev, dest, len, dir); + } + + if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + while (src_cnt--) { + addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt); + if (addr == dest) + continue; + dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); + } + } + desc->group_head = NULL; +} + +static void +iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) +{ + struct dma_async_tx_descriptor *tx = &desc->async_tx; + struct iop_adma_desc_slot *unmap = desc->group_head; + struct device *dev = &iop_chan->device->pdev->dev; + u32 len = unmap->unmap_len; + enum dma_ctrl_flags flags = tx->flags; + u32 src_cnt = unmap->unmap_src_cnt; + dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan); + dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan); + int i; + + if (tx->flags & DMA_PREP_CONTINUE) + src_cnt -= 3; + + if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) { + dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL); + dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL); + } + + if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + dma_addr_t addr; + + for (i = 0; i < src_cnt; i++) { + addr = iop_desc_get_src_addr(unmap, iop_chan, i); + dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); + } + if (desc->pq_check_result) { + dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE); + dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE); + } + } + + desc->group_head = NULL; +} + + static dma_cookie_t iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, struct iop_adma_chan *iop_chan, dma_cookie_t cookie) { - BUG_ON(desc->async_tx.cookie < 0); - if (desc->async_tx.cookie > 0) { - cookie = desc->async_tx.cookie; - desc->async_tx.cookie = 0; + struct dma_async_tx_descriptor *tx = &desc->async_tx; + + BUG_ON(tx->cookie < 0); + if (tx->cookie > 0) { + cookie = tx->cookie; + tx->cookie = 0; /* call the callback (must not sleep or submit new * operations to this channel) */ - if (desc->async_tx.callback) - desc->async_tx.callback( - desc->async_tx.callback_param); + if (tx->callback) + tx->callback(tx->callback_param); /* unmap dma addresses * (unmap_single vs unmap_page?) */ if (desc->group_head && desc->unmap_len) { - struct iop_adma_desc_slot *unmap = desc->group_head; - struct device *dev = - &iop_chan->device->pdev->dev; - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = desc->async_tx.flags; - u32 src_cnt; - dma_addr_t addr; - dma_addr_t dest; - - src_cnt = unmap->unmap_src_cnt; - dest = iop_desc_get_dest_addr(unmap, iop_chan); - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - enum dma_data_direction dir; - - if (src_cnt > 1) /* is xor? */ - dir = DMA_BIDIRECTIONAL; - else - dir = DMA_FROM_DEVICE; - - dma_unmap_page(dev, dest, len, dir); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - while (src_cnt--) { - addr = iop_desc_get_src_addr(unmap, - iop_chan, - src_cnt); - if (addr == dest) - continue; - dma_unmap_page(dev, addr, len, - DMA_TO_DEVICE); - } - } - desc->group_head = NULL; + if (iop_desc_is_pq(desc)) + iop_desc_unmap_pq(iop_chan, desc); + else + iop_desc_unmap(iop_chan, desc); } } /* run dependent operations */ - dma_run_dependencies(&desc->async_tx); + dma_run_dependencies(tx); return cookie; } @@ -287,7 +333,12 @@ static void iop_adma_tasklet(unsigned long data) { struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data; - spin_lock(&iop_chan->lock); + /* lockdep will flag depedency submissions as potentially + * recursive locking, this is not the case as a dependency + * submission will never recurse a channels submit routine. + * There are checks in async_tx.c to prevent this. + */ + spin_lock_nested(&iop_chan->lock, SINGLE_DEPTH_NESTING); __iop_adma_slot_cleanup(iop_chan); spin_unlock(&iop_chan->lock); } @@ -661,9 +712,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest, } static struct dma_async_tx_descriptor * -iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src, - unsigned int src_cnt, size_t len, u32 *result, - unsigned long flags) +iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src, + unsigned int src_cnt, size_t len, u32 *result, + unsigned long flags) { struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); struct iop_adma_desc_slot *sw_desc, *grp_start; @@ -697,6 +748,118 @@ iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src, return sw_desc ? &sw_desc->async_tx : NULL; } +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, size_t len, + unsigned long flags) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *g; + int slot_cnt, slots_per_op; + int continue_srcs; + + if (unlikely(!len)) + return NULL; + BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT); + + dev_dbg(iop_chan->device->common.dev, + "%s src_cnt: %d len: %u flags: %lx\n", + __func__, src_cnt, len, flags); + + if (dmaf_p_disabled_continue(flags)) + continue_srcs = 1+src_cnt; + else if (dmaf_continue(flags)) + continue_srcs = 3+src_cnt; + else + continue_srcs = 0+src_cnt; + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_pq_slot_count(len, continue_srcs, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + int i; + + g = sw_desc->group_head; + iop_desc_set_byte_count(g, iop_chan, len); + + /* even if P is disabled its destination address (bits + * [3:0]) must match Q. It is ok if P points to an + * invalid address, it won't be written. + */ + if (flags & DMA_PREP_PQ_DISABLE_P) + dst[0] = dst[1] & 0x7; + + iop_desc_set_pq_addr(g, dst); + sw_desc->unmap_src_cnt = src_cnt; + sw_desc->unmap_len = len; + sw_desc->async_tx.flags = flags; + for (i = 0; i < src_cnt; i++) + iop_desc_set_pq_src_addr(g, i, src[i], scf[i]); + + /* if we are continuing a previous operation factor in + * the old p and q values, see the comment for dma_maxpq + * in include/linux/dmaengine.h + */ + if (dmaf_p_disabled_continue(flags)) + iop_desc_set_pq_src_addr(g, i++, dst[1], 1); + else if (dmaf_continue(flags)) { + iop_desc_set_pq_src_addr(g, i++, dst[0], 0); + iop_desc_set_pq_src_addr(g, i++, dst[1], 1); + iop_desc_set_pq_src_addr(g, i++, dst[1], 0); + } + iop_desc_init_pq(g, i, flags); + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? &sw_desc->async_tx : NULL; +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, + size_t len, enum sum_check_flags *pqres, + unsigned long flags) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *g; + int slot_cnt, slots_per_op; + + if (unlikely(!len)) + return NULL; + BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT); + + dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n", + __func__, src_cnt, len); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_pq_zero_sum_slot_count(len, src_cnt + 2, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + /* for validate operations p and q are tagged onto the + * end of the source list + */ + int pq_idx = src_cnt; + + g = sw_desc->group_head; + iop_desc_init_pq_zero_sum(g, src_cnt+2, flags); + iop_desc_set_pq_zero_sum_byte_count(g, len); + g->pq_check_result = pqres; + pr_debug("\t%s: g->pq_check_result: %p\n", + __func__, g->pq_check_result); + sw_desc->unmap_src_cnt = src_cnt+2; + sw_desc->unmap_len = len; + sw_desc->async_tx.flags = flags; + while (src_cnt--) + iop_desc_set_pq_zero_sum_src_addr(g, src_cnt, + src[src_cnt], + scf[src_cnt]); + iop_desc_set_pq_zero_sum_addr(g, pq_idx, src); + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? &sw_desc->async_tx : NULL; +} + static void iop_adma_free_chan_resources(struct dma_chan *chan) { struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); @@ -907,7 +1070,7 @@ out: #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ static int __devinit -iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) +iop_adma_xor_val_self_test(struct iop_adma_device *device) { int i, src_idx; struct page *dest; @@ -1003,7 +1166,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) PAGE_SIZE, DMA_TO_DEVICE); /* skip zero sum if the capability is not present */ - if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask)) + if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask)) goto free_resources; /* zero sum the sources with the destintation page */ @@ -1017,10 +1180,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) dma_srcs[i] = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], 0, PAGE_SIZE, DMA_TO_DEVICE); - tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, - IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, - &zero_sum_result, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, + IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, + &zero_sum_result, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); cookie = iop_adma_tx_submit(tx); iop_adma_issue_pending(dma_chan); @@ -1073,10 +1236,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) dma_srcs[i] = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], 0, PAGE_SIZE, DMA_TO_DEVICE); - tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs, - IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, - &zero_sum_result, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs, + IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE, + &zero_sum_result, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); cookie = iop_adma_tx_submit(tx); iop_adma_issue_pending(dma_chan); @@ -1106,6 +1269,170 @@ out: return err; } +#ifdef CONFIG_MD_RAID6_PQ +static int __devinit +iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device) +{ + /* combined sources, software pq results, and extra hw pq results */ + struct page *pq[IOP_ADMA_NUM_SRC_TEST+2+2]; + /* ptr to the extra hw pq buffers defined above */ + struct page **pq_hw = &pq[IOP_ADMA_NUM_SRC_TEST+2]; + /* address conversion buffers (dma_map / page_address) */ + void *pq_sw[IOP_ADMA_NUM_SRC_TEST+2]; + dma_addr_t pq_src[IOP_ADMA_NUM_SRC_TEST]; + dma_addr_t pq_dest[2]; + + int i; + struct dma_async_tx_descriptor *tx; + struct dma_chan *dma_chan; + dma_cookie_t cookie; + u32 zero_sum_result; + int err = 0; + struct device *dev; + + dev_dbg(device->common.dev, "%s\n", __func__); + + for (i = 0; i < ARRAY_SIZE(pq); i++) { + pq[i] = alloc_page(GFP_KERNEL); + if (!pq[i]) { + while (i--) + __free_page(pq[i]); + return -ENOMEM; + } + } + + /* Fill in src buffers */ + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) { + pq_sw[i] = page_address(pq[i]); + memset(pq_sw[i], 0x11111111 * (1<<i), PAGE_SIZE); + } + pq_sw[i] = page_address(pq[i]); + pq_sw[i+1] = page_address(pq[i+1]); + + dma_chan = container_of(device->common.channels.next, + struct dma_chan, + device_node); + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { + err = -ENODEV; + goto out; + } + + dev = dma_chan->device->dev; + + /* initialize the dests */ + memset(page_address(pq_hw[0]), 0 , PAGE_SIZE); + memset(page_address(pq_hw[1]), 0 , PAGE_SIZE); + + /* test pq */ + pq_dest[0] = dma_map_page(dev, pq_hw[0], 0, PAGE_SIZE, DMA_FROM_DEVICE); + pq_dest[1] = dma_map_page(dev, pq_hw[1], 0, PAGE_SIZE, DMA_FROM_DEVICE); + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) + pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + + tx = iop_adma_prep_dma_pq(dma_chan, pq_dest, pq_src, + IOP_ADMA_NUM_SRC_TEST, (u8 *)raid6_gfexp, + PAGE_SIZE, + DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != + DMA_SUCCESS) { + dev_err(dev, "Self-test pq timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + raid6_call.gen_syndrome(IOP_ADMA_NUM_SRC_TEST+2, PAGE_SIZE, pq_sw); + + if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST], + page_address(pq_hw[0]), PAGE_SIZE) != 0) { + dev_err(dev, "Self-test p failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST+1], + page_address(pq_hw[1]), PAGE_SIZE) != 0) { + dev_err(dev, "Self-test q failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + /* test correct zero sum using the software generated pq values */ + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++) + pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + + zero_sum_result = ~0; + tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST], + pq_src, IOP_ADMA_NUM_SRC_TEST, + raid6_gfexp, PAGE_SIZE, &zero_sum_result, + DMA_PREP_INTERRUPT|DMA_CTRL_ACK); + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != + DMA_SUCCESS) { + dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + if (zero_sum_result != 0) { + dev_err(dev, "Self-test pq-zero-sum failed to validate: %x\n", + zero_sum_result); + err = -ENODEV; + goto free_resources; + } + + /* test incorrect zero sum */ + i = IOP_ADMA_NUM_SRC_TEST; + memset(pq_sw[i] + 100, 0, 100); + memset(pq_sw[i+1] + 200, 0, 200); + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++) + pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE, + DMA_TO_DEVICE); + + zero_sum_result = 0; + tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST], + pq_src, IOP_ADMA_NUM_SRC_TEST, + raid6_gfexp, PAGE_SIZE, &zero_sum_result, + DMA_PREP_INTERRUPT|DMA_CTRL_ACK); + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != + DMA_SUCCESS) { + dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + if (zero_sum_result != (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT)) { + dev_err(dev, "Self-test !pq-zero-sum failed to validate: %x\n", + zero_sum_result); + err = -ENODEV; + goto free_resources; + } + +free_resources: + iop_adma_free_chan_resources(dma_chan); +out: + i = ARRAY_SIZE(pq); + while (i--) + __free_page(pq[i]); + return err; +} +#endif + static int __devexit iop_adma_remove(struct platform_device *dev) { struct iop_adma_device *device = platform_get_drvdata(dev); @@ -1193,9 +1520,16 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) dma_dev->max_xor = iop_adma_get_max_xor(); dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; } - if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask)) - dma_dev->device_prep_dma_zero_sum = - iop_adma_prep_dma_zero_sum; + if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask)) + dma_dev->device_prep_dma_xor_val = + iop_adma_prep_dma_xor_val; + if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { + dma_set_maxpq(dma_dev, iop_adma_get_max_pq(), 0); + dma_dev->device_prep_dma_pq = iop_adma_prep_dma_pq; + } + if (dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) + dma_dev->device_prep_dma_pq_val = + iop_adma_prep_dma_pq_val; if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) dma_dev->device_prep_dma_interrupt = iop_adma_prep_dma_interrupt; @@ -1249,23 +1583,35 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) } if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || - dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { - ret = iop_adma_xor_zero_sum_self_test(adev); + dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { + ret = iop_adma_xor_val_self_test(adev); dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); if (ret) goto err_free_iop_chan; } + if (dma_has_cap(DMA_PQ, dma_dev->cap_mask) && + dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) { + #ifdef CONFIG_MD_RAID6_PQ + ret = iop_adma_pq_zero_sum_self_test(adev); + dev_dbg(&pdev->dev, "pq self test returned %d\n", ret); + #else + /* can not test raid6, so do not publish capability */ + dma_cap_clear(DMA_PQ, dma_dev->cap_mask); + dma_cap_clear(DMA_PQ_VAL, dma_dev->cap_mask); + ret = 0; + #endif + if (ret) + goto err_free_iop_chan; + } + dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " - "( %s%s%s%s%s%s%s%s%s%s)\n", - dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", - dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", - dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "", + "( %s%s%s%s%s%s%s)\n", + dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", + dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", - dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", - dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "", + dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "", dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", - dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "", dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : ""); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 020f9573fd82..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -124,6 +124,8 @@ config MD_RAID456 select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR + select ASYNC_PQ + select ASYNC_RAID6_RECOV ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure @@ -152,9 +154,33 @@ config MD_RAID456 If unsure, say Y. +config MULTICORE_RAID456 + bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" + depends on MD_RAID456 + depends on SMP + depends on EXPERIMENTAL + ---help--- + Enable the raid456 module to dispatch per-stripe raid operations to a + thread pool. + + If unsure, say N. + config MD_RAID6_PQ tristate +config ASYNC_RAID6_TEST + tristate "Self test for hardware accelerated raid6 recovery" + depends on MD_RAID6_PQ + select ASYNC_RAID6_RECOV + ---help--- + This is a one-shot self test that permutes through the + recovery of all the possible two disk failure scenarios for a + N-disk array. Recovery is performed with the asynchronous + raid6 recovery routines, and will optionally use an offload + engine if one is available. + + If unsure, say N. + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f9f991e6e138..cac6f4d3a143 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -47,7 +47,9 @@ #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> +#include <linux/async.h> #include <linux/seq_file.h> +#include <linux/cpu.h> #include "md.h" #include "raid5.h" #include "bitmap.h" @@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, struct page *bio_page; int i; int page_offset; + struct async_submit_ctl submit; + enum async_tx_flags flags = 0; if (bio->bi_sector >= sector) page_offset = (signed)(bio->bi_sector - sector) * 512; else page_offset = (signed)(sector - bio->bi_sector) * -512; + + if (frombio) + flags |= ASYNC_TX_FENCE; + init_async_submit(&submit, flags, tx, NULL, NULL, NULL); + bio_for_each_segment(bvl, bio, i) { int len = bio_iovec_idx(bio, i)->bv_len; int clen; @@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, bio_page = bio_iovec_idx(bio, i)->bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); + b_offset, clen, &submit); else tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); + page_offset, clen, &submit); } + /* chain the operations */ + submit.depend_tx = tx; + if (clen < len) /* hit end of page */ break; page_offset += len; @@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; raid5_conf_t *conf = sh->raid_conf; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh) } atomic_inc(&sh->count); - async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, - ops_complete_biofill, sh); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); + async_trigger_callback(&submit); } -static void ops_complete_compute5(void *stripe_head_ref) +static void mark_target_uptodate(struct stripe_head *sh, int target) { - struct stripe_head *sh = stripe_head_ref; - int target = sh->ops.target; - struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt; - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); + if (target < 0) + return; + tgt = &sh->dev[target]; set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); +} + +static void ops_complete_compute(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* mark the computed target(s) as uptodate */ + mark_target_uptodate(sh, sh->ops.target); + mark_target_uptodate(sh, sh->ops.target2); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; @@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref) release_stripe(sh); } -static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) +/* return a pointer to the address conversion region of the scribble buffer */ +static addr_conv_t *to_addr_conv(struct stripe_head *sh, + struct raid5_percpu *percpu) +{ + return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); +} + +static struct dma_async_tx_descriptor * +ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + struct page **xor_srcs = percpu->scribble; int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; int count = 0; struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu block: %d\n", @@ -660,17 +689,212 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute, sh, to_addr_conv(sh, percpu)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - 0, NULL, ops_complete_compute5, sh); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute5, sh); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } +/* set_syndrome_sources - populate source buffers for gen_syndrome + * @srcs - (struct page *) array of size sh->disks + * @sh - stripe_head to parse + * + * Populates srcs in proper layout order for the stripe and returns the + * 'count' of sources to be used in a call to async_gen_syndrome. The P + * destination buffer is recorded in srcs[count] and the Q destination + * is recorded in srcs[count+1]]. + */ +static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +{ + int disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); + int d0_idx = raid6_d0(sh); + int count; + int i; + + for (i = 0; i < disks; i++) + srcs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + srcs[slot] = sh->dev[i].page; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + return count; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **blocks = percpu->scribble; + int target; + int qd_idx = sh->qd_idx; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + struct r5dev *tgt; + struct page *dest; + int i; + int count; + + if (sh->ops.target < 0) + target = sh->ops.target2; + else if (sh->ops.target2 < 0) + target = sh->ops.target; + else + /* we should only have one valid target */ + BUG(); + BUG_ON(target < 0); + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + + tgt = &sh->dev[target]; + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + dest = tgt->page; + + atomic_inc(&sh->count); + + if (target == qd_idx) { + count = set_syndrome_sources(blocks, sh); + blocks[count] = NULL; /* regenerating p is not necessary */ + BUG_ON(blocks[count+1] != dest); /* q should already be set */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + } else { + /* Compute any data- or p-drive using XOR */ + count = 0; + for (i = disks; i-- ; ) { + if (i == target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + } + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + int target = sh->ops.target; + int target2 = sh->ops.target2; + struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt2 = &sh->dev[target2]; + struct dma_async_tx_descriptor *tx; + struct page **blocks = percpu->scribble; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu block1: %d block2: %d\n", + __func__, (unsigned long long)sh->sector, target, target2); + BUG_ON(target < 0 || target2 < 0); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); + + /* we need to open-code set_syndrome_sources to handle to the + * slot number conversion for 'faila' and 'failb' + */ + for (i = 0; i < disks ; i++) + blocks[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + blocks[slot] = sh->dev[i].page; + + if (i == target) + faila = slot; + if (i == target2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + pr_debug("%s: stripe: %llu faila: %d failb: %d\n", + __func__, (unsigned long long)sh->sector, faila, failb); + + atomic_inc(&sh->count); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } else { + struct page *dest; + int data_target; + int qd_idx = sh->qd_idx; + + /* Missing D+Q: recompute D from P, then recompute Q */ + if (target == qd_idx) + data_target = target2; + else + data_target = target; + + count = 0; + for (i = disks; i-- ; ) { + if (i == data_target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + dest = sh->dev[data_target].page; + init_async_submit(&submit, + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + &submit); + + count = set_syndrome_sources(blocks, sh); + init_async_submit(&submit, ASYNC_TX_FENCE, tx, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } + } + + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, + sh, to_addr_conv(sh, percpu)); + if (failb == syndrome_disks) { + /* We're missing D+P. */ + return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, + faila, blocks, &submit); + } else { + /* We're missing D+D. */ + return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, + faila, failb, blocks, &submit); + } +} + + static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -680,12 +904,13 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + struct page **xor_srcs = percpu->scribble; int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; @@ -700,9 +925,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) xor_srcs[count++] = dev->page; } - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh); + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu)); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } @@ -742,17 +967,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) return tx; } -static void ops_complete_postxor(void *stripe_head_ref) +static void ops_complete_reconstruct(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int disks = sh->disks, i, pd_idx = sh->pd_idx; + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == pd_idx) + + if (dev->written || i == pd_idx || i == qd_idx) set_bit(R5_UPTODATE, &dev->flags); } @@ -770,12 +999,12 @@ static void ops_complete_postxor(void *stripe_head_ref) } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; - + struct page **xor_srcs = percpu->scribble; + struct async_submit_ctl submit; int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; @@ -809,18 +1038,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | + flags = ASYNC_TX_ACK | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); atomic_inc(&sh->count); - if (unlikely(count == 1)) { - flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); - } else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, + to_addr_conv(sh, percpu)); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); +} + +static void +ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct async_submit_ctl submit; + struct page **blocks = percpu->scribble; + int count; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh); + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + sh, to_addr_conv(sh, percpu)); + async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); } static void ops_complete_check(void *stripe_head_ref) @@ -835,63 +1082,115 @@ static void ops_complete_check(void *stripe_head_ref) release_stripe(sh); } -static void ops_run_check(struct stripe_head *sh) +static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct page *xor_dest; + struct page **xor_srcs = percpu->scribble; struct dma_async_tx_descriptor *tx; - - int count = 0, pd_idx = sh->pd_idx, i; - struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + struct async_submit_ctl submit; + int count; + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + count = 0; + xor_dest = sh->dev[pd_idx].page; + xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) - xor_srcs[count++] = dev->page; + if (i == pd_idx || i == qd_idx) + continue; + xor_srcs[count++] = sh->dev[i].page; } - tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); + init_async_submit(&submit, 0, NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + &sh->ops.zero_sum_result, &submit); + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); + tx = async_trigger_callback(&submit); +} + +static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) +{ + struct page **srcs = percpu->scribble; + struct async_submit_ctl submit; + int count; + + pr_debug("%s: stripe %llu checkp: %d\n", __func__, + (unsigned long long)sh->sector, checkp); + + count = set_syndrome_sources(srcs, sh); + if (!checkp) + srcs[count] = NULL; atomic_inc(&sh->count); - tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, - ops_complete_check, sh); + init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, + sh, to_addr_conv(sh, percpu)); + async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + &sh->ops.zero_sum_result, percpu->spare_page, &submit); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; + raid5_conf_t *conf = sh->raid_conf; + int level = conf->level; + struct raid5_percpu *percpu; + unsigned long cpu; + cpu = get_cpu(); + percpu = per_cpu_ptr(conf->percpu, cpu); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { - tx = ops_run_compute5(sh); - /* terminate the chain if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + if (level < 6) + tx = ops_run_compute5(sh, percpu); + else { + if (sh->ops.target2 < 0 || sh->ops.target < 0) + tx = ops_run_compute6_1(sh, percpu); + else + tx = ops_run_compute6_2(sh, percpu); + } + /* terminate the chain if reconstruct is not set to be run */ + if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) async_tx_ack(tx); } if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, tx); + tx = ops_run_prexor(sh, percpu, tx); if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) - ops_run_postxor(sh, tx); + if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (level < 6) + ops_run_reconstruct5(sh, percpu, tx); + else + ops_run_reconstruct6(sh, percpu, tx); + } - if (test_bit(STRIPE_OP_CHECK, &ops_request)) - ops_run_check(sh); + if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (sh->check_state == check_state_run) + ops_run_check_p(sh, percpu); + else if (sh->check_state == check_state_run_q) + ops_run_check_pq(sh, percpu, 0); + else if (sh->check_state == check_state_run_pq) + ops_run_check_pq(sh, percpu, 1); + else + BUG(); + } if (overlap_clear) for (i = disks; i--; ) { @@ -899,6 +1198,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } + put_cpu(); } static int grow_one_stripe(raid5_conf_t *conf) @@ -948,6 +1248,28 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 0; } +/** + * scribble_len - return the required size of the scribble region + * @num - total number of disks in the array + * + * The size must be enough to contain: + * 1/ a struct page pointer for each device in the array +2 + * 2/ room to convert each entry in (1) to its corresponding dma + * (dma_map_page()) or page (page_address()) address. + * + * Note: the +2 is for the destination buffers of the ddf/raid6 case where we + * calculate over all devices (not just the data blocks), using zeros in place + * of the P and Q blocks. + */ +static size_t scribble_len(int num) +{ + size_t len; + + len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); + + return len; +} + static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -976,6 +1298,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; + unsigned long cpu; int err; struct kmem_cache *sc; int i; @@ -1041,7 +1364,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) /* Step 3. * At this point, we are holding all the stripes so the array * is completely stalled, so now is a good time to resize - * conf->disks. + * conf->disks and the scribble region */ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); if (ndisks) { @@ -1052,10 +1375,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) } else err = -ENOMEM; + get_online_cpus(); + conf->scribble_len = scribble_len(newsize); + for_each_present_cpu(cpu) { + struct raid5_percpu *percpu; + void *scribble; + + percpu = per_cpu_ptr(conf->percpu, cpu); + scribble = kmalloc(conf->scribble_len, GFP_NOIO); + + if (scribble) { + kfree(percpu->scribble); + percpu->scribble = scribble; + } else { + err = -ENOMEM; + break; + } + } + put_online_cpus(); + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); + for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); @@ -1594,258 +1937,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) } - -/* - * Copy data between a page in the stripe cache, and one or more bion - * The page could align with the middle of the bio, or there could be - * several bion, each with several bio_vecs, which cover part of the page - * Multiple bion are linked together on bi_next. There may be extras - * at the end of this list. We ignore them. - */ -static void copy_data(int frombio, struct bio *bio, - struct page *page, - sector_t sector) -{ - char *pa = page_address(page); - struct bio_vec *bvl; - int i; - int page_offset; - - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; - else - page_offset = (signed)(sector - bio->bi_sector) * -512; - bio_for_each_segment(bvl, bio, i) { - int len = bio_iovec_idx(bio,i)->bv_len; - int clen; - int b_offset = 0; - - if (page_offset < 0) { - b_offset = -page_offset; - page_offset += b_offset; - len -= b_offset; - } - - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; - else clen = len; - - if (clen > 0) { - char *ba = __bio_kmap_atomic(bio, i, KM_USER0); - if (frombio) - memcpy(pa+page_offset, ba+b_offset, clen); - else - memcpy(ba+b_offset, pa+page_offset, clen); - __bio_kunmap_atomic(ba, KM_USER0); - } - if (clen < len) /* hit end of page */ - break; - page_offset += len; - } -} - -#define check_xor() do { \ - if (count == MAX_XOR_BLOCKS) { \ - xor_blocks(count, STRIPE_SIZE, dest, ptr);\ - count = 0; \ - } \ - } while(0) - -static void compute_parity6(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; - int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); - struct bio *chosen; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[syndrome_disks+2]; - - pd_idx = sh->pd_idx; - qd_idx = sh->qd_idx; - d0_idx = raid6_d0(sh); - - pr_debug("compute_parity, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - switch(method) { - case READ_MODIFY_WRITE: - BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ - case RECONSTRUCT_WRITE: - for (i= disks; i-- ;) - if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - BUG(); /* Not implemented yet */ - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ - - for (i = 0; i < disks; i++) - ptrs[i] = (void *)raid6_empty_zero_page; - - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - ptrs[slot] = page_address(sh->dev[i].page); - if (slot < syndrome_disks && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - printk(KERN_ERR "block %d/%d not uptodate " - "on parity calc\n", i, count); - BUG(); - } - - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); - - switch(method) { - case RECONSTRUCT_WRITE: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); - break; - case UPDATE_PARITY: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - break; - } -} - - -/* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int qd_idx = sh->qd_idx; - - pr_debug("compute_block_1, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - if ( dd_idx == qd_idx ) { - /* We're actually computing the Q drive */ - compute_parity6(sh, UPDATE_PARITY); - } else { - dest = page_address(sh->dev[dd_idx].page); - if (!nozero) memset(dest, 0, STRIPE_SIZE); - count = 0; - for (i = disks ; i--; ) { - if (i == dd_idx || i == qd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk("compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count) - xor_blocks(count, STRIPE_SIZE, dest, ptr); - if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - } -} - -/* Compute two missing blocks */ -static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) -{ - int i, count, disks = sh->disks; - int syndrome_disks = sh->ddf_layout ? disks : disks-2; - int d0_idx = raid6_d0(sh); - int faila = -1, failb = -1; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[syndrome_disks+2]; - - for (i = 0; i < disks ; i++) - ptrs[i] = (void *)raid6_empty_zero_page; - count = 0; - i = d0_idx; - do { - int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); - - ptrs[slot] = page_address(sh->dev[i].page); - - if (i == dd_idx1) - faila = slot; - if (i == dd_idx2) - failb = slot; - i = raid6_next_disk(i, disks); - } while (i != d0_idx); - BUG_ON(count != syndrome_disks); - - BUG_ON(faila == failb); - if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } - - pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, - faila, failb); - - if (failb == syndrome_disks+1) { - /* Q disk is one of the missing disks */ - if (faila == syndrome_disks) { - /* Missing P+Q, just recompute */ - compute_parity6(sh, UPDATE_PARITY); - return; - } else { - /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? - dd_idx2 : dd_idx1), - 0); - compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ - return; - } - } - - /* We're missing D+P or D+D; */ - if (failb == syndrome_disks) { - /* We're missing D+P. */ - raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, - ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); -} - static void -schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; + raid5_conf_t *conf = sh->raid_conf; + int level = conf->level; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1858,7 +1956,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, } else sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1871,17 +1969,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } - if (s->locked + 1 == disks) + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&sh->raid_conf->pending_full_writes); + atomic_inc(&conf->pending_full_writes); } else { + BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); sh->reconstruct_state = reconstruct_state_prexor_drain_run; set_bit(STRIPE_OP_PREXOR, &s->ops_request); set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1899,13 +1998,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, } } - /* keep the parity disk locked while asynchronous operations + /* keep the parity disk(s) locked while asynchronous operations * are in flight */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); s->locked++; + if (level == 6) { + int qd_idx = sh->qd_idx; + struct r5dev *dev = &sh->dev[qd_idx]; + + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -1986,13 +2094,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void end_reshape(raid5_conf_t *conf); -static int page_is_zero(struct page *p) -{ - char *a = page_address(p); - return ((*(u32*)a) == 0 && - memcmp(a, a+4, STRIPE_SIZE-4)==0); -} - static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, struct stripe_head *sh) { @@ -2132,9 +2233,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; + sh->ops.target2 = -1; s->req_compute = 1; /* Careful: from this point on 'uptodate' is in the eye - * of raid5_run_ops which services 'compute' operations + * of raid_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will * be R5_UPTODATE by the time it is needed for a * subsequent operation. @@ -2173,61 +2275,104 @@ static void handle_stripe_fill5(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_stripe_fill6(struct stripe_head *sh, - struct stripe_head_state *s, struct r6_state *r6s, - int disks) +/* fetch_block6 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill6 to continue + */ +static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, + struct r6_state *r6s, int disk_idx, int disks) { - int i; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || (dev->towrite && - !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed >= 1 && - (sh->dev[r6s->failed_num[0]].toread || - s->to_write)) || - (s->failed >= 2 && - (sh->dev[r6s->failed_num[1]].toread || - s->to_write)))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], + &sh->dev[r6s->failed_num[1]] }; + + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed >= 1 && + (fdev[0]->toread || s->to_write)) || + (s->failed >= 2 && + (fdev[1]->toread || s->to_write)))) { + /* we would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync + */ + BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); + BUG_ON(test_bit(R5_Wantread, &dev->flags)); + if ((s->uptodate == disks - 1) && + (s->failed && (disk_idx == r6s->failed_num[0] || + disk_idx == r6s->failed_num[1]))) { + /* have disk failed, and we're requested to fetch it; + * do compute it */ - if ((s->uptodate == disks - 1) && - (s->failed && (i == r6s->failed_num[0] || - i == r6s->failed_num[1]))) { - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - s->uptodate++; - } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == i) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - i, other); - compute_block_2(sh, i, other); - s->uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no 2nd target */ + s->req_compute = 1; + s->uptodate++; + return 1; + } else if (s->uptodate == disks-2 && s->failed >= 2) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->uptodate += 2; + s->req_compute = 1; + return 1; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + disk_idx, s->syncing); } } + + return 0; +} + +/** + * handle_stripe_fill6 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill6(struct stripe_head *sh, + struct stripe_head_state *s, struct r6_state *r6s, + int disks) +{ + int i; + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) + for (i = disks; i--; ) + if (fetch_block6(sh, s, r6s, i, disks)) + break; set_bit(STRIPE_HANDLE, &sh->state); } @@ -2361,114 +2506,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, */ /* since handle_stripe can be called at any time we need to handle the * case where a compute block operation has been submitted and then a - * subsequent call wants to start a write request. raid5_run_ops only - * handles the case where compute block and postxor are requested + * subsequent call wants to start a write request. raid_run_ops only + * handles the case where compute block and reconstruct are requested * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - schedule_reconstruction5(sh, s, rcw == 0, 0); + schedule_reconstruction(sh, s, rcw == 0, 0); } static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { - int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; + int rcw = 0, pd_idx = sh->pd_idx, i; int qd_idx = sh->qd_idx; + + set_bit(STRIPE_HANDLE, &sh->state); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else { - pr_debug("raid6: must_compute: " - "disk %d flags=%#lx\n", i, dev->flags); - must_compute++; + /* check if we haven't enough data */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != pd_idx && i != qd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (!test_bit(R5_Insync, &dev->flags)) + continue; /* it's a failed drive */ + + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + pr_debug("Request delayed stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); } } } - pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", - (unsigned long long)sh->sector, rcw, must_compute); - set_bit(STRIPE_HANDLE, &sh->state); - - if (rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - pr_debug("Request delayed stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } /* now if nothing is locked, and if we have enough data, we can start a * write request */ - if (s->locked == 0 && rcw == 0 && + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + s->locked == 0 && rcw == 0 && !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - if (must_compute > 0) { - /* We have failed blocks and need to compute them */ - switch (s->failed) { - case 0: - BUG(); - case 1: - compute_block_1(sh, r6s->failed_num[0], 0); - break; - case 2: - compute_block_2(sh, r6s->failed_num[0], - r6s->failed_num[1]); - break; - default: /* This request should have been failed? */ - BUG(); - } - } - - pr_debug("Computing parity for stripe %llu\n", - (unsigned long long)sh->sector); - compute_parity6(sh, RECONSTRUCT_WRITE); - /* now every locked buffer is ready to be written */ - for (i = disks; i--; ) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - pr_debug("Writing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - s->locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - if (s->locked == disks) - if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&conf->pending_full_writes); - /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ - set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + schedule_reconstruction(sh, s, 1, 0); } } @@ -2527,7 +2619,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, * we are done. Otherwise update the mismatch count and repair * parity if !MD_RECOVERY_CHECK */ - if (sh->ops.zero_sum_result == 0) + if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) /* parity is correct (on disc, * not in buffer any more) */ @@ -2544,6 +2636,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); sh->ops.target = sh->pd_idx; + sh->ops.target2 = -1; s->uptodate++; } } @@ -2560,67 +2653,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, - struct stripe_head_state *s, - struct r6_state *r6s, struct page *tmp_page, - int disks) + struct stripe_head_state *s, + struct r6_state *r6s, int disks) { - int update_p = 0, update_q = 0; - struct r5dev *dev; int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; + struct r5dev *dev; set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); - BUG_ON(s->uptodate < disks); + /* Want to check and possibly repair P and Q. * However there could be one 'failed' device, in which * case we can only check one of them, possibly using the * other to generate missing data */ - /* If !tmp_page, we cannot do the calculations, - * but as we have set STRIPE_HANDLE, we will soon be called - * by stripe_handle with a tmp_page - just wait until then. - */ - if (tmp_page) { + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are < 2 failures */ if (s->failed == r6s->q_failed) { - /* The only possible failed device holds 'Q', so it + /* The only possible failed device holds Q, so it * makes sense to check P (If anything else were failed, * we would have used P to recreate it). */ - compute_block_1(sh, pd_idx, 1); - if (!page_is_zero(sh->dev[pd_idx].page)) { - compute_block_1(sh, pd_idx, 0); - update_p = 1; - } + sh->check_state = check_state_run; } if (!r6s->q_failed && s->failed < 2) { - /* q is not failed, and we didn't use it to generate + /* Q is not failed, and we didn't use it to generate * anything, so it makes sense to check it */ - memcpy(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE); - compute_parity6(sh, UPDATE_PARITY); - if (memcmp(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE) != 0) { - clear_bit(STRIPE_INSYNC, &sh->state); - update_q = 1; - } + if (sh->check_state == check_state_run) + sh->check_state = check_state_run_pq; + else + sh->check_state = check_state_run_q; } - if (update_p || update_q) { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - update_p = update_q = 0; + + /* discard potentially stale zero_sum_result */ + sh->ops.zero_sum_result = 0; + + if (sh->check_state == check_state_run) { + /* async_xor_zero_sum destroys the contents of P */ + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->uptodate--; + } + if (sh->check_state >= check_state_run && + sh->check_state <= check_state_run_pq) { + /* async_syndrome_zero_sum preserves P and Q, so + * no need to mark them !uptodate here + */ + set_bit(STRIPE_OP_CHECK, &s->ops_request); + break; } + /* we have 2-disk failure */ + BUG_ON(s->failed != 2); + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; + /* now write out any block on a failed drive, - * or P or Q if they need it + * or P or Q if they were recomputed */ - + BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ if (s->failed == 2) { dev = &sh->dev[r6s->failed_num[1]]; s->locked++; @@ -2633,14 +2733,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - - if (update_p) { + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { dev = &sh->dev[pd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - if (update_q) { + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { dev = &sh->dev[qd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); @@ -2649,6 +2748,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + case check_state_run_q: + case check_state_run_pq: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) { + /* both parities are correct */ + if (!s->failed) + set_bit(STRIPE_INSYNC, &sh->state); + else { + /* in contrast to the raid5 case we can validate + * parity, but still have a failure to write + * back + */ + sh->check_state = check_state_compute_result; + /* Returning at this point means that we may go + * off and bring p and/or q uptodate again so + * we make sure to check zero_sum_result again + * to verify if p or q need writeback + */ + } + } else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + int *target = &sh->ops.target; + + sh->ops.target = -1; + sh->ops.target2 = -1; + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[pd_idx].flags); + *target = pd_idx; + target = &sh->ops.target2; + s->uptodate++; + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[qd_idx].flags); + *target = qd_idx; + s->uptodate++; + } + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2666,6 +2829,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, if (i != sh->pd_idx && i != sh->qd_idx) { int dd_idx, j; struct stripe_head *sh2; + struct async_submit_ctl submit; sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, @@ -2685,9 +2849,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } /* place all the copies on one channel */ + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, - ASYNC_TX_DEP_ACK, tx, NULL, NULL); + sh->dev[i].page, 0, 0, STRIPE_SIZE, + &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); @@ -2973,7 +3138,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - schedule_reconstruction5(sh, &s, 1, 1); + schedule_reconstruction(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2993,7 +3158,7 @@ static bool handle_stripe5(struct stripe_head *sh) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); if (s.ops_request) - raid5_run_ops(sh, s.ops_request); + raid_run_ops(sh, s.ops_request); ops_run_io(sh, &s); @@ -3002,7 +3167,7 @@ static bool handle_stripe5(struct stripe_head *sh) return blocked_rdev == NULL; } -static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) +static bool handle_stripe6(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; @@ -3014,9 +3179,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) mdk_rdev_t *blocked_rdev = NULL; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " - "pd_idx=%d, qd_idx=%d\n", + "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, qd_idx); + atomic_read(&sh->count), pd_idx, qd_idx, + sh->check_state, sh->reconstruct_state); memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); @@ -3036,35 +3202,24 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - pr_debug("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (!raid5_dec_bi_phys_segments(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } - } + /* maybe we can reply to a read + * + * new wantfill requests are only permitted while + * ops_complete_biofill is guaranteed to be inactive + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) + set_bit(R5_Wantfill, &dev->flags); /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) + BUG_ON(++s.compute > 2); - - if (dev->toread) + if (test_bit(R5_Wantfill, &dev->flags)) { + s.to_fill++; + } else if (dev->toread) s.to_read++; if (dev->towrite) { s.to_write++; @@ -3105,6 +3260,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) blocked_rdev = NULL; } + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } + pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -3145,19 +3305,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate < disks)) || s.expanding) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) handle_stripe_fill6(sh, &s, &r6s, disks); - /* now to consider writing and what else, if anything should be read */ - if (s.to_write) + /* Now we check to see if any write operations have recently + * completed + */ + if (sh->reconstruct_state == reconstruct_state_drain_result) { + int qd_idx = sh->qd_idx; + + sh->reconstruct_state = reconstruct_state_idle; + /* All the 'written' buffers and the parity blocks are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || i == qd_idx || + dev->written)) { + pr_debug("Writing block %d\n", i); + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags) || + ((i == sh->pd_idx || i == qd_idx) && + s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !sh->reconstruct_state && !sh->check_state) handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough - * data is available + * data is available. The parity check is held off while parity + * dependent operations are in flight. */ - if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) - handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) + handle_parity_checks6(conf, sh, &s, &r6s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); @@ -3178,15 +3381,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } } - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { struct stripe_head *sh2 = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { @@ -3207,14 +3424,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; stripe_set_idx(sh->sector, conf, 0, sh); - compute_parity6(sh, RECONSTRUCT_WRITE); - for (i = conf->raid_disks ; i-- ; ) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (s.expanded) { + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -3232,6 +3443,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (s.ops_request) + raid_run_ops(sh, s.ops_request); + ops_run_io(sh, &s); return_io(return_bi); @@ -3240,16 +3454,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } /* returns true if the stripe was handled */ -static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) +static bool handle_stripe(struct stripe_head *sh) { if (sh->raid_conf->level == 6) - return handle_stripe6(sh, tmp_page); + return handle_stripe6(sh); else return handle_stripe5(sh); } - - static void raid5_activate_delayed(raid5_conf_t *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { @@ -4046,7 +4258,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski spin_unlock(&sh->lock); /* wait for any blocked device to be handled */ - while(unlikely(!handle_stripe(sh, NULL))) + while (unlikely(!handle_stripe(sh))) ; release_stripe(sh); @@ -4103,7 +4315,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } - handle_stripe(sh, NULL); + handle_stripe(sh); release_stripe(sh); handled++; } @@ -4117,6 +4329,36 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) return handled; } +#ifdef CONFIG_MULTICORE_RAID456 +static void __process_stripe(void *param, async_cookie_t cookie) +{ + struct stripe_head *sh = param; + + handle_stripe(sh); + release_stripe(sh); +} + +static void process_stripe(struct stripe_head *sh, struct list_head *domain) +{ + async_schedule_domain(__process_stripe, sh, domain); +} + +static void synchronize_stripe_processing(struct list_head *domain) +{ + async_synchronize_full_domain(domain); +} +#else +static void process_stripe(struct stripe_head *sh, struct list_head *domain) +{ + handle_stripe(sh); + release_stripe(sh); + cond_resched(); +} + +static void synchronize_stripe_processing(struct list_head *domain) +{ +} +#endif /* @@ -4131,6 +4373,7 @@ static void raid5d(mddev_t *mddev) struct stripe_head *sh; raid5_conf_t *conf = mddev->private; int handled; + LIST_HEAD(raid_domain); pr_debug("+++ raid5d active\n"); @@ -4167,8 +4410,7 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - handle_stripe(sh, conf->spare_page); - release_stripe(sh); + process_stripe(sh, &raid_domain); spin_lock_irq(&conf->device_lock); } @@ -4176,6 +4418,7 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); + synchronize_stripe_processing(&raid_domain); async_tx_issue_pending_all(); unplug_slaves(mddev); @@ -4308,6 +4551,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void raid5_free_percpu(raid5_conf_t *conf) +{ + struct raid5_percpu *percpu; + unsigned long cpu; + + if (!conf->percpu) + return; + + get_online_cpus(); + for_each_possible_cpu(cpu) { + percpu = per_cpu_ptr(conf->percpu, cpu); + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + } +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu_notifier(&conf->cpu_notify); +#endif + put_online_cpus(); + + free_percpu(conf->percpu); +} + +static void free_conf(raid5_conf_t *conf) +{ + shrink_stripes(conf); + raid5_free_percpu(conf); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); + long cpu = (long)hcpu; + struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (conf->level == 6 && !percpu->spare_page) + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->scribble) + percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + + if (!percpu->scribble || + (conf->level == 6 && !percpu->spare_page)) { + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); + return NOTIFY_BAD; + } + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + percpu->spare_page = NULL; + percpu->scribble = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} +#endif + +static int raid5_alloc_percpu(raid5_conf_t *conf) +{ + unsigned long cpu; + struct page *spare_page; + struct raid5_percpu *allcpus; + void *scribble; + int err; + + allcpus = alloc_percpu(struct raid5_percpu); + if (!allcpus) + return -ENOMEM; + conf->percpu = allcpus; + + get_online_cpus(); + err = 0; + for_each_present_cpu(cpu) { + if (conf->level == 6) { + spare_page = alloc_page(GFP_KERNEL); + if (!spare_page) { + err = -ENOMEM; + break; + } + per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; + } + scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); + if (!scribble) { + err = -ENOMEM; + break; + } + per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; + } +#ifdef CONFIG_HOTPLUG_CPU + conf->cpu_notify.notifier_call = raid456_cpu_notify; + conf->cpu_notify.priority = 0; + if (err == 0) + err = register_cpu_notifier(&conf->cpu_notify); +#endif + put_online_cpus(); + + return err; +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4349,6 +4704,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) goto abort; conf->raid_disks = mddev->raid_disks; + conf->scribble_len = scribble_len(conf->raid_disks); if (mddev->reshape_position == MaxSector) conf->previous_raid_disks = mddev->raid_disks; else @@ -4364,11 +4720,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - if (mddev->new_level == 6) { - conf->spare_page = alloc_page(GFP_KERNEL); - if (!conf->spare_page) - goto abort; - } + conf->level = mddev->new_level; + if (raid5_alloc_percpu(conf) != 0) + goto abort; + spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -4439,11 +4794,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) abort: if (conf) { - shrink_stripes(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); return ERR_PTR(-EIO); } else return ERR_PTR(-ENOMEM); @@ -4613,12 +4964,8 @@ abort: md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { - shrink_stripes(conf); print_raid5_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); } mddev->private = NULL; printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); @@ -4633,13 +4980,10 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); - kfree(conf->disks); - kfree(conf); + free_conf(conf); mddev->private = NULL; return 0; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 9459689c4ea0..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -2,6 +2,7 @@ #define _RAID5_H #include <linux/raid/xor.h> +#include <linux/dmaengine.h> /* * @@ -175,7 +176,9 @@ */ enum check_states { check_state_idle = 0, - check_state_run, /* parity check */ + check_state_run, /* xor parity check */ + check_state_run_q, /* q-parity check */ + check_state_run_pq, /* pq dual parity check */ check_state_check_result, check_state_compute_run, /* parity repair */ check_state_compute_result, @@ -215,8 +218,8 @@ struct stripe_head { * @target - STRIPE_OP_COMPUTE_BLK target */ struct stripe_operations { - int target; - u32 zero_sum_result; + int target, target2; + enum sum_check_flags zero_sum_result; } ops; struct r5dev { struct bio req; @@ -298,7 +301,7 @@ struct r6_state { #define STRIPE_OP_COMPUTE_BLK 1 #define STRIPE_OP_PREXOR 2 #define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_RECONSTRUCT 4 #define STRIPE_OP_CHECK 5 /* @@ -385,8 +388,21 @@ struct raid5_private_data { * (fresh device added). * Cleared when a sync completes. */ - - struct page *spare_page; /* Used when checking P/Q in raid6 */ + /* per cpu variables */ + struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + } *percpu; + size_t scribble_len; /* size of scribble region must be + * associated with conf to handle + * cpu hotplug while reshaping + */ +#ifdef CONFIG_HOTPLUG_CPU + struct notifier_block cpu_notify; +#endif /* * Free stripes pool |