9 files changed, 2414 insertions, 138 deletions
diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c
index 200926699778..bb5b90e8e768 100644
--- a/drivers/bus/arm-cci.c
+++ b/drivers/bus/arm-cci.c
@@ -18,11 +18,21 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 
 #include <asm/cacheflush.h>
+#include <asm/irq_regs.h>
+#include <asm/pmu.h>
 #include <asm/smp_plat.h>
 
+#define DRIVER_NAME		"CCI-400"
+#define DRIVER_NAME_PMU		DRIVER_NAME " PMU"
+#define PMU_NAME		"CCI_400"
+
 #define CCI_PORT_CTRL		0x0
 #define CCI_CTRL_STATUS		0xc
 
@@ -54,6 +64,568 @@ static unsigned int nb_cci_ports;
 static void __iomem *cci_ctrl_base;
 static unsigned long cci_ctrl_phys;
 
+#ifdef CONFIG_HW_PERF_EVENTS
+
+#define CCI_PMCR		0x0100
+#define CCI_PID2		0x0fe8
+
+#define CCI_PMCR_CEN		0x00000001
+#define CCI_PMCR_NCNT_MASK	0x0000f800
+#define CCI_PMCR_NCNT_SHIFT	11
+
+#define CCI_PID2_REV_MASK	0xf0
+#define CCI_PID2_REV_SHIFT	4
+
+/* Port ids */
+#define CCI_PORT_S0	0
+#define CCI_PORT_S1	1
+#define CCI_PORT_S2	2
+#define CCI_PORT_S3	3
+#define CCI_PORT_S4	4
+#define CCI_PORT_M0	5
+#define CCI_PORT_M1	6
+#define CCI_PORT_M2	7
+
+#define CCI_REV_R0		0
+#define CCI_REV_R1		1
+#define CCI_REV_R0_P4		4
+#define CCI_REV_R1_P2		6
+
+#define CCI_PMU_EVT_SEL		0x000
+#define CCI_PMU_CNTR		0x004
+#define CCI_PMU_CNTR_CTRL	0x008
+#define CCI_PMU_OVRFLW		0x00c
+
+#define CCI_PMU_OVRFLW_FLAG	1
+
+#define CCI_PMU_CNTR_BASE(idx)	((idx) * SZ_4K)
+
+/*
+ * Instead of an event id to monitor CCI cycles, a dedicated counter is
+ * provided. Use 0xff to represent CCI cycles and hope that no future revisions
+ * make use of this event in hardware.
+ */
+enum cci400_perf_events {
+	CCI_PMU_CYCLES = 0xff
+};
+
+#define CCI_PMU_EVENT_MASK		0xff
+#define CCI_PMU_EVENT_SOURCE(event)	((event >> 5) & 0x7)
+#define CCI_PMU_EVENT_CODE(event)	(event & 0x1f)
+
+#define CCI_PMU_MAX_HW_EVENTS 5   /* CCI PMU has 4 counters + 1 cycle counter */
+
+#define CCI_PMU_CYCLE_CNTR_IDX		0
+#define CCI_PMU_CNTR0_IDX		1
+#define CCI_PMU_CNTR_LAST(cci_pmu)	(CCI_PMU_CYCLE_CNTR_IDX + cci_pmu->num_events - 1)
+
+/*
+ * CCI PMU event id is an 8-bit value made of two parts - bits 7:5 for one of 8
+ * ports and bits 4:0 are event codes. There are different event codes
+ * associated with each port type.
+ *
+ * Additionally, the range of events associated with the port types changed
+ * between Rev0 and Rev1.
+ *
+ * The constants below define the range of valid codes for each port type for
+ * the different revisions and are used to validate the event to be monitored.
+ */
+
+#define CCI_REV_R0_SLAVE_PORT_MIN_EV	0x00
+#define CCI_REV_R0_SLAVE_PORT_MAX_EV	0x13
+#define CCI_REV_R0_MASTER_PORT_MIN_EV	0x14
+#define CCI_REV_R0_MASTER_PORT_MAX_EV	0x1a
+
+#define CCI_REV_R1_SLAVE_PORT_MIN_EV	0x00
+#define CCI_REV_R1_SLAVE_PORT_MAX_EV	0x14
+#define CCI_REV_R1_MASTER_PORT_MIN_EV	0x00
+#define CCI_REV_R1_MASTER_PORT_MAX_EV	0x11
+
+struct pmu_port_event_ranges {
+	u8 slave_min;
+	u8 slave_max;
+	u8 master_min;
+	u8 master_max;
+};
+
+static struct pmu_port_event_ranges port_event_range[] = {
+	[CCI_REV_R0] = {
+		.slave_min = CCI_REV_R0_SLAVE_PORT_MIN_EV,
+		.slave_max = CCI_REV_R0_SLAVE_PORT_MAX_EV,
+		.master_min = CCI_REV_R0_MASTER_PORT_MIN_EV,
+		.master_max = CCI_REV_R0_MASTER_PORT_MAX_EV,
+	},
+	[CCI_REV_R1] = {
+		.slave_min = CCI_REV_R1_SLAVE_PORT_MIN_EV,
+		.slave_max = CCI_REV_R1_SLAVE_PORT_MAX_EV,
+		.master_min = CCI_REV_R1_MASTER_PORT_MIN_EV,
+		.master_max = CCI_REV_R1_MASTER_PORT_MAX_EV,
+	},
+};
+
+struct cci_pmu_drv_data {
+	void __iomem *base;
+	struct arm_pmu *cci_pmu;
+	int nr_irqs;
+	int irqs[CCI_PMU_MAX_HW_EVENTS];
+	unsigned long active_irqs;
+	struct perf_event *events[CCI_PMU_MAX_HW_EVENTS];
+	unsigned long used_mask[BITS_TO_LONGS(CCI_PMU_MAX_HW_EVENTS)];
+	struct pmu_port_event_ranges *port_ranges;
+	struct pmu_hw_events hw_events;
+};
+static struct cci_pmu_drv_data *pmu;
+
+static bool is_duplicate_irq(int irq, int *irqs, int nr_irqs)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++)
+		if (irq == irqs[i])
+			return true;
+
+	return false;
+}
+
+static int probe_cci_revision(void)
+{
+	int rev;
+	rev = readl_relaxed(cci_ctrl_base + CCI_PID2) & CCI_PID2_REV_MASK;
+	rev >>= CCI_PID2_REV_SHIFT;
+
+	if (rev <= CCI_REV_R0_P4)
+		return CCI_REV_R0;
+	else if (rev <= CCI_REV_R1_P2)
+		return CCI_REV_R1;
+
+	return -ENOENT;
+}
+
+static struct pmu_port_event_ranges *port_range_by_rev(void)
+{
+	int rev = probe_cci_revision();
+
+	if (rev < 0)
+		return NULL;
+
+	return &port_event_range[rev];
+}
+
+static int pmu_is_valid_slave_event(u8 ev_code)
+{
+	return pmu->port_ranges->slave_min <= ev_code &&
+		ev_code <= pmu->port_ranges->slave_max;
+}
+
+static int pmu_is_valid_master_event(u8 ev_code)
+{
+	return pmu->port_ranges->master_min <= ev_code &&
+		ev_code <= pmu->port_ranges->master_max;
+}
+
+static int pmu_validate_hw_event(u8 hw_event)
+{
+	u8 ev_source = CCI_PMU_EVENT_SOURCE(hw_event);
+	u8 ev_code = CCI_PMU_EVENT_CODE(hw_event);
+
+	switch (ev_source) {
+	case CCI_PORT_S0:
+	case CCI_PORT_S1:
+	case CCI_PORT_S2:
+	case CCI_PORT_S3:
+	case CCI_PORT_S4:
+		/* Slave Interface */
+		if (pmu_is_valid_slave_event(ev_code))
+			return hw_event;
+		break;
+	case CCI_PORT_M0:
+	case CCI_PORT_M1:
+	case CCI_PORT_M2:
+		/* Master Interface */
+		if (pmu_is_valid_master_event(ev_code))
+			return hw_event;
+		break;
+	}
+
+	return -ENOENT;
+}
+
+static int pmu_is_valid_counter(struct arm_pmu *cci_pmu, int idx)
+{
+	return CCI_PMU_CYCLE_CNTR_IDX <= idx &&
+		idx <= CCI_PMU_CNTR_LAST(cci_pmu);
+}
+
+static u32 pmu_read_register(int idx, unsigned int offset)
+{
+	return readl_relaxed(pmu->base + CCI_PMU_CNTR_BASE(idx) + offset);
+}
+
+static void pmu_write_register(u32 value, int idx, unsigned int offset)
+{
+	return writel_relaxed(value, pmu->base + CCI_PMU_CNTR_BASE(idx) + offset);
+}
+
+static void pmu_disable_counter(int idx)
+{
+	pmu_write_register(0, idx, CCI_PMU_CNTR_CTRL);
+}
+
+static void pmu_enable_counter(int idx)
+{
+	pmu_write_register(1, idx, CCI_PMU_CNTR_CTRL);
+}
+
+static void pmu_set_event(int idx, unsigned long event)
+{
+	event &= CCI_PMU_EVENT_MASK;
+	pmu_write_register(event, idx, CCI_PMU_EVT_SEL);
+}
+
+static u32 pmu_get_max_counters(void)
+{
+	u32 n_cnts = (readl_relaxed(cci_ctrl_base + CCI_PMCR) &
+		      CCI_PMCR_NCNT_MASK) >> CCI_PMCR_NCNT_SHIFT;
+
+	/* add 1 for cycle counter */
+	return n_cnts + 1;
+}
+
+static struct pmu_hw_events *pmu_get_hw_events(void)
+{
+	return &pmu->hw_events;
+}
+
+static int pmu_get_event_idx(struct pmu_hw_events *hw, struct perf_event *event)
+{
+	struct arm_pmu *cci_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hw_event = &event->hw;
+	unsigned long cci_event = hw_event->config_base & CCI_PMU_EVENT_MASK;
+	int idx;
+
+	if (cci_event == CCI_PMU_CYCLES) {
+		if (test_and_set_bit(CCI_PMU_CYCLE_CNTR_IDX, hw->used_mask))
+			return -EAGAIN;
+
+		return CCI_PMU_CYCLE_CNTR_IDX;
+	}
+
+	for (idx = CCI_PMU_CNTR0_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); ++idx)
+		if (!test_and_set_bit(idx, hw->used_mask))
+			return idx;
+
+	/* No counters available */
+	return -EAGAIN;
+}
+
+static int pmu_map_event(struct perf_event *event)
+{
+	int mapping;
+	u8 config = event->attr.config & CCI_PMU_EVENT_MASK;
+
+	if (event->attr.type < PERF_TYPE_MAX)
+		return -ENOENT;
+
+	if (config == CCI_PMU_CYCLES)
+		mapping = config;
+	else
+		mapping = pmu_validate_hw_event(config);
+
+	return mapping;
+}
+
+static int pmu_request_irq(struct arm_pmu *cci_pmu, irq_handler_t handler)
+{
+	int i;
+	struct platform_device *pmu_device = cci_pmu->plat_device;
+
+	if (unlikely(!pmu_device))
+		return -ENODEV;
+
+	if (pmu->nr_irqs < 1) {
+		dev_err(&pmu_device->dev, "no irqs for CCI PMUs defined\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * Register all available CCI PMU interrupts. In the interrupt handler
+	 * we iterate over the counters checking for interrupt source (the
+	 * overflowing counter) and clear it.
+	 *
+	 * This should allow handling of non-unique interrupt for the counters.
+	 */
+	for (i = 0; i < pmu->nr_irqs; i++) {
+		int err = request_irq(pmu->irqs[i], handler, IRQF_SHARED,
+				"arm-cci-pmu", cci_pmu);
+		if (err) {
+			dev_err(&pmu_device->dev, "unable to request IRQ%d for ARM CCI PMU counters\n",
+				pmu->irqs[i]);
+			return err;
+		}
+
+		set_bit(i, &pmu->active_irqs);
+	}
+
+	return 0;
+}
+
+static irqreturn_t pmu_handle_irq(int irq_num, void *dev)
+{
+	unsigned long flags;
+	struct arm_pmu *cci_pmu = (struct arm_pmu *)dev;
+	struct pmu_hw_events *events = cci_pmu->get_hw_events();
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	int idx, handled = IRQ_NONE;
+
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+	regs = get_irq_regs();
+	/*
+	 * Iterate over counters and update the corresponding perf events.
+	 * This should work regardless of whether we have per-counter overflow
+	 * interrupt or a combined overflow interrupt.
+	 */
+	for (idx = CCI_PMU_CYCLE_CNTR_IDX; idx <= CCI_PMU_CNTR_LAST(cci_pmu); idx++) {
+		struct perf_event *event = events->events[idx];
+		struct hw_perf_event *hw_counter;
+
+		if (!event)
+			continue;
+
+		hw_counter = &event->hw;
+
+		/* Did this counter overflow? */
+		if (!pmu_read_register(idx, CCI_PMU_OVRFLW) & CCI_PMU_OVRFLW_FLAG)
+			continue;
+
+		pmu_write_register(CCI_PMU_OVRFLW_FLAG, idx, CCI_PMU_OVRFLW);
+
+		handled = IRQ_HANDLED;
+
+		armpmu_event_update(event);
+		perf_sample_data_init(&data, 0, hw_counter->last_period);
+		if (!armpmu_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			cci_pmu->disable(event);
+	}
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+
+	return IRQ_RETVAL(handled);
+}
+
+static void pmu_free_irq(struct arm_pmu *cci_pmu)
+{
+	int i;
+
+	for (i = 0; i < pmu->nr_irqs; i++) {
+		if (!test_and_clear_bit(i, &pmu->active_irqs))
+			continue;
+
+		free_irq(pmu->irqs[i], cci_pmu);
+	}
+}
+
+static void pmu_enable_event(struct perf_event *event)
+{
+	unsigned long flags;
+	struct arm_pmu *cci_pmu = to_arm_pmu(event->pmu);
+	struct pmu_hw_events *events = cci_pmu->get_hw_events();
+	struct hw_perf_event *hw_counter = &event->hw;
+	int idx = hw_counter->idx;
+
+	if (unlikely(!pmu_is_valid_counter(cci_pmu, idx))) {
+		dev_err(&cci_pmu->plat_device->dev, "Invalid CCI PMU counter %d\n", idx);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	/* Configure the event to count, unless you are counting cycles */
+	if (idx != CCI_PMU_CYCLE_CNTR_IDX)
+		pmu_set_event(idx, hw_counter->config_base);
+
+	pmu_enable_counter(idx);
+
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static void pmu_disable_event(struct perf_event *event)
+{
+	struct arm_pmu *cci_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hw_counter = &event->hw;
+	int idx = hw_counter->idx;
+
+	if (unlikely(!pmu_is_valid_counter(cci_pmu, idx))) {
+		dev_err(&cci_pmu->plat_device->dev, "Invalid CCI PMU counter %d\n", idx);
+		return;
+	}
+
+	pmu_disable_counter(idx);
+}
+
+static void pmu_start(struct arm_pmu *cci_pmu)
+{
+	u32 val;
+	unsigned long flags;
+	struct pmu_hw_events *events = cci_pmu->get_hw_events();
+
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	/* Enable all the PMU counters. */
+	val = readl_relaxed(cci_ctrl_base + CCI_PMCR) | CCI_PMCR_CEN;
+	writel(val, cci_ctrl_base + CCI_PMCR);
+
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static void pmu_stop(struct arm_pmu *cci_pmu)
+{
+	u32 val;
+	unsigned long flags;
+	struct pmu_hw_events *events = cci_pmu->get_hw_events();
+
+	raw_spin_lock_irqsave(&events->pmu_lock, flags);
+
+	/* Disable all the PMU counters. */
+	val = readl_relaxed(cci_ctrl_base + CCI_PMCR) & ~CCI_PMCR_CEN;
+	writel(val, cci_ctrl_base + CCI_PMCR);
+
+	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
+}
+
+static u32 pmu_read_counter(struct perf_event *event)
+{
+	struct arm_pmu *cci_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hw_counter = &event->hw;
+	int idx = hw_counter->idx;
+	u32 value;
+
+	if (unlikely(!pmu_is_valid_counter(cci_pmu, idx))) {
+		dev_err(&cci_pmu->plat_device->dev, "Invalid CCI PMU counter %d\n", idx);
+		return 0;
+	}
+	value = pmu_read_register(idx, CCI_PMU_CNTR);
+
+	return value;
+}
+
+static void pmu_write_counter(struct perf_event *event, u32 value)
+{
+	struct arm_pmu *cci_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hw_counter = &event->hw;
+	int idx = hw_counter->idx;
+
+	if (unlikely(!pmu_is_valid_counter(cci_pmu, idx)))
+		dev_err(&cci_pmu->plat_device->dev, "Invalid CCI PMU counter %d\n", idx);
+	else
+		pmu_write_register(value, idx, CCI_PMU_CNTR);
+}
+
+static int cci_pmu_init(struct arm_pmu *cci_pmu, struct platform_device *pdev)
+{
+	*cci_pmu = (struct arm_pmu){
+		.name             = PMU_NAME,
+		.max_period       = (1LLU << 32) - 1,
+		.get_hw_events    = pmu_get_hw_events,
+		.get_event_idx    = pmu_get_event_idx,
+		.map_event        = pmu_map_event,
+		.request_irq      = pmu_request_irq,
+		.handle_irq       = pmu_handle_irq,
+		.free_irq         = pmu_free_irq,
+		.enable           = pmu_enable_event,
+		.disable          = pmu_disable_event,
+		.start            = pmu_start,
+		.stop             = pmu_stop,
+		.read_counter     = pmu_read_counter,
+		.write_counter    = pmu_write_counter,
+	};
+
+	cci_pmu->plat_device = pdev;
+	cci_pmu->num_events = pmu_get_max_counters();
+
+	return armpmu_register(cci_pmu, -1);
+}
+
+static const struct of_device_id arm_cci_pmu_matches[] = {
+	{
+		.compatible = "arm,cci-400-pmu",
+	},
+	{},
+};
+
+static int cci_pmu_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int i, ret, irq;
+
+	pmu = devm_kzalloc(&pdev->dev, sizeof(*pmu), GFP_KERNEL);
+	if (!pmu)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	pmu->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(pmu->base))
+		return -ENOMEM;
+
+	/*
+	 * CCI PMU has 5 overflow signals - one per counter; but some may be tied
+	 * together to a common interrupt.
+	 */
+	pmu->nr_irqs = 0;
+	for (i = 0; i < CCI_PMU_MAX_HW_EVENTS; i++) {
+		irq = platform_get_irq(pdev, i);
+		if (irq < 0)
+			break;
+
+		if (is_duplicate_irq(irq, pmu->irqs, pmu->nr_irqs))
+			continue;
+
+		pmu->irqs[pmu->nr_irqs++] = irq;
+	}
+
+	/*
+	 * Ensure that the device tree has as many interrupts as the number
+	 * of counters.
+	 */
+	if (i < CCI_PMU_MAX_HW_EVENTS) {
+		dev_warn(&pdev->dev, "In-correct number of interrupts: %d, should be %d\n",
+			i, CCI_PMU_MAX_HW_EVENTS);
+		return -EINVAL;
+	}
+
+	pmu->port_ranges = port_range_by_rev();
+	if (!pmu->port_ranges) {
+		dev_warn(&pdev->dev, "CCI PMU version not supported\n");
+		return -EINVAL;
+	}
+
+	pmu->cci_pmu = devm_kzalloc(&pdev->dev, sizeof(*(pmu->cci_pmu)), GFP_KERNEL);
+	if (!pmu->cci_pmu)
+		return -ENOMEM;
+
+	pmu->hw_events.events = pmu->events;
+	pmu->hw_events.used_mask = pmu->used_mask;
+	raw_spin_lock_init(&pmu->hw_events.pmu_lock);
+
+	ret = cci_pmu_init(pmu->cci_pmu, pdev);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int cci_platform_probe(struct platform_device *pdev)
+{
+	if (!cci_probed())
+		return -ENODEV;
+
+	return of_platform_populate(pdev->dev.of_node, NULL, NULL, &pdev->dev);
+}
+
+#endif /* CONFIG_HW_PERF_EVENTS */
+
 struct cpu_port {
 	u64 mpidr;
 	u32 port;
@@ -120,7 +692,7 @@ int cci_ace_get_port(struct device_node *dn)
 }
 EXPORT_SYMBOL_GPL(cci_ace_get_port);
 
-static void __init cci_ace_init_ports(void)
+static void cci_ace_init_ports(void)
 {
 	int port, cpu;
 	struct device_node *cpun;
@@ -386,7 +958,7 @@ static const struct of_device_id arm_cci_ctrl_if_matches[] = {
 	{},
 };
 
-static int __init cci_probe(void)
+static int cci_probe(void)
 {
 	struct cci_nb_ports const *cci_config;
 	int ret, i, nb_ace = 0, nb_ace_lite = 0;
@@ -490,7 +1062,7 @@ memalloc_err:
 static int cci_init_status = -EAGAIN;
 static DEFINE_MUTEX(cci_probing);
 
-static int __init cci_init(void)
+static int cci_init(void)
 {
 	if (cci_init_status != -EAGAIN)
 		return cci_init_status;
@@ -502,18 +1074,55 @@ static int __init cci_init(void)
 	return cci_init_status;
 }
 
+#ifdef CONFIG_HW_PERF_EVENTS
+static struct platform_driver cci_pmu_driver = {
+	.driver = {
+		   .name = DRIVER_NAME_PMU,
+		   .of_match_table = arm_cci_pmu_matches,
+		  },
+	.probe = cci_pmu_probe,
+};
+
+static struct platform_driver cci_platform_driver = {
+	.driver = {
+		   .name = DRIVER_NAME,
+		   .of_match_table = arm_cci_matches,
+		  },
+	.probe = cci_platform_probe,
+};
+
+static int __init cci_platform_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&cci_pmu_driver);
+	if (ret)
+		return ret;
+
+	return platform_driver_register(&cci_platform_driver);
+}
+
+#else
+
+static int __init cci_platform_init(void)
+{
+	return 0;
+}
+
+#endif
 /*
  * To sort out early init calls ordering a helper function is provided to
  * check if the CCI driver has beed initialized. Function check if the driver
  * has been initialized, if not it calls the init function that probes
  * the driver and updates the return value.
  */
-bool __init cci_probed(void)
+bool cci_probed(void)
 {
 	return cci_init() == 0;
 }
 EXPORT_SYMBOL_GPL(cci_probed);
 
 early_initcall(cci_init);
+core_initcall(cci_platform_init);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("ARM CCI support");
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 56fe803adcb1..c61a6eccf169 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -154,6 +154,18 @@ config TEGRA20_APB_DMA
 	  This DMA controller transfers data from memory to peripheral fifo
 	  or vice versa. It does not support memory to memory data transfer.
 
+config S3C24XX_DMAC
+	tristate "Samsung S3C24XX DMA support"
+	depends on ARCH_S3C24XX && !S3C24XX_DMA
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support for the Samsung S3C24XX DMA controller driver. The
+	  DMA controller is having multiple DMA channels which can be
+	  configured for different peripherals like audio, UART, SPI.
+	  The DMA controller can transfer data from memory to peripheral,
+	  periphal to memory, periphal to periphal and memory to memory.
+
 source "drivers/dma/sh/Kconfig"
 
 config COH901318
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index db89035b3626..0ce2da97e429 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
 obj-$(CONFIG_TI_EDMA) += edma.o
 obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o
 obj-$(CONFIG_TEGRA20_APB_DMA) += tegra20-apb-dma.o
+obj-$(CONFIG_S3C24XX_DMAC) += s3c24xx-dma.o
 obj-$(CONFIG_PL330_DMA) += pl330.o
 obj-$(CONFIG_PCH_DMA) += pch_dma.o
 obj-$(CONFIG_AMBA_PL08X) += amba-pl08x.o
diff --git a/drivers/dma/s3c24xx-dma.c b/drivers/dma/s3c24xx-dma.c
new file mode 100644
index 000000000000..4cb127978636
--- /dev/null
+++ b/drivers/dma/s3c24xx-dma.c
@@ -0,0 +1,1350 @@
+/*
+ * S3C24XX DMA handling
+ *
+ * Copyright (c) 2013 Heiko Stuebner <heiko@sntech.de>
+ *
+ * based on amba-pl08x.c
+ *
+ * Copyright (c) 2006 ARM Ltd.
+ * Copyright (c) 2010 ST-Ericsson SA
+ *
+ * Author: Peter Pearse <peter.pearse@arm.com>
+ * Author: Linus Walleij <linus.walleij@stericsson.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * The DMA controllers in S3C24XX SoCs have a varying number of DMA signals
+ * that can be routed to any of the 4 to 8 hardware-channels.
+ *
+ * Therefore on these DMA controllers the number of channels
+ * and the number of incoming DMA signals are two totally different things.
+ * It is usually not possible to theoretically handle all physical signals,
+ * so a multiplexing scheme with possible denial of use is necessary.
+ *
+ * Open items:
+ * - bursts
+ */
+
+#include <linux/platform_device.h>
+#include <linux/types.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/platform_data/dma-s3c24xx.h>
+
+#include "dmaengine.h"
+#include "virt-dma.h"
+
+#define MAX_DMA_CHANNELS	8
+
+#define S3C24XX_DISRC			0x00
+#define S3C24XX_DISRCC			0x04
+#define S3C24XX_DISRCC_INC_INCREMENT	0
+#define S3C24XX_DISRCC_INC_FIXED	BIT(0)
+#define S3C24XX_DISRCC_LOC_AHB		0
+#define S3C24XX_DISRCC_LOC_APB		BIT(1)
+
+#define S3C24XX_DIDST			0x08
+#define S3C24XX_DIDSTC			0x0c
+#define S3C24XX_DIDSTC_INC_INCREMENT	0
+#define S3C24XX_DIDSTC_INC_FIXED	BIT(0)
+#define S3C24XX_DIDSTC_LOC_AHB		0
+#define S3C24XX_DIDSTC_LOC_APB		BIT(1)
+#define S3C24XX_DIDSTC_INT_TC0		0
+#define S3C24XX_DIDSTC_INT_RELOAD	BIT(2)
+
+#define S3C24XX_DCON			0x10
+
+#define S3C24XX_DCON_TC_MASK		0xfffff
+#define S3C24XX_DCON_DSZ_BYTE		(0 << 20)
+#define S3C24XX_DCON_DSZ_HALFWORD	(1 << 20)
+#define S3C24XX_DCON_DSZ_WORD		(2 << 20)
+#define S3C24XX_DCON_DSZ_MASK		(3 << 20)
+#define S3C24XX_DCON_DSZ_SHIFT		20
+#define S3C24XX_DCON_AUTORELOAD		0
+#define S3C24XX_DCON_NORELOAD		BIT(22)
+#define S3C24XX_DCON_HWTRIG		BIT(23)
+#define S3C24XX_DCON_HWSRC_SHIFT	24
+#define S3C24XX_DCON_SERV_SINGLE	0
+#define S3C24XX_DCON_SERV_WHOLE		BIT(27)
+#define S3C24XX_DCON_TSZ_UNIT		0
+#define S3C24XX_DCON_TSZ_BURST4		BIT(28)
+#define S3C24XX_DCON_INT		BIT(29)
+#define S3C24XX_DCON_SYNC_PCLK		0
+#define S3C24XX_DCON_SYNC_HCLK		BIT(30)
+#define S3C24XX_DCON_DEMAND		0
+#define S3C24XX_DCON_HANDSHAKE		BIT(31)
+
+#define S3C24XX_DSTAT			0x14
+#define S3C24XX_DSTAT_STAT_BUSY		BIT(20)
+#define S3C24XX_DSTAT_CURRTC_MASK	0xfffff
+
+#define S3C24XX_DMASKTRIG		0x20
+#define S3C24XX_DMASKTRIG_SWTRIG	BIT(0)
+#define S3C24XX_DMASKTRIG_ON		BIT(1)
+#define S3C24XX_DMASKTRIG_STOP		BIT(2)
+
+#define S3C24XX_DMAREQSEL		0x24
+#define S3C24XX_DMAREQSEL_HW		BIT(0)
+
+/*
+ * S3C2410, S3C2440 and S3C2442 SoCs cannot select any physical channel
+ * for a DMA source. Instead only specific channels are valid.
+ * All of these SoCs have 4 physical channels and the number of request
+ * source bits is 3. Additionally we also need 1 bit to mark the channel
+ * as valid.
+ * Therefore we separate the chansel element of the channel data into 4
+ * parts of 4 bits each, to hold the information if the channel is valid
+ * and the hw request source to use.
+ *
+ * Example:
+ * SDI is valid on channels 0, 2 and 3 - with varying hw request sources.
+ * For it the chansel field would look like
+ *
+ * ((BIT(3) | 1) << 3 * 4) | // channel 3, with request source 1
+ * ((BIT(3) | 2) << 2 * 4) | // channel 2, with request source 2
+ * ((BIT(3) | 2) << 0 * 4)   // channel 0, with request source 2
+ */
+#define S3C24XX_CHANSEL_WIDTH		4
+#define S3C24XX_CHANSEL_VALID		BIT(3)
+#define S3C24XX_CHANSEL_REQ_MASK	7
+
+/*
+ * struct soc_data - vendor-specific config parameters for individual SoCs
+ * @stride: spacing between the registers of each channel
+ * @has_reqsel: does the controller use the newer requestselection mechanism
+ * @has_clocks: are controllable dma-clocks present
+ */
+struct soc_data {
+	int stride;
+	bool has_reqsel;
+	bool has_clocks;
+};
+
+/*
+ * enum s3c24xx_dma_chan_state - holds the virtual channel states
+ * @S3C24XX_DMA_CHAN_IDLE: the channel is idle
+ * @S3C24XX_DMA_CHAN_RUNNING: the channel has allocated a physical transport
+ * channel and is running a transfer on it
+ * @S3C24XX_DMA_CHAN_WAITING: the channel is waiting for a physical transport
+ * channel to become available (only pertains to memcpy channels)
+ */
+enum s3c24xx_dma_chan_state {
+	S3C24XX_DMA_CHAN_IDLE,
+	S3C24XX_DMA_CHAN_RUNNING,
+	S3C24XX_DMA_CHAN_WAITING,
+};
+
+/*
+ * struct s3c24xx_sg - structure containing data per sg
+ * @src_addr: src address of sg
+ * @dst_addr: dst address of sg
+ * @len: transfer len in bytes
+ * @node: node for txd's dsg_list
+ */
+struct s3c24xx_sg {
+	dma_addr_t src_addr;
+	dma_addr_t dst_addr;
+	size_t len;
+	struct list_head node;
+};
+
+/*
+ * struct s3c24xx_txd - wrapper for struct dma_async_tx_descriptor
+ * @vd: virtual DMA descriptor
+ * @dsg_list: list of children sg's
+ * @at: sg currently being transfered
+ * @width: transfer width
+ * @disrcc: value for source control register
+ * @didstc: value for destination control register
+ * @dcon: base value for dcon register
+ */
+struct s3c24xx_txd {
+	struct virt_dma_desc vd;
+	struct list_head dsg_list;
+	struct list_head *at;
+	u8 width;
+	u32 disrcc;
+	u32 didstc;
+	u32 dcon;
+};
+
+struct s3c24xx_dma_chan;
+
+/*
+ * struct s3c24xx_dma_phy - holder for the physical channels
+ * @id: physical index to this channel
+ * @valid: does the channel have all required elements
+ * @base: virtual memory base (remapped) for the this channel
+ * @irq: interrupt for this channel
+ * @clk: clock for this channel
+ * @lock: a lock to use when altering an instance of this struct
+ * @serving: virtual channel currently being served by this physicalchannel
+ * @host: a pointer to the host (internal use)
+ */
+struct s3c24xx_dma_phy {
+	unsigned int			id;
+	bool				valid;
+	void __iomem			*base;
+	unsigned int			irq;
+	struct clk			*clk;
+	spinlock_t			lock;
+	struct s3c24xx_dma_chan		*serving;
+	struct s3c24xx_dma_engine	*host;
+};
+
+/*
+ * struct s3c24xx_dma_chan - this structure wraps a DMA ENGINE channel
+ * @id: the id of the channel
+ * @name: name of the channel
+ * @vc: wrappped virtual channel
+ * @phy: the physical channel utilized by this channel, if there is one
+ * @runtime_addr: address for RX/TX according to the runtime config
+ * @at: active transaction on this channel
+ * @lock: a lock for this channel data
+ * @host: a pointer to the host (internal use)
+ * @state: whether the channel is idle, running etc
+ * @slave: whether this channel is a device (slave) or for memcpy
+ */
+struct s3c24xx_dma_chan {
+	int id;
+	const char *name;
+	struct virt_dma_chan vc;
+	struct s3c24xx_dma_phy *phy;
+	struct dma_slave_config cfg;
+	struct s3c24xx_txd *at;
+	struct s3c24xx_dma_engine *host;
+	enum s3c24xx_dma_chan_state state;
+	bool slave;
+};
+
+/*
+ * struct s3c24xx_dma_engine - the local state holder for the S3C24XX
+ * @pdev: the corresponding platform device
+ * @pdata: platform data passed in from the platform/machine
+ * @base: virtual memory base (remapped)
+ * @slave: slave engine for this instance
+ * @memcpy: memcpy engine for this instance
+ * @phy_chans: array of data for the physical channels
+ */
+struct s3c24xx_dma_engine {
+	struct platform_device			*pdev;
+	const struct s3c24xx_dma_platdata	*pdata;
+	struct soc_data				*sdata;
+	void __iomem				*base;
+	struct dma_device			slave;
+	struct dma_device			memcpy;
+	struct s3c24xx_dma_phy			*phy_chans;
+};
+
+/*
+ * Physical channel handling
+ */
+
+/*
+ * Check whether a certain channel is busy or not.
+ */
+static int s3c24xx_dma_phy_busy(struct s3c24xx_dma_phy *phy)
+{
+	unsigned int val = readl(phy->base + S3C24XX_DSTAT);
+	return val & S3C24XX_DSTAT_STAT_BUSY;
+}
+
+static bool s3c24xx_dma_phy_valid(struct s3c24xx_dma_chan *s3cchan,
+				  struct s3c24xx_dma_phy *phy)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	const struct s3c24xx_dma_platdata *pdata = s3cdma->pdata;
+	struct s3c24xx_dma_channel *cdata = &pdata->channels[s3cchan->id];
+	int phyvalid;
+
+	/* every phy is valid for memcopy channels */
+	if (!s3cchan->slave)
+		return true;
+
+	/* On newer variants all phys can be used for all virtual channels */
+	if (s3cdma->sdata->has_reqsel)
+		return true;
+
+	phyvalid = (cdata->chansel >> (phy->id * S3C24XX_CHANSEL_WIDTH));
+	return (phyvalid & S3C24XX_CHANSEL_VALID) ? true : false;
+}
+
+/*
+ * Allocate a physical channel for a virtual channel
+ *
+ * Try to locate a physical channel to be used for this transfer. If all
+ * are taken return NULL and the requester will have to cope by using
+ * some fallback PIO mode or retrying later.
+ */
+static
+struct s3c24xx_dma_phy *s3c24xx_dma_get_phy(struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	const struct s3c24xx_dma_platdata *pdata = s3cdma->pdata;
+	struct s3c24xx_dma_channel *cdata;
+	struct s3c24xx_dma_phy *phy = NULL;
+	unsigned long flags;
+	int i;
+	int ret;
+
+	if (s3cchan->slave)
+		cdata = &pdata->channels[s3cchan->id];
+
+	for (i = 0; i < s3cdma->pdata->num_phy_channels; i++) {
+		phy = &s3cdma->phy_chans[i];
+
+		if (!phy->valid)
+			continue;
+
+		if (!s3c24xx_dma_phy_valid(s3cchan, phy))
+			continue;
+
+		spin_lock_irqsave(&phy->lock, flags);
+
+		if (!phy->serving) {
+			phy->serving = s3cchan;
+			spin_unlock_irqrestore(&phy->lock, flags);
+			break;
+		}
+
+		spin_unlock_irqrestore(&phy->lock, flags);
+	}
+
+	/* No physical channel available, cope with it */
+	if (i == s3cdma->pdata->num_phy_channels) {
+		dev_warn(&s3cdma->pdev->dev, "no phy channel available\n");
+		return NULL;
+	}
+
+	/* start the phy clock */
+	if (s3cdma->sdata->has_clocks) {
+		ret = clk_enable(phy->clk);
+		if (ret) {
+			dev_err(&s3cdma->pdev->dev, "could not enable clock for channel %d, err %d\n",
+				phy->id, ret);
+			phy->serving = NULL;
+			return NULL;
+		}
+	}
+
+	return phy;
+}
+
+/*
+ * Mark the physical channel as free.
+ *
+ * This drops the link between the physical and virtual channel.
+ */
+static inline void s3c24xx_dma_put_phy(struct s3c24xx_dma_phy *phy)
+{
+	struct s3c24xx_dma_engine *s3cdma = phy->host;
+
+	if (s3cdma->sdata->has_clocks)
+		clk_disable(phy->clk);
+
+	phy->serving = NULL;
+}
+
+/*
+ * Stops the channel by writing the stop bit.
+ * This should not be used for an on-going transfer, but as a method of
+ * shutting down a channel (eg, when it's no longer used) or terminating a
+ * transfer.
+ */
+static void s3c24xx_dma_terminate_phy(struct s3c24xx_dma_phy *phy)
+{
+	writel(S3C24XX_DMASKTRIG_STOP, phy->base + S3C24XX_DMASKTRIG);
+}
+
+/*
+ * Virtual channel handling
+ */
+
+static inline
+struct s3c24xx_dma_chan *to_s3c24xx_dma_chan(struct dma_chan *chan)
+{
+	return container_of(chan, struct s3c24xx_dma_chan, vc.chan);
+}
+
+static u32 s3c24xx_dma_getbytes_chan(struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_phy *phy = s3cchan->phy;
+	struct s3c24xx_txd *txd = s3cchan->at;
+	u32 tc = readl(phy->base + S3C24XX_DSTAT) & S3C24XX_DSTAT_CURRTC_MASK;
+
+	return tc * txd->width;
+}
+
+static int s3c24xx_dma_set_runtime_config(struct s3c24xx_dma_chan *s3cchan,
+				  struct dma_slave_config *config)
+{
+	if (!s3cchan->slave)
+		return -EINVAL;
+
+	/* Reject definitely invalid configurations */
+	if (config->src_addr_width == DMA_SLAVE_BUSWIDTH_8_BYTES ||
+	    config->dst_addr_width == DMA_SLAVE_BUSWIDTH_8_BYTES)
+		return -EINVAL;
+
+	s3cchan->cfg = *config;
+
+	return 0;
+}
+
+/*
+ * Transfer handling
+ */
+
+static inline
+struct s3c24xx_txd *to_s3c24xx_txd(struct dma_async_tx_descriptor *tx)
+{
+	return container_of(tx, struct s3c24xx_txd, vd.tx);
+}
+
+static struct s3c24xx_txd *s3c24xx_dma_get_txd(void)
+{
+	struct s3c24xx_txd *txd = kzalloc(sizeof(*txd), GFP_NOWAIT);
+
+	if (txd) {
+		INIT_LIST_HEAD(&txd->dsg_list);
+		txd->dcon = S3C24XX_DCON_INT | S3C24XX_DCON_NORELOAD;
+	}
+
+	return txd;
+}
+
+static void s3c24xx_dma_free_txd(struct s3c24xx_txd *txd)
+{
+	struct s3c24xx_sg *dsg, *_dsg;
+
+	list_for_each_entry_safe(dsg, _dsg, &txd->dsg_list, node) {
+		list_del(&dsg->node);
+		kfree(dsg);
+	}
+
+	kfree(txd);
+}
+
+static void s3c24xx_dma_start_next_sg(struct s3c24xx_dma_chan *s3cchan,
+				       struct s3c24xx_txd *txd)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	struct s3c24xx_dma_phy *phy = s3cchan->phy;
+	const struct s3c24xx_dma_platdata *pdata = s3cdma->pdata;
+	struct s3c24xx_sg *dsg = list_entry(txd->at, struct s3c24xx_sg, node);
+	u32 dcon = txd->dcon;
+	u32 val;
+
+	/* transfer-size and -count from len and width */
+	switch (txd->width) {
+	case 1:
+		dcon |= S3C24XX_DCON_DSZ_BYTE | dsg->len;
+		break;
+	case 2:
+		dcon |= S3C24XX_DCON_DSZ_HALFWORD | (dsg->len / 2);
+		break;
+	case 4:
+		dcon |= S3C24XX_DCON_DSZ_WORD | (dsg->len / 4);
+		break;
+	}
+
+	if (s3cchan->slave) {
+		struct s3c24xx_dma_channel *cdata =
+					&pdata->channels[s3cchan->id];
+
+		if (s3cdma->sdata->has_reqsel) {
+			writel_relaxed((cdata->chansel << 1) |
+							S3C24XX_DMAREQSEL_HW,
+					phy->base + S3C24XX_DMAREQSEL);
+		} else {
+			int csel = cdata->chansel >> (phy->id *
+							S3C24XX_CHANSEL_WIDTH);
+
+			csel &= S3C24XX_CHANSEL_REQ_MASK;
+			dcon |= csel << S3C24XX_DCON_HWSRC_SHIFT;
+			dcon |= S3C24XX_DCON_HWTRIG;
+		}
+	} else {
+		if (s3cdma->sdata->has_reqsel)
+			writel_relaxed(0, phy->base + S3C24XX_DMAREQSEL);
+	}
+
+	writel_relaxed(dsg->src_addr, phy->base + S3C24XX_DISRC);
+	writel_relaxed(txd->disrcc, phy->base + S3C24XX_DISRCC);
+	writel_relaxed(dsg->dst_addr, phy->base + S3C24XX_DIDST);
+	writel_relaxed(txd->didstc, phy->base + S3C24XX_DIDSTC);
+	writel_relaxed(dcon, phy->base + S3C24XX_DCON);
+
+	val = readl_relaxed(phy->base + S3C24XX_DMASKTRIG);
+	val &= ~S3C24XX_DMASKTRIG_STOP;
+	val |= S3C24XX_DMASKTRIG_ON;
+
+	/* trigger the dma operation for memcpy transfers */
+	if (!s3cchan->slave)
+		val |= S3C24XX_DMASKTRIG_SWTRIG;
+
+	writel(val, phy->base + S3C24XX_DMASKTRIG);
+}
+
+/*
+ * Set the initial DMA register values and start first sg.
+ */
+static void s3c24xx_dma_start_next_txd(struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_phy *phy = s3cchan->phy;
+	struct virt_dma_desc *vd = vchan_next_desc(&s3cchan->vc);
+	struct s3c24xx_txd *txd = to_s3c24xx_txd(&vd->tx);
+
+	list_del(&txd->vd.node);
+
+	s3cchan->at = txd;
+
+	/* Wait for channel inactive */
+	while (s3c24xx_dma_phy_busy(phy))
+		cpu_relax();
+
+	/* point to the first element of the sg list */
+	txd->at = txd->dsg_list.next;
+	s3c24xx_dma_start_next_sg(s3cchan, txd);
+}
+
+static void s3c24xx_dma_free_txd_list(struct s3c24xx_dma_engine *s3cdma,
+				struct s3c24xx_dma_chan *s3cchan)
+{
+	LIST_HEAD(head);
+
+	vchan_get_all_descriptors(&s3cchan->vc, &head);
+	vchan_dma_desc_free_list(&s3cchan->vc, &head);
+}
+
+/*
+ * Try to allocate a physical channel.  When successful, assign it to
+ * this virtual channel, and initiate the next descriptor.  The
+ * virtual channel lock must be held at this point.
+ */
+static void s3c24xx_dma_phy_alloc_and_start(struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	struct s3c24xx_dma_phy *phy;
+
+	phy = s3c24xx_dma_get_phy(s3cchan);
+	if (!phy) {
+		dev_dbg(&s3cdma->pdev->dev, "no physical channel available for xfer on %s\n",
+			s3cchan->name);
+		s3cchan->state = S3C24XX_DMA_CHAN_WAITING;
+		return;
+	}
+
+	dev_dbg(&s3cdma->pdev->dev, "allocated physical channel %d for xfer on %s\n",
+		phy->id, s3cchan->name);
+
+	s3cchan->phy = phy;
+	s3cchan->state = S3C24XX_DMA_CHAN_RUNNING;
+
+	s3c24xx_dma_start_next_txd(s3cchan);
+}
+
+static void s3c24xx_dma_phy_reassign_start(struct s3c24xx_dma_phy *phy,
+	struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+
+	dev_dbg(&s3cdma->pdev->dev, "reassigned physical channel %d for xfer on %s\n",
+		phy->id, s3cchan->name);
+
+	/*
+	 * We do this without taking the lock; we're really only concerned
+	 * about whether this pointer is NULL or not, and we're guaranteed
+	 * that this will only be called when it _already_ is non-NULL.
+	 */
+	phy->serving = s3cchan;
+	s3cchan->phy = phy;
+	s3cchan->state = S3C24XX_DMA_CHAN_RUNNING;
+	s3c24xx_dma_start_next_txd(s3cchan);
+}
+
+/*
+ * Free a physical DMA channel, potentially reallocating it to another
+ * virtual channel if we have any pending.
+ */
+static void s3c24xx_dma_phy_free(struct s3c24xx_dma_chan *s3cchan)
+{
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	struct s3c24xx_dma_chan *p, *next;
+
+retry:
+	next = NULL;
+
+	/* Find a waiting virtual channel for the next transfer. */
+	list_for_each_entry(p, &s3cdma->memcpy.channels, vc.chan.device_node)
+		if (p->state == S3C24XX_DMA_CHAN_WAITING) {
+			next = p;
+			break;
+		}
+
+	if (!next) {
+		list_for_each_entry(p, &s3cdma->slave.channels,
+				    vc.chan.device_node)
+			if (p->state == S3C24XX_DMA_CHAN_WAITING &&
+				      s3c24xx_dma_phy_valid(p, s3cchan->phy)) {
+				next = p;
+				break;
+			}
+	}
+
+	/* Ensure that the physical channel is stopped */
+	s3c24xx_dma_terminate_phy(s3cchan->phy);
+
+	if (next) {
+		bool success;
+
+		/*
+		 * Eww.  We know this isn't going to deadlock
+		 * but lockdep probably doesn't.
+		 */
+		spin_lock(&next->vc.lock);
+		/* Re-check the state now that we have the lock */
+		success = next->state == S3C24XX_DMA_CHAN_WAITING;
+		if (success)
+			s3c24xx_dma_phy_reassign_start(s3cchan->phy, next);
+		spin_unlock(&next->vc.lock);
+
+		/* If the state changed, try to find another channel */
+		if (!success)
+			goto retry;
+	} else {
+		/* No more jobs, so free up the physical channel */
+		s3c24xx_dma_put_phy(s3cchan->phy);
+	}
+
+	s3cchan->phy = NULL;
+	s3cchan->state = S3C24XX_DMA_CHAN_IDLE;
+}
+
+static void s3c24xx_dma_unmap_buffers(struct s3c24xx_txd *txd)
+{
+	struct device *dev = txd->vd.tx.chan->device->dev;
+	struct s3c24xx_sg *dsg;
+
+	if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+		if (txd->vd.tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+			list_for_each_entry(dsg, &txd->dsg_list, node)
+				dma_unmap_single(dev, dsg->src_addr, dsg->len,
+						DMA_TO_DEVICE);
+		else {
+			list_for_each_entry(dsg, &txd->dsg_list, node)
+				dma_unmap_page(dev, dsg->src_addr, dsg->len,
+						DMA_TO_DEVICE);
+		}
+	}
+
+	if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+		if (txd->vd.tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+			list_for_each_entry(dsg, &txd->dsg_list, node)
+				dma_unmap_single(dev, dsg->dst_addr, dsg->len,
+						DMA_FROM_DEVICE);
+		else
+			list_for_each_entry(dsg, &txd->dsg_list, node)
+				dma_unmap_page(dev, dsg->dst_addr, dsg->len,
+						DMA_FROM_DEVICE);
+	}
+}
+
+static void s3c24xx_dma_desc_free(struct virt_dma_desc *vd)
+{
+	struct s3c24xx_txd *txd = to_s3c24xx_txd(&vd->tx);
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(vd->tx.chan);
+
+	if (!s3cchan->slave)
+		s3c24xx_dma_unmap_buffers(txd);
+
+	s3c24xx_dma_free_txd(txd);
+}
+
+static irqreturn_t s3c24xx_dma_irq(int irq, void *data)
+{
+	struct s3c24xx_dma_phy *phy = data;
+	struct s3c24xx_dma_chan *s3cchan = phy->serving;
+	struct s3c24xx_txd *txd;
+
+	dev_dbg(&phy->host->pdev->dev, "interrupt on channel %d\n", phy->id);
+
+	/*
+	 * Interrupts happen to notify the completion of a transfer and the
+	 * channel should have moved into its stop state already on its own.
+	 * Therefore interrupts on channels not bound to a virtual channel
+	 * should never happen. Nevertheless send a terminate command to the
+	 * channel if the unlikely case happens.
+	 */
+	if (unlikely(!s3cchan)) {
+		dev_err(&phy->host->pdev->dev, "interrupt on unused channel %d\n",
+			phy->id);
+
+		s3c24xx_dma_terminate_phy(phy);
+
+		return IRQ_HANDLED;
+	}
+
+	spin_lock(&s3cchan->vc.lock);
+	txd = s3cchan->at;
+	if (txd) {
+		/* when more sg's are in this txd, start the next one */
+		if (!list_is_last(txd->at, &txd->dsg_list)) {
+			txd->at = txd->at->next;
+			s3c24xx_dma_start_next_sg(s3cchan, txd);
+		} else {
+			s3cchan->at = NULL;
+			vchan_cookie_complete(&txd->vd);
+
+			/*
+			 * And start the next descriptor (if any),
+			 * otherwise free this channel.
+			 */
+			if (vchan_next_desc(&s3cchan->vc))
+				s3c24xx_dma_start_next_txd(s3cchan);
+			else
+				s3c24xx_dma_phy_free(s3cchan);
+		}
+	}
+	spin_unlock(&s3cchan->vc.lock);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * The DMA ENGINE API
+ */
+
+static int s3c24xx_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
+			 unsigned long arg)
+{
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&s3cchan->vc.lock, flags);
+
+	switch (cmd) {
+	case DMA_SLAVE_CONFIG:
+		ret = s3c24xx_dma_set_runtime_config(s3cchan,
+					      (struct dma_slave_config *)arg);
+		break;
+	case DMA_TERMINATE_ALL:
+		if (!s3cchan->phy && !s3cchan->at) {
+			dev_err(&s3cdma->pdev->dev, "trying to terminate already stopped channel %d\n",
+				s3cchan->id);
+			ret = -EINVAL;
+			break;
+		}
+
+		s3cchan->state = S3C24XX_DMA_CHAN_IDLE;
+
+		 /* Mark physical channel as free */
+		if (s3cchan->phy)
+			s3c24xx_dma_phy_free(s3cchan);
+
+		/* Dequeue current job */
+		if (s3cchan->at) {
+			s3c24xx_dma_desc_free(&s3cchan->at->vd);
+			s3cchan->at = NULL;
+		}
+
+		/* Dequeue jobs not yet fired as well */
+		s3c24xx_dma_free_txd_list(s3cdma, s3cchan);
+		break;
+	default:
+		/* Unknown command */
+		ret = -ENXIO;
+		break;
+	}
+
+	spin_unlock_irqrestore(&s3cchan->vc.lock, flags);
+
+	return ret;
+}
+
+static int s3c24xx_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+	return 0;
+}
+
+static void s3c24xx_dma_free_chan_resources(struct dma_chan *chan)
+{
+	/* Ensure all queued descriptors are freed */
+	vchan_free_chan_resources(to_virt_chan(chan));
+}
+
+static enum dma_status s3c24xx_dma_tx_status(struct dma_chan *chan,
+		dma_cookie_t cookie, struct dma_tx_state *txstate)
+{
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
+	struct s3c24xx_txd *txd;
+	struct s3c24xx_sg *dsg;
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+	enum dma_status ret;
+	size_t bytes = 0;
+
+	spin_lock_irqsave(&s3cchan->vc.lock, flags);
+	ret = dma_cookie_status(chan, cookie, txstate);
+	if (ret == DMA_SUCCESS) {
+		spin_unlock_irqrestore(&s3cchan->vc.lock, flags);
+		return ret;
+	}
+
+	/*
+	 * There's no point calculating the residue if there's
+	 * no txstate to store the value.
+	 */
+	if (!txstate) {
+		spin_unlock_irqrestore(&s3cchan->vc.lock, flags);
+		return ret;
+	}
+
+	vd = vchan_find_desc(&s3cchan->vc, cookie);
+	if (vd) {
+		/* On the issued list, so hasn't been processed yet */
+		txd = to_s3c24xx_txd(&vd->tx);
+
+		list_for_each_entry(dsg, &txd->dsg_list, node)
+			bytes += dsg->len;
+	} else {
+		/*
+		 * Currently running, so sum over the pending sg's and
+		 * the currently active one.
+		 */
+		txd = s3cchan->at;
+
+		dsg = list_entry(txd->at, struct s3c24xx_sg, node);
+		list_for_each_entry_from(dsg, &txd->dsg_list, node)
+			bytes += dsg->len;
+
+		bytes += s3c24xx_dma_getbytes_chan(s3cchan);
+	}
+	spin_unlock_irqrestore(&s3cchan->vc.lock, flags);
+
+	/*
+	 * This cookie not complete yet
+	 * Get number of bytes left in the active transactions and queue
+	 */
+	dma_set_residue(txstate, bytes);
+
+	/* Whether waiting or running, we're in progress */
+	return ret;
+}
+
+/*
+ * Initialize a descriptor to be used by memcpy submit
+ */
+static struct dma_async_tx_descriptor *s3c24xx_dma_prep_memcpy(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+		size_t len, unsigned long flags)
+{
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	struct s3c24xx_txd *txd;
+	struct s3c24xx_sg *dsg;
+	int src_mod, dest_mod;
+
+	dev_dbg(&s3cdma->pdev->dev, "prepare memcpy of %d bytes from %s\n",
+			len, s3cchan->name);
+
+	if ((len & S3C24XX_DCON_TC_MASK) != len) {
+		dev_err(&s3cdma->pdev->dev, "memcpy size %d to large\n", len);
+		return NULL;
+	}
+
+	txd = s3c24xx_dma_get_txd();
+	if (!txd)
+		return NULL;
+
+	dsg = kzalloc(sizeof(*dsg), GFP_NOWAIT);
+	if (!dsg) {
+		s3c24xx_dma_free_txd(txd);
+		return NULL;
+	}
+	list_add_tail(&dsg->node, &txd->dsg_list);
+
+	dsg->src_addr = src;
+	dsg->dst_addr = dest;
+	dsg->len = len;
+
+	/*
+	 * Determine a suitable transfer width.
+	 * The DMA controller cannot fetch/store information which is not
+	 * naturally aligned on the bus, i.e., a 4 byte fetch must start at
+	 * an address divisible by 4 - more generally addr % width must be 0.
+	 */
+	src_mod = src % 4;
+	dest_mod = dest % 4;
+	switch (len % 4) {
+	case 0:
+		txd->width = (src_mod == 0 && dest_mod == 0) ? 4 : 1;
+		break;
+	case 2:
+		txd->width = ((src_mod == 2 || src_mod == 0) &&
+			      (dest_mod == 2 || dest_mod == 0)) ? 2 : 1;
+		break;
+	default:
+		txd->width = 1;
+		break;
+	}
+
+	txd->disrcc = S3C24XX_DISRCC_LOC_AHB | S3C24XX_DISRCC_INC_INCREMENT;
+	txd->didstc = S3C24XX_DIDSTC_LOC_AHB | S3C24XX_DIDSTC_INC_INCREMENT;
+	txd->dcon |= S3C24XX_DCON_DEMAND | S3C24XX_DCON_SYNC_HCLK |
+		     S3C24XX_DCON_SERV_WHOLE;
+
+	return vchan_tx_prep(&s3cchan->vc, &txd->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *s3c24xx_dma_prep_slave_sg(
+		struct dma_chan *chan, struct scatterlist *sgl,
+		unsigned int sg_len, enum dma_transfer_direction direction,
+		unsigned long flags, void *context)
+{
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
+	struct s3c24xx_dma_engine *s3cdma = s3cchan->host;
+	const struct s3c24xx_dma_platdata *pdata = s3cdma->pdata;
+	struct s3c24xx_dma_channel *cdata = &pdata->channels[s3cchan->id];
+	struct s3c24xx_txd *txd;
+	struct s3c24xx_sg *dsg;
+	struct scatterlist *sg;
+	dma_addr_t slave_addr;
+	u32 hwcfg = 0;
+	int tmp;
+
+	dev_dbg(&s3cdma->pdev->dev, "prepare transaction of %d bytes from %s\n",
+			sg_dma_len(sgl), s3cchan->name);
+
+	txd = s3c24xx_dma_get_txd();
+	if (!txd)
+		return NULL;
+
+	if (cdata->handshake)
+		txd->dcon |= S3C24XX_DCON_HANDSHAKE;
+
+	switch (cdata->bus) {
+	case S3C24XX_DMA_APB:
+		txd->dcon |= S3C24XX_DCON_SYNC_PCLK;
+		hwcfg |= S3C24XX_DISRCC_LOC_APB;
+		break;
+	case S3C24XX_DMA_AHB:
+		txd->dcon |= S3C24XX_DCON_SYNC_HCLK;
+		hwcfg |= S3C24XX_DISRCC_LOC_AHB;
+		break;
+	}
+
+	/*
+	 * Always assume our peripheral desintation is a fixed
+	 * address in memory.
+	 */
+	hwcfg |= S3C24XX_DISRCC_INC_FIXED;
+
+	/*
+	 * Individual dma operations are requested by the slave,
+	 * so serve only single atomic operations (S3C24XX_DCON_SERV_SINGLE).
+	 */
+	txd->dcon |= S3C24XX_DCON_SERV_SINGLE;
+
+	if (direction == DMA_MEM_TO_DEV) {
+		txd->disrcc = S3C24XX_DISRCC_LOC_AHB |
+			      S3C24XX_DISRCC_INC_INCREMENT;
+		txd->didstc = hwcfg;
+		slave_addr = s3cchan->cfg.dst_addr;
+		txd->width = s3cchan->cfg.dst_addr_width;
+	} else if (direction == DMA_DEV_TO_MEM) {
+		txd->disrcc = hwcfg;
+		txd->didstc = S3C24XX_DIDSTC_LOC_AHB |
+			      S3C24XX_DIDSTC_INC_INCREMENT;
+		slave_addr = s3cchan->cfg.src_addr;
+		txd->width = s3cchan->cfg.src_addr_width;
+	} else {
+		s3c24xx_dma_free_txd(txd);
+		dev_err(&s3cdma->pdev->dev,
+			"direction %d unsupported\n", direction);
+		return NULL;
+	}
+
+	for_each_sg(sgl, sg, sg_len, tmp) {
+		dsg = kzalloc(sizeof(*dsg), GFP_NOWAIT);
+		if (!dsg) {
+			s3c24xx_dma_free_txd(txd);
+			return NULL;
+		}
+		list_add_tail(&dsg->node, &txd->dsg_list);
+
+		dsg->len = sg_dma_len(sg);
+		if (direction == DMA_MEM_TO_DEV) {
+			dsg->src_addr = sg_dma_address(sg);
+			dsg->dst_addr = slave_addr;
+		} else { /* DMA_DEV_TO_MEM */
+			dsg->src_addr = slave_addr;
+			dsg->dst_addr = sg_dma_address(sg);
+		}
+		break;
+	}
+
+	return vchan_tx_prep(&s3cchan->vc, &txd->vd, flags);
+}
+
+/*
+ * Slave transactions callback to the slave device to allow
+ * synchronization of slave DMA signals with the DMAC enable
+ */
+static void s3c24xx_dma_issue_pending(struct dma_chan *chan)
+{
+	struct s3c24xx_dma_chan *s3cchan = to_s3c24xx_dma_chan(chan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&s3cchan->vc.lock, flags);
+	if (vchan_issue_pending(&s3cchan->vc)) {
+		if (!s3cchan->phy && s3cchan->state != S3C24XX_DMA_CHAN_WAITING)
+			s3c24xx_dma_phy_alloc_and_start(s3cchan);
+	}
+	spin_unlock_irqrestore(&s3cchan->vc.lock, flags);
+}
+
+/*
+ * Bringup and teardown
+ */
+
+/*
+ * Initialise the DMAC memcpy/slave channels.
+ * Make a local wrapper to hold required data
+ */
+static int s3c24xx_dma_init_virtual_channels(struct s3c24xx_dma_engine *s3cdma,
+		struct dma_device *dmadev, unsigned int channels, bool slave)
+{
+	struct s3c24xx_dma_chan *chan;
+	int i;
+
+	INIT_LIST_HEAD(&dmadev->channels);
+
+	/*
+	 * Register as many many memcpy as we have physical channels,
+	 * we won't always be able to use all but the code will have
+	 * to cope with that situation.
+	 */
+	for (i = 0; i < channels; i++) {
+		chan = devm_kzalloc(dmadev->dev, sizeof(*chan), GFP_KERNEL);
+		if (!chan) {
+			dev_err(dmadev->dev,
+				"%s no memory for channel\n", __func__);
+			return -ENOMEM;
+		}
+
+		chan->id = i;
+		chan->host = s3cdma;
+		chan->state = S3C24XX_DMA_CHAN_IDLE;
+
+		if (slave) {
+			chan->slave = true;
+			chan->name = kasprintf(GFP_KERNEL, "slave%d", i);
+			if (!chan->name)
+				return -ENOMEM;
+		} else {
+			chan->name = kasprintf(GFP_KERNEL, "memcpy%d", i);
+			if (!chan->name)
+				return -ENOMEM;
+		}
+		dev_dbg(dmadev->dev,
+			 "initialize virtual channel \"%s\"\n",
+			 chan->name);
+
+		chan->vc.desc_free = s3c24xx_dma_desc_free;
+		vchan_init(&chan->vc, dmadev);
+	}
+	dev_info(dmadev->dev, "initialized %d virtual %s channels\n",
+		 i, slave ? "slave" : "memcpy");
+	return i;
+}
+
+static void s3c24xx_dma_free_virtual_channels(struct dma_device *dmadev)
+{
+	struct s3c24xx_dma_chan *chan = NULL;
+	struct s3c24xx_dma_chan *next;
+
+	list_for_each_entry_safe(chan,
+				 next, &dmadev->channels, vc.chan.device_node)
+		list_del(&chan->vc.chan.device_node);
+}
+
+/* s3c2410, s3c2440 and s3c2442 have a 0x40 stride without separate clocks */
+static struct soc_data soc_s3c2410 = {
+	.stride = 0x40,
+	.has_reqsel = false,
+	.has_clocks = false,
+};
+
+/* s3c2412 and s3c2413 have a 0x40 stride and dmareqsel mechanism */
+static struct soc_data soc_s3c2412 = {
+	.stride = 0x40,
+	.has_reqsel = true,
+	.has_clocks = true,
+};
+
+/* s3c2443 and following have a 0x100 stride and dmareqsel mechanism */
+static struct soc_data soc_s3c2443 = {
+	.stride = 0x100,
+	.has_reqsel = true,
+	.has_clocks = true,
+};
+
+static struct platform_device_id s3c24xx_dma_driver_ids[] = {
+	{
+		.name		= "s3c2410-dma",
+		.driver_data	= (kernel_ulong_t)&soc_s3c2410,
+	}, {
+		.name		= "s3c2412-dma",
+		.driver_data	= (kernel_ulong_t)&soc_s3c2412,
+	}, {
+		.name		= "s3c2443-dma",
+		.driver_data	= (kernel_ulong_t)&soc_s3c2443,
+	},
+	{ },
+};
+
+static struct soc_data *s3c24xx_dma_get_soc_data(struct platform_device *pdev)
+{
+	return (struct soc_data *)
+			 platform_get_device_id(pdev)->driver_data;
+}
+
+static int s3c24xx_dma_probe(struct platform_device *pdev)
+{
+	const struct s3c24xx_dma_platdata *pdata = dev_get_platdata(&pdev->dev);
+	struct s3c24xx_dma_engine *s3cdma;
+	struct soc_data *sdata;
+	struct resource *res;
+	int ret;
+	int i;
+
+	if (!pdata) {
+		dev_err(&pdev->dev, "platform data missing\n");
+		return -ENODEV;
+	}
+
+	/* Basic sanity check */
+	if (pdata->num_phy_channels > MAX_DMA_CHANNELS) {
+		dev_err(&pdev->dev, "to many dma channels %d, max %d\n",
+			pdata->num_phy_channels, MAX_DMA_CHANNELS);
+		return -EINVAL;
+	}
+
+	sdata = s3c24xx_dma_get_soc_data(pdev);
+	if (!sdata)
+		return -EINVAL;
+
+	s3cdma = devm_kzalloc(&pdev->dev, sizeof(*s3cdma), GFP_KERNEL);
+	if (!s3cdma)
+		return -ENOMEM;
+
+	s3cdma->pdev = pdev;
+	s3cdma->pdata = pdata;
+	s3cdma->sdata = sdata;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	s3cdma->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(s3cdma->base))
+		return PTR_ERR(s3cdma->base);
+
+	s3cdma->phy_chans = devm_kzalloc(&pdev->dev,
+					      sizeof(struct s3c24xx_dma_phy) *
+							pdata->num_phy_channels,
+					      GFP_KERNEL);
+	if (!s3cdma->phy_chans)
+		return -ENOMEM;
+
+	/* aquire irqs and clocks for all physical channels */
+	for (i = 0; i < pdata->num_phy_channels; i++) {
+		struct s3c24xx_dma_phy *phy = &s3cdma->phy_chans[i];
+		char clk_name[6];
+
+		phy->id = i;
+		phy->base = s3cdma->base + (i * sdata->stride);
+		phy->host = s3cdma;
+
+		phy->irq = platform_get_irq(pdev, i);
+		if (phy->irq < 0) {
+			dev_err(&pdev->dev, "failed to get irq %d, err %d\n",
+				i, phy->irq);
+			continue;
+		}
+
+		ret = devm_request_irq(&pdev->dev, phy->irq, s3c24xx_dma_irq,
+				       0, pdev->name, phy);
+		if (ret) {
+			dev_err(&pdev->dev, "Unable to request irq for channel %d, error %d\n",
+				i, ret);
+			continue;
+		}
+
+		if (sdata->has_clocks) {
+			sprintf(clk_name, "dma.%d", i);
+			phy->clk = devm_clk_get(&pdev->dev, clk_name);
+			if (IS_ERR(phy->clk) && sdata->has_clocks) {
+				dev_err(&pdev->dev, "unable to aquire clock for channel %d, error %lu",
+					i, PTR_ERR(phy->clk));
+				continue;
+			}
+
+			ret = clk_prepare(phy->clk);
+			if (ret) {
+				dev_err(&pdev->dev, "clock for phy %d failed, error %d\n",
+					i, ret);
+				continue;
+			}
+		}
+
+		spin_lock_init(&phy->lock);
+		phy->valid = true;
+
+		dev_dbg(&pdev->dev, "physical channel %d is %s\n",
+			i, s3c24xx_dma_phy_busy(phy) ? "BUSY" : "FREE");
+	}
+
+	/* Initialize memcpy engine */
+	dma_cap_set(DMA_MEMCPY, s3cdma->memcpy.cap_mask);
+	dma_cap_set(DMA_PRIVATE, s3cdma->memcpy.cap_mask);
+	s3cdma->memcpy.dev = &pdev->dev;
+	s3cdma->memcpy.device_alloc_chan_resources =
+					s3c24xx_dma_alloc_chan_resources;
+	s3cdma->memcpy.device_free_chan_resources =
+					s3c24xx_dma_free_chan_resources;
+	s3cdma->memcpy.device_prep_dma_memcpy = s3c24xx_dma_prep_memcpy;
+	s3cdma->memcpy.device_tx_status = s3c24xx_dma_tx_status;
+	s3cdma->memcpy.device_issue_pending = s3c24xx_dma_issue_pending;
+	s3cdma->memcpy.device_control = s3c24xx_dma_control;
+
+	/* Initialize slave engine for SoC internal dedicated peripherals */
+	dma_cap_set(DMA_SLAVE, s3cdma->slave.cap_mask);
+	dma_cap_set(DMA_PRIVATE, s3cdma->slave.cap_mask);
+	s3cdma->slave.dev = &pdev->dev;
+	s3cdma->slave.device_alloc_chan_resources =
+					s3c24xx_dma_alloc_chan_resources;
+	s3cdma->slave.device_free_chan_resources =
+					s3c24xx_dma_free_chan_resources;
+	s3cdma->slave.device_tx_status = s3c24xx_dma_tx_status;
+	s3cdma->slave.device_issue_pending = s3c24xx_dma_issue_pending;
+	s3cdma->slave.device_prep_slave_sg = s3c24xx_dma_prep_slave_sg;
+	s3cdma->slave.device_control = s3c24xx_dma_control;
+
+	/* Register as many memcpy channels as there are physical channels */
+	ret = s3c24xx_dma_init_virtual_channels(s3cdma, &s3cdma->memcpy,
+						pdata->num_phy_channels, false);
+	if (ret <= 0) {
+		dev_warn(&pdev->dev,
+			 "%s failed to enumerate memcpy channels - %d\n",
+			 __func__, ret);
+		goto err_memcpy;
+	}
+
+	/* Register slave channels */
+	ret = s3c24xx_dma_init_virtual_channels(s3cdma, &s3cdma->slave,
+				pdata->num_channels, true);
+	if (ret <= 0) {
+		dev_warn(&pdev->dev,
+			"%s failed to enumerate slave channels - %d\n",
+				__func__, ret);
+		goto err_slave;
+	}
+
+	ret = dma_async_device_register(&s3cdma->memcpy);
+	if (ret) {
+		dev_warn(&pdev->dev,
+			"%s failed to register memcpy as an async device - %d\n",
+			__func__, ret);
+		goto err_memcpy_reg;
+	}
+
+	ret = dma_async_device_register(&s3cdma->slave);
+	if (ret) {
+		dev_warn(&pdev->dev,
+			"%s failed to register slave as an async device - %d\n",
+			__func__, ret);
+		goto err_slave_reg;
+	}
+
+	platform_set_drvdata(pdev, s3cdma);
+	dev_info(&pdev->dev, "Loaded dma driver with %d physical channels\n",
+		 pdata->num_phy_channels);
+
+	return 0;
+
+err_slave_reg:
+	dma_async_device_unregister(&s3cdma->memcpy);
+err_memcpy_reg:
+	s3c24xx_dma_free_virtual_channels(&s3cdma->slave);
+err_slave:
+	s3c24xx_dma_free_virtual_channels(&s3cdma->memcpy);
+err_memcpy:
+	if (sdata->has_clocks)
+		for (i = 0; i < pdata->num_phy_channels; i++) {
+			struct s3c24xx_dma_phy *phy = &s3cdma->phy_chans[i];
+			if (phy->valid)
+				clk_unprepare(phy->clk);
+		}
+
+	return ret;
+}
+
+static int s3c24xx_dma_remove(struct platform_device *pdev)
+{
+	const struct s3c24xx_dma_platdata *pdata = dev_get_platdata(&pdev->dev);
+	struct s3c24xx_dma_engine *s3cdma = platform_get_drvdata(pdev);
+	struct soc_data *sdata = s3c24xx_dma_get_soc_data(pdev);
+	int i;
+
+	dma_async_device_unregister(&s3cdma->slave);
+	dma_async_device_unregister(&s3cdma->memcpy);
+
+	s3c24xx_dma_free_virtual_channels(&s3cdma->slave);
+	s3c24xx_dma_free_virtual_channels(&s3cdma->memcpy);
+
+	if (sdata->has_clocks)
+		for (i = 0; i < pdata->num_phy_channels; i++) {
+			struct s3c24xx_dma_phy *phy = &s3cdma->phy_chans[i];
+			if (phy->valid)
+				clk_unprepare(phy->clk);
+		}
+
+	return 0;
+}
+
+static struct platform_driver s3c24xx_dma_driver = {
+	.driver		= {
+		.name	= "s3c24xx-dma",
+		.owner	= THIS_MODULE,
+	},
+	.id_table	= s3c24xx_dma_driver_ids,
+	.probe		= s3c24xx_dma_probe,
+	.remove		= s3c24xx_dma_remove,
+};
+
+module_platform_driver(s3c24xx_dma_driver);
+
+bool s3c24xx_dma_filter(struct dma_chan *chan, void *param)
+{
+	struct s3c24xx_dma_chan *s3cchan;
+
+	if (chan->device->dev->driver != &s3c24xx_dma_driver.driver)
+		return false;
+
+	s3cchan = to_s3c24xx_dma_chan(chan);
+
+	return s3cchan->id == (int)param;
+}
+EXPORT_SYMBOL(s3c24xx_dma_filter);
+
+MODULE_DESCRIPTION("S3C24XX DMA Driver");
+MODULE_AUTHOR("Heiko Stuebner");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 17df6db5dca7..8847adf392b7 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -15,8 +15,9 @@
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
-
-#include <asm/mach/irq.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/gpio-davinci.h>
 
 struct davinci_gpio_regs {
 	u32	dir;
@@ -31,13 +32,14 @@ struct davinci_gpio_regs {
 	u32	intstat;
 };
 
+#define BINTEN	0x8 /* GPIO Interrupt Per-Bank Enable Register */
+
 #define chip2controller(chip)	\
 	container_of(chip, struct davinci_gpio_controller, chip)
 
-static struct davinci_gpio_controller chips[DIV_ROUND_UP(DAVINCI_N_GPIO, 32)];
 static void __iomem *gpio_base;
 
-static struct davinci_gpio_regs __iomem __init *gpio2regs(unsigned gpio)
+static struct davinci_gpio_regs __iomem *gpio2regs(unsigned gpio)
 {
 	void __iomem *ptr;
 
@@ -65,7 +67,7 @@ static inline struct davinci_gpio_regs __iomem *irq2regs(int irq)
 	return g;
 }
 
-static int __init davinci_gpio_irq_setup(void);
+static int davinci_gpio_irq_setup(struct platform_device *pdev);
 
 /*--------------------------------------------------------------------------*/
 
@@ -131,33 +133,53 @@ davinci_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 	__raw_writel((1 << offset), value ? &g->set_data : &g->clr_data);
 }
 
-static int __init davinci_gpio_setup(void)
+static int davinci_gpio_probe(struct platform_device *pdev)
 {
 	int i, base;
 	unsigned ngpio;
-	struct davinci_soc_info *soc_info = &davinci_soc_info;
-	struct davinci_gpio_regs *regs;
-
-	if (soc_info->gpio_type != GPIO_TYPE_DAVINCI)
-		return 0;
+	struct davinci_gpio_controller *chips;
+	struct davinci_gpio_platform_data *pdata;
+	struct davinci_gpio_regs __iomem *regs;
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+
+	pdata = dev->platform_data;
+	if (!pdata) {
+		dev_err(dev, "No platform data found\n");
+		return -EINVAL;
+	}
 
 	/*
 	 * The gpio banks conceptually expose a segmented bitmap,
 	 * and "ngpio" is one more than the largest zero-based
 	 * bit index that's valid.
 	 */
-	ngpio = soc_info->gpio_num;
+	ngpio = pdata->ngpio;
 	if (ngpio == 0) {
-		pr_err("GPIO setup:  how many GPIOs?\n");
+		dev_err(dev, "How many GPIOs?\n");
 		return -EINVAL;
 	}
 
 	if (WARN_ON(DAVINCI_N_GPIO < ngpio))
 		ngpio = DAVINCI_N_GPIO;
 
-	gpio_base = ioremap(soc_info->gpio_base, SZ_4K);
-	if (WARN_ON(!gpio_base))
+	chips = devm_kzalloc(dev,
+			     ngpio * sizeof(struct davinci_gpio_controller),
+			     GFP_KERNEL);
+	if (!chips) {
+		dev_err(dev, "Memory allocation failed\n");
 		return -ENOMEM;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(dev, "Invalid memory resource\n");
+		return -EBUSY;
+	}
+
+	gpio_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(gpio_base))
+		return PTR_ERR(gpio_base);
 
 	for (i = 0, base = 0; base < ngpio; i++, base += 32) {
 		chips[i].chip.label = "DaVinci";
@@ -183,13 +205,10 @@ static int __init davinci_gpio_setup(void)
 		gpiochip_add(&chips[i].chip);
 	}
 
-	soc_info->gpio_ctlrs = chips;
-	soc_info->gpio_ctlrs_num = DIV_ROUND_UP(ngpio, 32);
-
-	davinci_gpio_irq_setup();
+	platform_set_drvdata(pdev, chips);
+	davinci_gpio_irq_setup(pdev);
 	return 0;
 }
-pure_initcall(davinci_gpio_setup);
 
 /*--------------------------------------------------------------------------*/
 /*
@@ -302,13 +321,14 @@ static int gpio_to_irq_banked(struct gpio_chip *chip, unsigned offset)
 
 static int gpio_to_irq_unbanked(struct gpio_chip *chip, unsigned offset)
 {
-	struct davinci_soc_info *soc_info = &davinci_soc_info;
+	struct davinci_gpio_controller *d = chip2controller(chip);
 
-	/* NOTE:  we assume for now that only irqs in the first gpio_chip
+	/*
+	 * NOTE:  we assume for now that only irqs in the first gpio_chip
 	 * can provide direct-mapped IRQs to AINTC (up to 32 GPIOs).
 	 */
-	if (offset < soc_info->gpio_unbanked)
-		return soc_info->gpio_irq + offset;
+	if (offset < d->irq_base)
+		return d->gpio_irq + offset;
 	else
 		return -ENODEV;
 }
@@ -317,12 +337,11 @@ static int gpio_irq_type_unbanked(struct irq_data *data, unsigned trigger)
 {
 	struct davinci_gpio_controller *d;
 	struct davinci_gpio_regs __iomem *g;
-	struct davinci_soc_info *soc_info = &davinci_soc_info;
 	u32 mask;
 
 	d = (struct davinci_gpio_controller *)data->handler_data;
 	g = (struct davinci_gpio_regs __iomem *)d->regs;
-	mask = __gpio_mask(data->irq - soc_info->gpio_irq);
+	mask = __gpio_mask(data->irq - d->gpio_irq);
 
 	if (trigger & ~(IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING))
 		return -EINVAL;
@@ -343,24 +362,33 @@ static int gpio_irq_type_unbanked(struct irq_data *data, unsigned trigger)
  * (dm6446) can be set appropriately for GPIOV33 pins.
  */
 
-static int __init davinci_gpio_irq_setup(void)
+static int davinci_gpio_irq_setup(struct platform_device *pdev)
 {
 	unsigned	gpio, irq, bank;
 	struct clk	*clk;
 	u32		binten = 0;
 	unsigned	ngpio, bank_irq;
-	struct davinci_soc_info *soc_info = &davinci_soc_info;
-	struct davinci_gpio_regs	__iomem *g;
+	struct device *dev = &pdev->dev;
+	struct resource	*res;
+	struct davinci_gpio_controller *chips = platform_get_drvdata(pdev);
+	struct davinci_gpio_platform_data *pdata = dev->platform_data;
+	struct davinci_gpio_regs __iomem *g;
 
-	ngpio = soc_info->gpio_num;
+	ngpio = pdata->ngpio;
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res) {
+		dev_err(dev, "Invalid IRQ resource\n");
+		return -EBUSY;
+	}
 
-	bank_irq = soc_info->gpio_irq;
-	if (bank_irq == 0) {
-		printk(KERN_ERR "Don't know first GPIO bank IRQ.\n");
-		return -EINVAL;
+	bank_irq = res->start;
+
+	if (!bank_irq) {
+		dev_err(dev, "Invalid IRQ resource\n");
+		return -ENODEV;
 	}
 
-	clk = clk_get(NULL, "gpio");
+	clk = devm_clk_get(dev, "gpio");
 	if (IS_ERR(clk)) {
 		printk(KERN_ERR "Error %ld getting gpio clock?\n",
 		       PTR_ERR(clk));
@@ -368,16 +396,17 @@ static int __init davinci_gpio_irq_setup(void)
 	}
 	clk_prepare_enable(clk);
 
-	/* Arrange gpio_to_irq() support, handling either direct IRQs or
+	/*
+	 * Arrange gpio_to_irq() support, handling either direct IRQs or
 	 * banked IRQs.  Having GPIOs in the first GPIO bank use direct
 	 * IRQs, while the others use banked IRQs, would need some setup
 	 * tweaks to recognize hardware which can do that.
 	 */
 	for (gpio = 0, bank = 0; gpio < ngpio; bank++, gpio += 32) {
 		chips[bank].chip.to_irq = gpio_to_irq_banked;
-		chips[bank].irq_base = soc_info->gpio_unbanked
+		chips[bank].irq_base = pdata->gpio_unbanked
 			? -EINVAL
-			: (soc_info->intc_irq_num + gpio);
+			: (pdata->intc_irq_num + gpio);
 	}
 
 	/*
@@ -385,7 +414,7 @@ static int __init davinci_gpio_irq_setup(void)
 	 * controller only handling trigger modes.  We currently assume no
 	 * IRQ mux conflicts; gpio_irq_type_unbanked() is only for GPIOs.
 	 */
-	if (soc_info->gpio_unbanked) {
+	if (pdata->gpio_unbanked) {
 		static struct irq_chip_type gpio_unbanked;
 
 		/* pass "bank 0" GPIO IRQs to AINTC */
@@ -405,7 +434,7 @@ static int __init davinci_gpio_irq_setup(void)
 		__raw_writel(~0, &g->set_rising);
 
 		/* set the direct IRQs up to use that irqchip */
-		for (gpio = 0; gpio < soc_info->gpio_unbanked; gpio++, irq++) {
+		for (gpio = 0; gpio < pdata->gpio_unbanked; gpio++, irq++) {
 			irq_set_chip(irq, &gpio_unbanked.chip);
 			irq_set_handler_data(irq, &chips[gpio / 32]);
 			irq_set_status_flags(irq, IRQ_TYPE_EDGE_BOTH);
@@ -450,12 +479,31 @@ static int __init davinci_gpio_irq_setup(void)
 	}
 
 done:
-	/* BINTEN -- per-bank interrupt enable. genirq would also let these
+	/*
+	 * BINTEN -- per-bank interrupt enable. genirq would also let these
 	 * bits be set/cleared dynamically.
 	 */
-	__raw_writel(binten, gpio_base + 0x08);
+	__raw_writel(binten, gpio_base + BINTEN);
 
 	printk(KERN_INFO "DaVinci: %d gpio irqs\n", irq - gpio_to_irq(0));
 
 	return 0;
 }
+
+static struct platform_driver davinci_gpio_driver = {
+	.probe		= davinci_gpio_probe,
+	.driver		= {
+		.name	= "davinci_gpio",
+		.owner	= THIS_MODULE,
+	},
+};
+
+/**
+ * GPIO driver registration needs to be done before machine_init functions
+ * access GPIO. Hence davinci_gpio_drv_reg() is a postcore_initcall.
+ */
+static int __init davinci_gpio_drv_reg(void)
+{
+	return platform_driver_register(&davinci_gpio_driver);
+}
+postcore_initcall(davinci_gpio_drv_reg);
diff --git a/drivers/gpio/gpio-tnetv107x.c b/drivers/gpio/gpio-tnetv107x.c
index 3fa3e2867e19..58445bb69106 100644
--- a/drivers/gpio/gpio-tnetv107x.c
+++ b/drivers/gpio/gpio-tnetv107x.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
+#include <linux/platform_data/gpio-davinci.h>
 
 #include <mach/common.h>
 #include <mach/tnetv107x.h>
diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c
index bb328a366122..433cc8568dec 100644
--- a/drivers/irqchip/irq-armada-370-xp.c
+++ b/drivers/irqchip/irq-armada-370-xp.c
@@ -21,7 +21,10 @@
 #include <linux/io.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/of_pci.h>
 #include <linux/irqdomain.h>
+#include <linux/slab.h>
+#include <linux/msi.h>
 #include <asm/mach/arch.h>
 #include <asm/exception.h>
 #include <asm/smp_plat.h>
@@ -51,12 +54,22 @@
 #define IPI_DOORBELL_START                      (0)
 #define IPI_DOORBELL_END                        (8)
 #define IPI_DOORBELL_MASK                       0xFF
+#define PCI_MSI_DOORBELL_START                  (16)
+#define PCI_MSI_DOORBELL_NR                     (16)
+#define PCI_MSI_DOORBELL_END                    (32)
+#define PCI_MSI_DOORBELL_MASK                   0xFFFF0000
 
 static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 
 static void __iomem *per_cpu_int_base;
 static void __iomem *main_int_base;
 static struct irq_domain *armada_370_xp_mpic_domain;
+#ifdef CONFIG_PCI_MSI
+static struct irq_domain *armada_370_xp_msi_domain;
+static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR);
+static DEFINE_MUTEX(msi_used_lock);
+static phys_addr_t msi_doorbell_addr;
+#endif
 
 /*
  * In SMP mode:
@@ -87,6 +100,144 @@ static void armada_370_xp_irq_unmask(struct irq_data *d)
 				ARMADA_370_XP_INT_CLEAR_MASK_OFFS);
 }
 
+#ifdef CONFIG_PCI_MSI
+
+static int armada_370_xp_alloc_msi(void)
+{
+	int hwirq;
+
+	mutex_lock(&msi_used_lock);
+	hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR);
+	if (hwirq >= PCI_MSI_DOORBELL_NR)
+		hwirq = -ENOSPC;
+	else
+		set_bit(hwirq, msi_used);
+	mutex_unlock(&msi_used_lock);
+
+	return hwirq;
+}
+
+static void armada_370_xp_free_msi(int hwirq)
+{
+	mutex_lock(&msi_used_lock);
+	if (!test_bit(hwirq, msi_used))
+		pr_err("trying to free unused MSI#%d\n", hwirq);
+	else
+		clear_bit(hwirq, msi_used);
+	mutex_unlock(&msi_used_lock);
+}
+
+static int armada_370_xp_setup_msi_irq(struct msi_chip *chip,
+				       struct pci_dev *pdev,
+				       struct msi_desc *desc)
+{
+	struct msi_msg msg;
+	irq_hw_number_t hwirq;
+	int virq;
+
+	hwirq = armada_370_xp_alloc_msi();
+	if (hwirq < 0)
+		return hwirq;
+
+	virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq);
+	if (!virq) {
+		armada_370_xp_free_msi(hwirq);
+		return -EINVAL;
+	}
+
+	irq_set_msi_desc(virq, desc);
+
+	msg.address_lo = msi_doorbell_addr;
+	msg.address_hi = 0;
+	msg.data = 0xf00 | (hwirq + 16);
+
+	write_msi_msg(virq, &msg);
+	return 0;
+}
+
+static void armada_370_xp_teardown_msi_irq(struct msi_chip *chip,
+					   unsigned int irq)
+{
+	struct irq_data *d = irq_get_irq_data(irq);
+	irq_dispose_mapping(irq);
+	armada_370_xp_free_msi(d->hwirq);
+}
+
+static struct irq_chip armada_370_xp_msi_irq_chip = {
+	.name = "armada_370_xp_msi_irq",
+	.irq_enable = unmask_msi_irq,
+	.irq_disable = mask_msi_irq,
+	.irq_mask = mask_msi_irq,
+	.irq_unmask = unmask_msi_irq,
+};
+
+static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq,
+				 irq_hw_number_t hw)
+{
+	irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip,
+				 handle_simple_irq);
+	set_irq_flags(virq, IRQF_VALID);
+
+	return 0;
+}
+
+static const struct irq_domain_ops armada_370_xp_msi_irq_ops = {
+	.map = armada_370_xp_msi_map,
+};
+
+static int armada_370_xp_msi_init(struct device_node *node,
+				  phys_addr_t main_int_phys_base)
+{
+	struct msi_chip *msi_chip;
+	u32 reg;
+	int ret;
+
+	msi_doorbell_addr = main_int_phys_base +
+		ARMADA_370_XP_SW_TRIG_INT_OFFS;
+
+	msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL);
+	if (!msi_chip)
+		return -ENOMEM;
+
+	msi_chip->setup_irq = armada_370_xp_setup_msi_irq;
+	msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq;
+	msi_chip->of_node = node;
+
+	armada_370_xp_msi_domain =
+		irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
+				      &armada_370_xp_msi_irq_ops,
+				      NULL);
+	if (!armada_370_xp_msi_domain) {
+		kfree(msi_chip);
+		return -ENOMEM;
+	}
+
+	ret = of_pci_msi_chip_add(msi_chip);
+	if (ret < 0) {
+		irq_domain_remove(armada_370_xp_msi_domain);
+		kfree(msi_chip);
+		return ret;
+	}
+
+	reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS)
+		| PCI_MSI_DOORBELL_MASK;
+
+	writel(reg, per_cpu_int_base +
+	       ARMADA_370_XP_IN_DRBEL_MSK_OFFS);
+
+	/* Unmask IPI interrupt */
+	writel(1, per_cpu_int_base + ARMADA_370_XP_INT_CLEAR_MASK_OFFS);
+
+	return 0;
+}
+#else
+static inline int armada_370_xp_msi_init(struct device_node *node,
+					 phys_addr_t main_int_phys_base)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SMP
 static int armada_xp_set_affinity(struct irq_data *d,
 				  const struct cpumask *mask_val, bool force)
@@ -214,12 +365,39 @@ armada_370_xp_handle_irq(struct pt_regs *regs)
 		if (irqnr > 1022)
 			break;
 
-		if (irqnr > 0) {
+		if (irqnr > 1) {
 			irqnr =	irq_find_mapping(armada_370_xp_mpic_domain,
 					irqnr);
 			handle_IRQ(irqnr, regs);
 			continue;
 		}
+
+#ifdef CONFIG_PCI_MSI
+		/* MSI handling */
+		if (irqnr == 1) {
+			u32 msimask, msinr;
+
+			msimask = readl_relaxed(per_cpu_int_base +
+						ARMADA_370_XP_IN_DRBEL_CAUSE_OFFS)
+				& PCI_MSI_DOORBELL_MASK;
+
+			writel(~PCI_MSI_DOORBELL_MASK, per_cpu_int_base +
+			       ARMADA_370_XP_IN_DRBEL_CAUSE_OFFS);
+
+			for (msinr = PCI_MSI_DOORBELL_START;
+			     msinr < PCI_MSI_DOORBELL_END; msinr++) {
+				int irq;
+
+				if (!(msimask & BIT(msinr)))
+					continue;
+
+				irq = irq_find_mapping(armada_370_xp_msi_domain,
+						       msinr - 16);
+				handle_IRQ(irq, regs);
+			}
+		}
+#endif
+
 #ifdef CONFIG_SMP
 		/* IPI Handling */
 		if (irqnr == 0) {
@@ -248,12 +426,25 @@ armada_370_xp_handle_irq(struct pt_regs *regs)
 static int __init armada_370_xp_mpic_of_init(struct device_node *node,
 					     struct device_node *parent)
 {
+	struct resource main_int_res, per_cpu_int_res;
 	u32 control;
 
-	main_int_base = of_iomap(node, 0);
-	per_cpu_int_base = of_iomap(node, 1);
+	BUG_ON(of_address_to_resource(node, 0, &main_int_res));
+	BUG_ON(of_address_to_resource(node, 1, &per_cpu_int_res));
+
+	BUG_ON(!request_mem_region(main_int_res.start,
+				   resource_size(&main_int_res),
+				   node->full_name));
+	BUG_ON(!request_mem_region(per_cpu_int_res.start,
+				   resource_size(&per_cpu_int_res),
+				   node->full_name));
 
+	main_int_base = ioremap(main_int_res.start,
+				resource_size(&main_int_res));
 	BUG_ON(!main_int_base);
+
+	per_cpu_int_base = ioremap(per_cpu_int_res.start,
+				   resource_size(&per_cpu_int_res));
 	BUG_ON(!per_cpu_int_base);
 
 	control = readl(main_int_base + ARMADA_370_XP_INT_CONTROL);
@@ -262,8 +453,7 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node,
 		irq_domain_add_linear(node, (control >> 2) & 0x3ff,
 				&armada_370_xp_mpic_irq_ops, NULL);
 
-	if (!armada_370_xp_mpic_domain)
-		panic("Unable to add Armada_370_Xp MPIC irq domain (DT)\n");
+	BUG_ON(!armada_370_xp_mpic_domain);
 
 	irq_set_default_host(armada_370_xp_mpic_domain);
 
@@ -280,6 +470,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node,
 
 #endif
 
+	armada_370_xp_msi_init(node, main_int_res.start);
+
 	set_handle_irq(armada_370_xp_handle_irq);
 
 	return 0;
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 3d9504811126..43186feb4294 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -3,7 +3,7 @@ menu "PCI host controller drivers"
 
 config PCI_MVEBU
 	bool "Marvell EBU PCIe controller"
-	depends on ARCH_MVEBU || ARCH_KIRKWOOD
+	depends on ARCH_MVEBU || ARCH_DOVE || ARCH_KIRKWOOD
 	depends on OF
 
 config PCIE_DW
diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c
index 729d5a101d62..80b2250ea19a 100644
--- a/drivers/pci/host/pci-mvebu.c
+++ b/drivers/pci/host/pci-mvebu.c
@@ -9,13 +9,17 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/gpio.h>
 #include <linux/module.h>
 #include <linux/mbus.h>
+#include <linux/msi.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/of_address.h>
-#include <linux/of_pci.h>
 #include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/of_pci.h>
 #include <linux/of_platform.h>
 
 /*
@@ -103,6 +107,7 @@ struct mvebu_pcie_port;
 struct mvebu_pcie {
 	struct platform_device *pdev;
 	struct mvebu_pcie_port *ports;
+	struct msi_chip *msi;
 	struct resource io;
 	struct resource realio;
 	struct resource mem;
@@ -115,7 +120,6 @@ struct mvebu_pcie_port {
 	char *name;
 	void __iomem *base;
 	spinlock_t conf_lock;
-	int haslink;
 	u32 port;
 	u32 lane;
 	int devfn;
@@ -124,6 +128,9 @@ struct mvebu_pcie_port {
 	unsigned int io_target;
 	unsigned int io_attr;
 	struct clk *clk;
+	int reset_gpio;
+	int reset_active_low;
+	char *reset_name;
 	struct mvebu_sw_pci_bridge bridge;
 	struct device_node *dn;
 	struct mvebu_pcie *pcie;
@@ -133,29 +140,39 @@ struct mvebu_pcie_port {
 	size_t iowin_size;
 };
 
+static inline void mvebu_writel(struct mvebu_pcie_port *port, u32 val, u32 reg)
+{
+	writel(val, port->base + reg);
+}
+
+static inline u32 mvebu_readl(struct mvebu_pcie_port *port, u32 reg)
+{
+	return readl(port->base + reg);
+}
+
 static bool mvebu_pcie_link_up(struct mvebu_pcie_port *port)
 {
-	return !(readl(port->base + PCIE_STAT_OFF) & PCIE_STAT_LINK_DOWN);
+	return !(mvebu_readl(port, PCIE_STAT_OFF) & PCIE_STAT_LINK_DOWN);
 }
 
 static void mvebu_pcie_set_local_bus_nr(struct mvebu_pcie_port *port, int nr)
 {
 	u32 stat;
 
-	stat = readl(port->base + PCIE_STAT_OFF);
+	stat = mvebu_readl(port, PCIE_STAT_OFF);
 	stat &= ~PCIE_STAT_BUS;
 	stat |= nr << 8;
-	writel(stat, port->base + PCIE_STAT_OFF);
+	mvebu_writel(port, stat, PCIE_STAT_OFF);
 }
 
 static void mvebu_pcie_set_local_dev_nr(struct mvebu_pcie_port *port, int nr)
 {
 	u32 stat;
 
-	stat = readl(port->base + PCIE_STAT_OFF);
+	stat = mvebu_readl(port, PCIE_STAT_OFF);
 	stat &= ~PCIE_STAT_DEV;
 	stat |= nr << 16;
-	writel(stat, port->base + PCIE_STAT_OFF);
+	mvebu_writel(port, stat, PCIE_STAT_OFF);
 }
 
 /*
@@ -163,7 +180,7 @@ static void mvebu_pcie_set_local_dev_nr(struct mvebu_pcie_port *port, int nr)
  * BAR[0,2] -> disabled, BAR[1] -> covers all DRAM banks
  * WIN[0-3] -> DRAM bank[0-3]
  */
-static void __init mvebu_pcie_setup_wins(struct mvebu_pcie_port *port)
+static void mvebu_pcie_setup_wins(struct mvebu_pcie_port *port)
 {
 	const struct mbus_dram_target_info *dram;
 	u32 size;
@@ -173,33 +190,34 @@ static void __init mvebu_pcie_setup_wins(struct mvebu_pcie_port *port)
 
 	/* First, disable and clear BARs and windows. */
 	for (i = 1; i < 3; i++) {
-		writel(0, port->base + PCIE_BAR_CTRL_OFF(i));
-		writel(0, port->base + PCIE_BAR_LO_OFF(i));
-		writel(0, port->base + PCIE_BAR_HI_OFF(i));
+		mvebu_writel(port, 0, PCIE_BAR_CTRL_OFF(i));
+		mvebu_writel(port, 0, PCIE_BAR_LO_OFF(i));
+		mvebu_writel(port, 0, PCIE_BAR_HI_OFF(i));
 	}
 
 	for (i = 0; i < 5; i++) {
-		writel(0, port->base + PCIE_WIN04_CTRL_OFF(i));
-		writel(0, port->base + PCIE_WIN04_BASE_OFF(i));
-		writel(0, port->base + PCIE_WIN04_REMAP_OFF(i));
+		mvebu_writel(port, 0, PCIE_WIN04_CTRL_OFF(i));
+		mvebu_writel(port, 0, PCIE_WIN04_BASE_OFF(i));
+		mvebu_writel(port, 0, PCIE_WIN04_REMAP_OFF(i));
 	}
 
-	writel(0, port->base + PCIE_WIN5_CTRL_OFF);
-	writel(0, port->base + PCIE_WIN5_BASE_OFF);
-	writel(0, port->base + PCIE_WIN5_REMAP_OFF);
+	mvebu_writel(port, 0, PCIE_WIN5_CTRL_OFF);
+	mvebu_writel(port, 0, PCIE_WIN5_BASE_OFF);
+	mvebu_writel(port, 0, PCIE_WIN5_REMAP_OFF);
 
 	/* Setup windows for DDR banks.  Count total DDR size on the fly. */
 	size = 0;
 	for (i = 0; i < dram->num_cs; i++) {
 		const struct mbus_dram_window *cs = dram->cs + i;
 
-		writel(cs->base & 0xffff0000,
-		       port->base + PCIE_WIN04_BASE_OFF(i));
-		writel(0, port->base + PCIE_WIN04_REMAP_OFF(i));
-		writel(((cs->size - 1) & 0xffff0000) |
-			(cs->mbus_attr << 8) |
-			(dram->mbus_dram_target_id << 4) | 1,
-		       port->base + PCIE_WIN04_CTRL_OFF(i));
+		mvebu_writel(port, cs->base & 0xffff0000,
+			     PCIE_WIN04_BASE_OFF(i));
+		mvebu_writel(port, 0, PCIE_WIN04_REMAP_OFF(i));
+		mvebu_writel(port,
+			     ((cs->size - 1) & 0xffff0000) |
+			     (cs->mbus_attr << 8) |
+			     (dram->mbus_dram_target_id << 4) | 1,
+			     PCIE_WIN04_CTRL_OFF(i));
 
 		size += cs->size;
 	}
@@ -209,41 +227,40 @@ static void __init mvebu_pcie_setup_wins(struct mvebu_pcie_port *port)
 		size = 1 << fls(size);
 
 	/* Setup BAR[1] to all DRAM banks. */
-	writel(dram->cs[0].base, port->base + PCIE_BAR_LO_OFF(1));
-	writel(0, port->base + PCIE_BAR_HI_OFF(1));
-	writel(((size - 1) & 0xffff0000) | 1,
-	       port->base + PCIE_BAR_CTRL_OFF(1));
+	mvebu_writel(port, dram->cs[0].base, PCIE_BAR_LO_OFF(1));
+	mvebu_writel(port, 0, PCIE_BAR_HI_OFF(1));
+	mvebu_writel(port, ((size - 1) & 0xffff0000) | 1,
+		     PCIE_BAR_CTRL_OFF(1));
 }
 
-static void __init mvebu_pcie_setup_hw(struct mvebu_pcie_port *port)
+static void mvebu_pcie_setup_hw(struct mvebu_pcie_port *port)
 {
-	u16 cmd;
-	u32 mask;
+	u32 cmd, mask;
 
 	/* Point PCIe unit MBUS decode windows to DRAM space. */
 	mvebu_pcie_setup_wins(port);
 
 	/* Master + slave enable. */
-	cmd = readw(port->base + PCIE_CMD_OFF);
+	cmd = mvebu_readl(port, PCIE_CMD_OFF);
 	cmd |= PCI_COMMAND_IO;
 	cmd |= PCI_COMMAND_MEMORY;
 	cmd |= PCI_COMMAND_MASTER;
-	writew(cmd, port->base + PCIE_CMD_OFF);
+	mvebu_writel(port, cmd, PCIE_CMD_OFF);
 
 	/* Enable interrupt lines A-D. */
-	mask = readl(port->base + PCIE_MASK_OFF);
+	mask = mvebu_readl(port, PCIE_MASK_OFF);
 	mask |= PCIE_MASK_ENABLE_INTS;
-	writel(mask, port->base + PCIE_MASK_OFF);
+	mvebu_writel(port, mask, PCIE_MASK_OFF);
 }
 
 static int mvebu_pcie_hw_rd_conf(struct mvebu_pcie_port *port,
 				 struct pci_bus *bus,
 				 u32 devfn, int where, int size, u32 *val)
 {
-	writel(PCIE_CONF_ADDR(bus->number, devfn, where),
-	       port->base + PCIE_CONF_ADDR_OFF);
+	mvebu_writel(port, PCIE_CONF_ADDR(bus->number, devfn, where),
+		     PCIE_CONF_ADDR_OFF);
 
-	*val = readl(port->base + PCIE_CONF_DATA_OFF);
+	*val = mvebu_readl(port, PCIE_CONF_DATA_OFF);
 
 	if (size == 1)
 		*val = (*val >> (8 * (where & 3))) & 0xff;
@@ -257,21 +274,24 @@ static int mvebu_pcie_hw_wr_conf(struct mvebu_pcie_port *port,
 				 struct pci_bus *bus,
 				 u32 devfn, int where, int size, u32 val)
 {
-	int ret = PCIBIOS_SUCCESSFUL;
+	u32 _val, shift = 8 * (where & 3);
 
-	writel(PCIE_CONF_ADDR(bus->number, devfn, where),
-	       port->base + PCIE_CONF_ADDR_OFF);
+	mvebu_writel(port, PCIE_CONF_ADDR(bus->number, devfn, where),
+		     PCIE_CONF_ADDR_OFF);
+	_val = mvebu_readl(port, PCIE_CONF_DATA_OFF);
 
 	if (size == 4)
-		writel(val, port->base + PCIE_CONF_DATA_OFF);
+		_val = val;
 	else if (size == 2)
-		writew(val, port->base + PCIE_CONF_DATA_OFF + (where & 3));
+		_val = (_val & ~(0xffff << shift)) | ((val & 0xffff) << shift);
 	else if (size == 1)
-		writeb(val, port->base + PCIE_CONF_DATA_OFF + (where & 3));
+		_val = (_val & ~(0xff << shift)) | ((val & 0xff) << shift);
 	else
-		ret = PCIBIOS_BAD_REGISTER_NUMBER;
+		return PCIBIOS_BAD_REGISTER_NUMBER;
 
-	return ret;
+	mvebu_writel(port, _val, PCIE_CONF_DATA_OFF);
+
+	return PCIBIOS_SUCCESSFUL;
 }
 
 static void mvebu_pcie_handle_iobase_change(struct mvebu_pcie_port *port)
@@ -552,7 +572,7 @@ static int mvebu_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
 	if (bus->number == 0)
 		return mvebu_sw_pci_bridge_write(port, where, size, val);
 
-	if (!port->haslink)
+	if (!mvebu_pcie_link_up(port))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	/*
@@ -594,7 +614,7 @@ static int mvebu_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where,
 	if (bus->number == 0)
 		return mvebu_sw_pci_bridge_read(port, where, size, val);
 
-	if (!port->haslink) {
+	if (!mvebu_pcie_link_up(port)) {
 		*val = 0xffffffff;
 		return PCIBIOS_DEVICE_NOT_FOUND;
 	}
@@ -626,7 +646,7 @@ static struct pci_ops mvebu_pcie_ops = {
 	.write = mvebu_pcie_wr_conf,
 };
 
-static int __init mvebu_pcie_setup(int nr, struct pci_sys_data *sys)
+static int mvebu_pcie_setup(int nr, struct pci_sys_data *sys)
 {
 	struct mvebu_pcie *pcie = sys_to_pcie(sys);
 	int i;
@@ -645,7 +665,7 @@ static int __init mvebu_pcie_setup(int nr, struct pci_sys_data *sys)
 	return 1;
 }
 
-static int __init mvebu_pcie_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
+static int mvebu_pcie_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 {
 	struct of_irq oirq;
 	int ret;
@@ -673,11 +693,17 @@ static struct pci_bus *mvebu_pcie_scan_bus(int nr, struct pci_sys_data *sys)
 	return bus;
 }
 
-resource_size_t mvebu_pcie_align_resource(struct pci_dev *dev,
-					  const struct resource *res,
-					  resource_size_t start,
-					  resource_size_t size,
-					  resource_size_t align)
+static void mvebu_pcie_add_bus(struct pci_bus *bus)
+{
+	struct mvebu_pcie *pcie = sys_to_pcie(bus->sysdata);
+	bus->msi = pcie->msi;
+}
+
+static resource_size_t mvebu_pcie_align_resource(struct pci_dev *dev,
+						const struct resource *res,
+						resource_size_t start,
+						resource_size_t size,
+						resource_size_t align)
 {
 	if (dev->bus->number != 0)
 		return start;
@@ -696,7 +722,7 @@ resource_size_t mvebu_pcie_align_resource(struct pci_dev *dev,
 		return start;
 }
 
-static void __init mvebu_pcie_enable(struct mvebu_pcie *pcie)
+static void mvebu_pcie_enable(struct mvebu_pcie *pcie)
 {
 	struct hw_pci hw;
 
@@ -709,6 +735,7 @@ static void __init mvebu_pcie_enable(struct mvebu_pcie *pcie)
 	hw.map_irq        = mvebu_pcie_map_irq;
 	hw.ops            = &mvebu_pcie_ops;
 	hw.align_resource = mvebu_pcie_align_resource;
+	hw.add_bus        = mvebu_pcie_add_bus;
 
 	pci_common_init(&hw);
 }
@@ -718,10 +745,8 @@ static void __init mvebu_pcie_enable(struct mvebu_pcie *pcie)
  * <...> property for one that matches the given port/lane. Once
  * found, maps it.
  */
-static void __iomem * __init
-mvebu_pcie_map_registers(struct platform_device *pdev,
-			 struct device_node *np,
-			 struct mvebu_pcie_port *port)
+static void __iomem *mvebu_pcie_map_registers(struct platform_device *pdev,
+		      struct device_node *np, struct mvebu_pcie_port *port)
 {
 	struct resource regs;
 	int ret = 0;
@@ -777,7 +802,22 @@ static int mvebu_get_tgt_attr(struct device_node *np, int devfn,
 	return -ENOENT;
 }
 
-static int __init mvebu_pcie_probe(struct platform_device *pdev)
+static void mvebu_pcie_msi_enable(struct mvebu_pcie *pcie)
+{
+	struct device_node *msi_node;
+
+	msi_node = of_parse_phandle(pcie->pdev->dev.of_node,
+				    "msi-parent", 0);
+	if (!msi_node)
+		return;
+
+	pcie->msi = of_pci_find_msi_chip_by_node(msi_node);
+
+	if (pcie->msi)
+		pcie->msi->dev = &pcie->pdev->dev;
+}
+
+static int mvebu_pcie_probe(struct platform_device *pdev)
 {
 	struct mvebu_pcie *pcie;
 	struct device_node *np = pdev->dev.of_node;
@@ -790,6 +830,7 @@ static int __init mvebu_pcie_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	pcie->pdev = pdev;
+	platform_set_drvdata(pdev, pcie);
 
 	/* Get the PCIe memory and I/O aperture */
 	mvebu_mbus_get_pcie_mem_aperture(&pcie->mem);
@@ -818,13 +859,14 @@ static int __init mvebu_pcie_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	i = 0;
 	for_each_child_of_node(pdev->dev.of_node, child) {
 		if (!of_device_is_available(child))
 			continue;
-		pcie->nports++;
+		i++;
 	}
 
-	pcie->ports = devm_kzalloc(&pdev->dev, pcie->nports *
+	pcie->ports = devm_kzalloc(&pdev->dev, i *
 				   sizeof(struct mvebu_pcie_port),
 				   GFP_KERNEL);
 	if (!pcie->ports)
@@ -833,6 +875,7 @@ static int __init mvebu_pcie_probe(struct platform_device *pdev)
 	i = 0;
 	for_each_child_of_node(pdev->dev.of_node, child) {
 		struct mvebu_pcie_port *port = &pcie->ports[i];
+		enum of_gpio_flags flags;
 
 		if (!of_device_is_available(child))
 			continue;
@@ -873,45 +916,68 @@ static int __init mvebu_pcie_probe(struct platform_device *pdev)
 			continue;
 		}
 
+		port->reset_gpio = of_get_named_gpio_flags(child,
+						   "reset-gpios", 0, &flags);
+		if (gpio_is_valid(port->reset_gpio)) {
+			u32 reset_udelay = 20000;
+
+			port->reset_active_low = flags & OF_GPIO_ACTIVE_LOW;
+			port->reset_name = kasprintf(GFP_KERNEL,
+				     "pcie%d.%d-reset", port->port, port->lane);
+			of_property_read_u32(child, "reset-delay-us",
+					     &reset_udelay);
+
+			ret = devm_gpio_request_one(&pdev->dev,
+			    port->reset_gpio, GPIOF_DIR_OUT, port->reset_name);
+			if (ret) {
+				if (ret == -EPROBE_DEFER)
+					return ret;
+				continue;
+			}
+
+			gpio_set_value(port->reset_gpio,
+				       (port->reset_active_low) ? 1 : 0);
+			msleep(reset_udelay/1000);
+		}
+
+		port->clk = of_clk_get_by_name(child, NULL);
+		if (IS_ERR(port->clk)) {
+			dev_err(&pdev->dev, "PCIe%d.%d: cannot get clock\n",
+			       port->port, port->lane);
+			continue;
+		}
+
+		ret = clk_prepare_enable(port->clk);
+		if (ret)
+			continue;
+
 		port->base = mvebu_pcie_map_registers(pdev, child, port);
 		if (IS_ERR(port->base)) {
 			dev_err(&pdev->dev, "PCIe%d.%d: cannot map registers\n",
 				port->port, port->lane);
 			port->base = NULL;
+			clk_disable_unprepare(port->clk);
 			continue;
 		}
 
 		mvebu_pcie_set_local_dev_nr(port, 1);
 
-		if (mvebu_pcie_link_up(port)) {
-			port->haslink = 1;
-			dev_info(&pdev->dev, "PCIe%d.%d: link up\n",
-				 port->port, port->lane);
-		} else {
-			port->haslink = 0;
-			dev_info(&pdev->dev, "PCIe%d.%d: link down\n",
-				 port->port, port->lane);
-		}
-
 		port->clk = of_clk_get_by_name(child, NULL);
 		if (IS_ERR(port->clk)) {
 			dev_err(&pdev->dev, "PCIe%d.%d: cannot get clock\n",
 			       port->port, port->lane);
 			iounmap(port->base);
-			port->haslink = 0;
 			continue;
 		}
 
 		port->dn = child;
-
-		clk_prepare_enable(port->clk);
 		spin_lock_init(&port->conf_lock);
-
 		mvebu_sw_pci_bridge_init(port);
-
 		i++;
 	}
 
+	pcie->nports = i;
+	mvebu_pcie_msi_enable(pcie);
 	mvebu_pcie_enable(pcie);
 
 	return 0;
@@ -920,6 +986,7 @@ static int __init mvebu_pcie_probe(struct platform_device *pdev)
 static const struct of_device_id mvebu_pcie_of_match_table[] = {
 	{ .compatible = "marvell,armada-xp-pcie", },
 	{ .compatible = "marvell,armada-370-pcie", },
+	{ .compatible = "marvell,dove-pcie", },
 	{ .compatible = "marvell,kirkwood-pcie", },
 	{},
 };
@@ -931,16 +998,12 @@ static struct platform_driver mvebu_pcie_driver = {
 		.name = "mvebu-pcie",
 		.of_match_table =
 		   of_match_ptr(mvebu_pcie_of_match_table),
+		/* driver unloading/unbinding currently not supported */
+		.suppress_bind_attrs = true,
 	},
+	.probe = mvebu_pcie_probe,
 };
-
-static int __init mvebu_pcie_init(void)
-{
-	return platform_driver_probe(&mvebu_pcie_driver,
-				     mvebu_pcie_probe);
-}
-
-subsys_initcall(mvebu_pcie_init);
+module_platform_driver(mvebu_pcie_driver);
 
 MODULE_AUTHOR("Thomas Petazzoni <thomas.petazzoni@free-electrons.com>");
 MODULE_DESCRIPTION("Marvell EBU PCIe driver");