Diffstat (limited to 'arch/arm/mm')
-rw-r--r--  arch/arm/mm/Kconfig           |  90
-rw-r--r--  arch/arm/mm/Makefile          |   1
-rw-r--r--  arch/arm/mm/alignment.c       |   2
-rw-r--r--  arch/arm/mm/cache-l2x0.c      |  33
-rw-r--r--  arch/arm/mm/cache-uniphier.c  | 544
-rw-r--r--  arch/arm/mm/context.c         |  38
-rw-r--r--  arch/arm/mm/dma-mapping.c     | 270
-rw-r--r--  arch/arm/mm/fault.c           |  22
-rw-r--r--  arch/arm/mm/fault.h           |   1
-rw-r--r--  arch/arm/mm/flush.c           |  17
-rw-r--r--  arch/arm/mm/highmem.c         |  10
-rw-r--r--  arch/arm/mm/idmap.c           |   4
-rw-r--r--  arch/arm/mm/init.c            | 117
-rw-r--r--  arch/arm/mm/ioremap.c         |   9
-rw-r--r--  arch/arm/mm/mmap.c            |   3
-rw-r--r--  arch/arm/mm/mmu.c             | 144
-rw-r--r--  arch/arm/mm/pageattr.c        |   3
-rw-r--r--  arch/arm/mm/proc-mohawk.S     |   2
-rw-r--r--  arch/arm/mm/proc-v7.S         |  27
-rw-r--r--  arch/arm/mm/proc-v7m.S        |  14
20 files changed, 1095 insertions(+), 256 deletions(-)
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index df7537f12469..55347662e5ed 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -21,7 +21,7 @@ config CPU_ARM7TDMI # ARM720T config CPU_ARM720T - bool "Support ARM720T processor" if (ARCH_MULTI_V4T && ARCH_INTEGRATOR) + bool select CPU_32v4T select CPU_ABRT_LV4T select CPU_CACHE_V4 @@ -39,7 +39,7 @@ config CPU_ARM720T # ARM740T config CPU_ARM740T - bool "Support ARM740T processor" if (ARCH_MULTI_V4T && ARCH_INTEGRATOR) + bool depends on !MMU select CPU_32v4T select CPU_ABRT_LV4T @@ -71,7 +71,7 @@ config CPU_ARM9TDMI # ARM920T config CPU_ARM920T - bool "Support ARM920T processor" if (ARCH_MULTI_V4T && ARCH_INTEGRATOR) + bool select CPU_32v4T select CPU_ABRT_EV4T select CPU_CACHE_V4WT @@ -89,7 +89,7 @@ config CPU_ARM920T # ARM922T config CPU_ARM922T - bool "Support ARM922T processor" if (ARCH_MULTI_V4T && ARCH_INTEGRATOR) + bool select CPU_32v4T select CPU_ABRT_EV4T select CPU_CACHE_V4WT @@ -108,7 +108,7 @@ config CPU_ARM922T # ARM925T config CPU_ARM925T - bool "Support ARM925T processor" if ARCH_OMAP1 + bool select CPU_32v4T select CPU_ABRT_EV4T select CPU_CACHE_V4WT @@ -127,7 +127,7 @@ config CPU_ARM925T # ARM926T config CPU_ARM926T - bool "Support ARM926T processor" if (!ARCH_MULTIPLATFORM || ARCH_MULTI_V5) && (ARCH_INTEGRATOR || MACH_REALVIEW_EB) + bool select CPU_32v5 select CPU_ABRT_EV5TJ select CPU_CACHE_VIVT @@ -163,7 +163,7 @@ config CPU_FA526 # ARM940T config CPU_ARM940T - bool "Support ARM940T processor" if (ARCH_MULTI_V4T && ARCH_INTEGRATOR) + bool depends on !MMU select CPU_32v4T select CPU_ABRT_NOMMU @@ -181,7 +181,7 @@ config CPU_ARM940T # ARM946E-S config CPU_ARM946E - bool "Support ARM946E-S processor" if (ARCH_MULTI_V5 && ARCH_INTEGRATOR) + bool depends on !MMU select CPU_32v5 select CPU_ABRT_NOMMU @@ -198,7 +198,7 @@ config CPU_ARM946E # ARM1020 - needs validating config CPU_ARM1020 - bool "Support ARM1020T (rev 0) processor" if (ARCH_MULTI_V5 && ARCH_INTEGRATOR) + bool select CPU_32v5 select CPU_ABRT_EV4T select CPU_CACHE_V4WT @@ -216,7 +216,7 @@ config CPU_ARM1020 # ARM1020E - needs validating config CPU_ARM1020E - bool "Support ARM1020E processor" if (ARCH_MULTI_V5 && ARCH_INTEGRATOR) + bool depends on n select CPU_32v5 select CPU_ABRT_EV4T @@ -229,7 +229,7 @@ config CPU_ARM1020E # ARM1022E config CPU_ARM1022 - bool "Support ARM1022E processor" if (ARCH_MULTI_V5 && ARCH_INTEGRATOR) + bool select CPU_32v5 select CPU_ABRT_EV4T select CPU_CACHE_VIVT @@ -247,7 +247,7 @@ config CPU_ARM1022 # ARM1026EJ-S config CPU_ARM1026 - bool "Support ARM1026EJ-S processor" if (ARCH_MULTI_V5 && ARCH_INTEGRATOR) + bool select CPU_32v5 select CPU_ABRT_EV5T # But need Jazelle, but EV5TJ ignores bit 10 select CPU_CACHE_VIVT @@ -358,7 +358,7 @@ config CPU_PJ4B # ARMv6 config CPU_V6 - bool "Support ARM V6 processor" if (!ARCH_MULTIPLATFORM || ARCH_MULTI_V6) && (ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX) + bool select CPU_32v6 select CPU_ABRT_EV6 select CPU_CACHE_V6 @@ -371,7 +371,7 @@ config CPU_V6 # ARMv6k config CPU_V6K - bool "Support ARM V6K processor" if (!ARCH_MULTIPLATFORM || ARCH_MULTI_V6) && (ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX) + bool select CPU_32v6 select CPU_32v6K select CPU_ABRT_EV6 @@ -385,7 +385,7 @@ config CPU_V6K # ARMv7 config CPU_V7 - bool "Support ARM V7 processor" if (!ARCH_MULTIPLATFORM || ARCH_MULTI_V7) && (ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX) + bool select CPU_32v6K select CPU_32v7 select CPU_ABRT_EV7 @@ -419,28 +419,24 @@ config CPU_THUMBONLY 
config CPU_32v3 bool select CPU_USE_DOMAINS if MMU - select NEEDS_SYSCALL_FOR_CMPXCHG if SMP select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU config CPU_32v4 bool select CPU_USE_DOMAINS if MMU - select NEEDS_SYSCALL_FOR_CMPXCHG if SMP select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU config CPU_32v4T bool select CPU_USE_DOMAINS if MMU - select NEEDS_SYSCALL_FOR_CMPXCHG if SMP select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU config CPU_32v5 bool select CPU_USE_DOMAINS if MMU - select NEEDS_SYSCALL_FOR_CMPXCHG if SMP select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU @@ -805,14 +801,6 @@ config TLS_REG_EMUL a few prototypes like that in existence) and therefore access to that required register must be emulated. -config NEEDS_SYSCALL_FOR_CMPXCHG - bool - select NEED_KUSER_HELPERS - help - SMP on a pre-ARMv6 processor? Well OK then. - Forget about fast user space cmpxchg support. - It is just not possible. - config NEED_KUSER_HELPERS bool @@ -986,6 +974,16 @@ config CACHE_TAUROS2 This option enables the Tauros2 L2 cache controller (as found on PJ1/PJ4). +config CACHE_UNIPHIER + bool "Enable the UniPhier outer cache controller" + depends on ARCH_UNIPHIER + default y + select OUTER_CACHE + select OUTER_CACHE_SYNC + help + This option enables the UniPhier outer cache (system cache) + controller. + config CACHE_XSC3L2 bool "Enable the L2 cache on XScale3" depends on CPU_XSC3 @@ -1007,8 +1005,6 @@ config ARM_L1_CACHE_SHIFT config ARM_DMA_MEM_BUFFERABLE bool "Use non-cacheable memory for DMA" if (CPU_V6 || CPU_V6K) && !CPU_V7 - depends on !(MACH_REALVIEW_PB1176 || REALVIEW_EB_ARM11MP || \ - MACH_REALVIEW_PB11MP) default y if CPU_V6 || CPU_V6K || CPU_V7 help Historically, the kernel has used strongly ordered mappings to @@ -1041,24 +1037,26 @@ config ARCH_SUPPORTS_BIG_ENDIAN This option specifies the architecture can support big endian operation. -config ARM_KERNMEM_PERMS - bool "Restrict kernel memory permissions" - depends on MMU - help - If this is set, kernel memory other than kernel text (and rodata) - will be made non-executable. The tradeoff is that each region is - padded to section-size (1MiB) boundaries (because their permissions - are different and splitting the 1M pages into 4K ones causes TLB - performance problems), wasting memory. - config DEBUG_RODATA bool "Make kernel text and rodata read-only" - depends on ARM_KERNMEM_PERMS + depends on MMU && !XIP_KERNEL + default y if CPU_V7 + help + If this is set, kernel text and rodata memory will be made + read-only, and non-text kernel memory will be made non-executable. + The tradeoff is that each region is padded to section-size (1MiB) + boundaries (because their permissions are different and splitting + the 1M pages into 4K ones causes TLB performance problems), which + can waste memory. + +config DEBUG_ALIGN_RODATA + bool "Make rodata strictly non-executable" + depends on DEBUG_RODATA default y help - If this is set, kernel text and rodata will be made read-only. This - is to help catch accidental or malicious attempts to change the - kernel's executable code. Additionally splits rodata from kernel - text so it can be made explicitly non-executable. This creates - another section-size padded region, so it can waste more memory - space while gaining the read-only protections. + If this is set, rodata will be made explicitly non-executable. This + provides protection on the rare chance that attackers might find and + use ROP gadgets that exist in the rodata section. 
This adds an + additional section-aligned split of rodata from kernel text so it + can be made explicitly non-executable. This padding may waste memory + space to gain the additional protection. diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index 57c8df500e8c..7f76d96ce546 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -103,3 +103,4 @@ obj-$(CONFIG_CACHE_FEROCEON_L2) += cache-feroceon-l2.o obj-$(CONFIG_CACHE_L2X0) += cache-l2x0.o l2c-l2x0-resume.o obj-$(CONFIG_CACHE_XSC3L2) += cache-xsc3l2.o obj-$(CONFIG_CACHE_TAUROS2) += cache-tauros2.o +obj-$(CONFIG_CACHE_UNIPHIER) += cache-uniphier.o diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 00b7f7de28a1..7d5f4c736a16 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) } } } else { - fault = probe_kernel_address(instrptr, instr); + fault = probe_kernel_address((void *)instrptr, instr); instr = __mem_to_opcode_arm(instr); } diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index 493692d838c6..9f9d54271aad 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -790,7 +790,7 @@ static const struct l2c_init_data l2c310_init_fns __initconst = { }; static int __init __l2c_init(const struct l2c_init_data *data, - u32 aux_val, u32 aux_mask, u32 cache_id) + u32 aux_val, u32 aux_mask, u32 cache_id, bool nosync) { struct outer_cache_fns fns; unsigned way_size_bits, ways; @@ -866,6 +866,10 @@ static int __init __l2c_init(const struct l2c_init_data *data, fns.configure = outer_cache.configure; if (data->fixup) data->fixup(l2x0_base, cache_id, &fns); + if (nosync) { + pr_info("L2C: disabling outer sync\n"); + fns.sync = NULL; + } /* * Check if l2x0 controller is already enabled. 
If we are booting @@ -925,7 +929,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask) if (data->save) data->save(l2x0_base); - __l2c_init(data, aux_val, aux_mask, cache_id); + __l2c_init(data, aux_val, aux_mask, cache_id, false); } #ifdef CONFIG_OF @@ -1060,6 +1064,18 @@ static void __init l2x0_of_parse(const struct device_node *np, val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT; } + if (of_property_read_bool(np, "arm,parity-enable")) { + mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + val |= L2C_AUX_CTRL_PARITY_ENABLE; + } else if (of_property_read_bool(np, "arm,parity-disable")) { + mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } + + if (of_property_read_bool(np, "arm,shared-override")) { + mask &= ~L2C_AUX_CTRL_SHARED_OVERRIDE; + val |= L2C_AUX_CTRL_SHARED_OVERRIDE; + } + ret = l2x0_cache_size_of_parse(np, aux_val, aux_mask, &assoc, SZ_256K); if (ret) return; @@ -1176,6 +1192,14 @@ static void __init l2c310_of_parse(const struct device_node *np, *aux_mask &= ~L2C_AUX_CTRL_SHARED_OVERRIDE; } + if (of_property_read_bool(np, "arm,parity-enable")) { + *aux_val |= L2C_AUX_CTRL_PARITY_ENABLE; + *aux_mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } else if (of_property_read_bool(np, "arm,parity-disable")) { + *aux_val &= ~L2C_AUX_CTRL_PARITY_ENABLE; + *aux_mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } + prefetch = l2x0_saved_regs.prefetch_ctrl; ret = of_property_read_u32(np, "arm,double-linefill", &val); @@ -1704,6 +1728,7 @@ int __init l2x0_of_init(u32 aux_val, u32 aux_mask) struct resource res; u32 cache_id, old_aux; u32 cache_level = 2; + bool nosync = false; np = of_find_matching_node(NULL, l2x0_ids); if (!np) @@ -1742,6 +1767,8 @@ int __init l2x0_of_init(u32 aux_val, u32 aux_mask) if (cache_level != 2) pr_err("L2C: device tree specifies invalid cache level\n"); + nosync = of_property_read_bool(np, "arm,outer-sync-disable"); + /* Read back current (default) hardware configuration */ if (data->save) data->save(l2x0_base); @@ -1756,6 +1783,6 @@ int __init l2x0_of_init(u32 aux_val, u32 aux_mask) else cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID); - return __l2c_init(data, aux_val, aux_mask, cache_id); + return __l2c_init(data, aux_val, aux_mask, cache_id, nosync); } #endif diff --git a/arch/arm/mm/cache-uniphier.c b/arch/arm/mm/cache-uniphier.c new file mode 100644 index 000000000000..a6fa7b73fbe0 --- /dev/null +++ b/arch/arm/mm/cache-uniphier.c @@ -0,0 +1,544 @@ +/* + * Copyright (C) 2015 Masahiro Yamada <yamada.masahiro@socionext.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#define pr_fmt(fmt) "uniphier: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/log2.h> +#include <linux/of_address.h> +#include <linux/slab.h> +#include <asm/hardware/cache-uniphier.h> +#include <asm/outercache.h> + +/* control registers */ +#define UNIPHIER_SSCC 0x0 /* Control Register */ +#define UNIPHIER_SSCC_BST BIT(20) /* UCWG burst read */ +#define UNIPHIER_SSCC_ACT BIT(19) /* Inst-Data separate */ +#define UNIPHIER_SSCC_WTG BIT(18) /* WT gathering on */ +#define UNIPHIER_SSCC_PRD BIT(17) /* enable pre-fetch */ +#define UNIPHIER_SSCC_ON BIT(0) /* enable cache */ +#define UNIPHIER_SSCLPDAWCR 0x30 /* Unified/Data Active Way Control */ +#define UNIPHIER_SSCLPIAWCR 0x34 /* Instruction Active Way Control */ + +/* revision registers */ +#define UNIPHIER_SSCID 0x0 /* ID Register */ + +/* operation registers */ +#define UNIPHIER_SSCOPE 0x244 /* Cache Operation Primitive Entry */ +#define UNIPHIER_SSCOPE_CM_INV 0x0 /* invalidate */ +#define UNIPHIER_SSCOPE_CM_CLEAN 0x1 /* clean */ +#define UNIPHIER_SSCOPE_CM_FLUSH 0x2 /* flush */ +#define UNIPHIER_SSCOPE_CM_SYNC 0x8 /* sync (drain bufs) */ +#define UNIPHIER_SSCOPE_CM_FLUSH_PREFETCH 0x9 /* flush p-fetch buf */ +#define UNIPHIER_SSCOQM 0x248 /* Cache Operation Queue Mode */ +#define UNIPHIER_SSCOQM_TID_MASK (0x3 << 21) +#define UNIPHIER_SSCOQM_TID_LRU_DATA (0x0 << 21) +#define UNIPHIER_SSCOQM_TID_LRU_INST (0x1 << 21) +#define UNIPHIER_SSCOQM_TID_WAY (0x2 << 21) +#define UNIPHIER_SSCOQM_S_MASK (0x3 << 17) +#define UNIPHIER_SSCOQM_S_RANGE (0x0 << 17) +#define UNIPHIER_SSCOQM_S_ALL (0x1 << 17) +#define UNIPHIER_SSCOQM_S_WAY (0x2 << 17) +#define UNIPHIER_SSCOQM_CE BIT(15) /* notify completion */ +#define UNIPHIER_SSCOQM_CM_INV 0x0 /* invalidate */ +#define UNIPHIER_SSCOQM_CM_CLEAN 0x1 /* clean */ +#define UNIPHIER_SSCOQM_CM_FLUSH 0x2 /* flush */ +#define UNIPHIER_SSCOQM_CM_PREFETCH 0x3 /* prefetch to cache */ +#define UNIPHIER_SSCOQM_CM_PREFETCH_BUF 0x4 /* prefetch to pf-buf */ +#define UNIPHIER_SSCOQM_CM_TOUCH 0x5 /* touch */ +#define UNIPHIER_SSCOQM_CM_TOUCH_ZERO 0x6 /* touch to zero */ +#define UNIPHIER_SSCOQM_CM_TOUCH_DIRTY 0x7 /* touch with dirty */ +#define UNIPHIER_SSCOQAD 0x24c /* Cache Operation Queue Address */ +#define UNIPHIER_SSCOQSZ 0x250 /* Cache Operation Queue Size */ +#define UNIPHIER_SSCOQMASK 0x254 /* Cache Operation Queue Address Mask */ +#define UNIPHIER_SSCOQWN 0x258 /* Cache Operation Queue Way Number */ +#define UNIPHIER_SSCOPPQSEF 0x25c /* Cache Operation Queue Set Complete*/ +#define UNIPHIER_SSCOPPQSEF_FE BIT(1) +#define UNIPHIER_SSCOPPQSEF_OE BIT(0) +#define UNIPHIER_SSCOLPQS 0x260 /* Cache Operation Queue Status */ +#define UNIPHIER_SSCOLPQS_EF BIT(2) +#define UNIPHIER_SSCOLPQS_EST BIT(1) +#define UNIPHIER_SSCOLPQS_QST BIT(0) + +/* Is the touch/pre-fetch destination specified by ways? */ +#define UNIPHIER_SSCOQM_TID_IS_WAY(op) \ + ((op & UNIPHIER_SSCOQM_TID_MASK) == UNIPHIER_SSCOQM_TID_WAY) +/* Is the operation region specified by address range? 
*/ +#define UNIPHIER_SSCOQM_S_IS_RANGE(op) \ + ((op & UNIPHIER_SSCOQM_S_MASK) == UNIPHIER_SSCOQM_S_RANGE) + +/** + * uniphier_cache_data - UniPhier outer cache specific data + * + * @ctrl_base: virtual base address of control registers + * @rev_base: virtual base address of revision registers + * @op_base: virtual base address of operation registers + * @way_present_mask: each bit specifies if the way is present + * @way_locked_mask: each bit specifies if the way is locked + * @nsets: number of associativity sets + * @line_size: line size in bytes + * @range_op_max_size: max size that can be handled by a single range operation + * @list: list node to include this level in the whole cache hierarchy + */ +struct uniphier_cache_data { + void __iomem *ctrl_base; + void __iomem *rev_base; + void __iomem *op_base; + u32 way_present_mask; + u32 way_locked_mask; + u32 nsets; + u32 line_size; + u32 range_op_max_size; + struct list_head list; +}; + +/* + * List of the whole outer cache hierarchy. This list is only modified during + * the early boot stage, so no mutex is taken for the access to the list. + */ +static LIST_HEAD(uniphier_cache_list); + +/** + * __uniphier_cache_sync - perform a sync point for a particular cache level + * + * @data: cache controller specific data + */ +static void __uniphier_cache_sync(struct uniphier_cache_data *data) +{ + /* This sequence need not be atomic. Do not disable IRQ. */ + writel_relaxed(UNIPHIER_SSCOPE_CM_SYNC, + data->op_base + UNIPHIER_SSCOPE); + /* need a read back to confirm */ + readl_relaxed(data->op_base + UNIPHIER_SSCOPE); +} + +/** + * __uniphier_cache_maint_common - run a queue operation for a particular level + * + * @data: cache controller specific data + * @start: start address of range operation (don't care for "all" operation) + * @size: data size of range operation (don't care for "all" operation) + * @operation: flags to specify the desired cache operation + */ +static void __uniphier_cache_maint_common(struct uniphier_cache_data *data, + unsigned long start, + unsigned long size, + u32 operation) +{ + unsigned long flags; + + /* + * No spin lock is necessary here because: + * + * [1] This outer cache controller is able to accept maintenance + * operations from multiple CPUs at a time in an SMP system; if a + * maintenance operation is under way and another operation is issued, + * the new one is stored in the queue. The controller performs one + * operation after another. If the queue is full, the status register, + * UNIPHIER_SSCOPPQSEF, indicates that the queue registration has + * failed. The status registers, UNIPHIER_{SSCOPPQSEF, SSCOLPQS}, have + * different instances for each CPU, i.e. each CPU can track the status + * of the maintenance operations triggered by itself. + * + * [2] The cache command registers, UNIPHIER_{SSCOQM, SSCOQAD, SSCOQSZ, + * SSCOQWN}, are shared between multiple CPUs, but the hardware still + * guarantees the registration sequence is atomic; the write access to + * them are arbitrated by the hardware. The first accessor to the + * register, UNIPHIER_SSCOQM, holds the access right and it is released + * by reading the status register, UNIPHIER_SSCOPPQSEF. While one CPU + * is holding the access right, other CPUs fail to register operations. + * One CPU should not hold the access right for a long time, so local + * IRQs should be disabled while the following sequence. 
+ */ + local_irq_save(flags); + + /* clear the complete notification flag */ + writel_relaxed(UNIPHIER_SSCOLPQS_EF, data->op_base + UNIPHIER_SSCOLPQS); + + do { + /* set cache operation */ + writel_relaxed(UNIPHIER_SSCOQM_CE | operation, + data->op_base + UNIPHIER_SSCOQM); + + /* set address range if needed */ + if (likely(UNIPHIER_SSCOQM_S_IS_RANGE(operation))) { + writel_relaxed(start, data->op_base + UNIPHIER_SSCOQAD); + writel_relaxed(size, data->op_base + UNIPHIER_SSCOQSZ); + } + + /* set target ways if needed */ + if (unlikely(UNIPHIER_SSCOQM_TID_IS_WAY(operation))) + writel_relaxed(data->way_locked_mask, + data->op_base + UNIPHIER_SSCOQWN); + } while (unlikely(readl_relaxed(data->op_base + UNIPHIER_SSCOPPQSEF) & + (UNIPHIER_SSCOPPQSEF_FE | UNIPHIER_SSCOPPQSEF_OE))); + + /* wait until the operation is completed */ + while (likely(readl_relaxed(data->op_base + UNIPHIER_SSCOLPQS) != + UNIPHIER_SSCOLPQS_EF)) + cpu_relax(); + + local_irq_restore(flags); +} + +static void __uniphier_cache_maint_all(struct uniphier_cache_data *data, + u32 operation) +{ + __uniphier_cache_maint_common(data, 0, 0, + UNIPHIER_SSCOQM_S_ALL | operation); + + __uniphier_cache_sync(data); +} + +static void __uniphier_cache_maint_range(struct uniphier_cache_data *data, + unsigned long start, unsigned long end, + u32 operation) +{ + unsigned long size; + + /* + * If the start address is not aligned, + * perform a cache operation for the first cache-line + */ + start = start & ~(data->line_size - 1); + + size = end - start; + + if (unlikely(size >= (unsigned long)(-data->line_size))) { + /* this means cache operation for all range */ + __uniphier_cache_maint_all(data, operation); + return; + } + + /* + * If the end address is not aligned, + * perform a cache operation for the last cache-line + */ + size = ALIGN(size, data->line_size); + + while (size) { + unsigned long chunk_size = min_t(unsigned long, size, + data->range_op_max_size); + + __uniphier_cache_maint_common(data, start, chunk_size, + UNIPHIER_SSCOQM_S_RANGE | operation); + + start += chunk_size; + size -= chunk_size; + } + + __uniphier_cache_sync(data); +} + +static void __uniphier_cache_enable(struct uniphier_cache_data *data, bool on) +{ + u32 val = 0; + + if (on) + val = UNIPHIER_SSCC_WTG | UNIPHIER_SSCC_PRD | UNIPHIER_SSCC_ON; + + writel_relaxed(val, data->ctrl_base + UNIPHIER_SSCC); +} + +static void __init __uniphier_cache_set_locked_ways( + struct uniphier_cache_data *data, + u32 way_mask) +{ + data->way_locked_mask = way_mask & data->way_present_mask; + + writel_relaxed(~data->way_locked_mask & data->way_present_mask, + data->ctrl_base + UNIPHIER_SSCLPDAWCR); +} + +static void uniphier_cache_maint_range(unsigned long start, unsigned long end, + u32 operation) +{ + struct uniphier_cache_data *data; + + list_for_each_entry(data, &uniphier_cache_list, list) + __uniphier_cache_maint_range(data, start, end, operation); +} + +static void uniphier_cache_maint_all(u32 operation) +{ + struct uniphier_cache_data *data; + + list_for_each_entry(data, &uniphier_cache_list, list) + __uniphier_cache_maint_all(data, operation); +} + +static void uniphier_cache_inv_range(unsigned long start, unsigned long end) +{ + uniphier_cache_maint_range(start, end, UNIPHIER_SSCOQM_CM_INV); +} + +static void uniphier_cache_clean_range(unsigned long start, unsigned long end) +{ + uniphier_cache_maint_range(start, end, UNIPHIER_SSCOQM_CM_CLEAN); +} + +static void uniphier_cache_flush_range(unsigned long start, unsigned long end) +{ + uniphier_cache_maint_range(start, end, 
UNIPHIER_SSCOQM_CM_FLUSH); +} + +static void __init uniphier_cache_inv_all(void) +{ + uniphier_cache_maint_all(UNIPHIER_SSCOQM_CM_INV); +} + +static void uniphier_cache_flush_all(void) +{ + uniphier_cache_maint_all(UNIPHIER_SSCOQM_CM_FLUSH); +} + +static void uniphier_cache_disable(void) +{ + struct uniphier_cache_data *data; + + list_for_each_entry_reverse(data, &uniphier_cache_list, list) + __uniphier_cache_enable(data, false); + + uniphier_cache_flush_all(); +} + +static void __init uniphier_cache_enable(void) +{ + struct uniphier_cache_data *data; + + uniphier_cache_inv_all(); + + list_for_each_entry(data, &uniphier_cache_list, list) { + __uniphier_cache_enable(data, true); + __uniphier_cache_set_locked_ways(data, 0); + } +} + +static void uniphier_cache_sync(void) +{ + struct uniphier_cache_data *data; + + list_for_each_entry(data, &uniphier_cache_list, list) + __uniphier_cache_sync(data); +} + +int __init uniphier_cache_l2_is_enabled(void) +{ + struct uniphier_cache_data *data; + + data = list_first_entry_or_null(&uniphier_cache_list, + struct uniphier_cache_data, list); + if (!data) + return 0; + + return !!(readl_relaxed(data->ctrl_base + UNIPHIER_SSCC) & + UNIPHIER_SSCC_ON); +} + +void __init uniphier_cache_l2_touch_range(unsigned long start, + unsigned long end) +{ + struct uniphier_cache_data *data; + + data = list_first_entry_or_null(&uniphier_cache_list, + struct uniphier_cache_data, list); + if (data) + __uniphier_cache_maint_range(data, start, end, + UNIPHIER_SSCOQM_TID_WAY | + UNIPHIER_SSCOQM_CM_TOUCH); +} + +void __init uniphier_cache_l2_set_locked_ways(u32 way_mask) +{ + struct uniphier_cache_data *data; + + data = list_first_entry_or_null(&uniphier_cache_list, + struct uniphier_cache_data, list); + if (data) + __uniphier_cache_set_locked_ways(data, way_mask); +} + +static const struct of_device_id uniphier_cache_match[] __initconst = { + { + .compatible = "socionext,uniphier-system-cache", + }, + { /* sentinel */ } +}; + +static int __init __uniphier_cache_init(struct device_node *np, + unsigned int *cache_level) +{ + struct uniphier_cache_data *data; + u32 level, cache_size; + struct device_node *next_np; + int ret = 0; + + if (!of_match_node(uniphier_cache_match, np)) { + pr_err("L%d: not compatible with uniphier cache\n", + *cache_level); + return -EINVAL; + } + + if (of_property_read_u32(np, "cache-level", &level)) { + pr_err("L%d: cache-level is not specified\n", *cache_level); + return -EINVAL; + } + + if (level != *cache_level) { + pr_err("L%d: cache-level is unexpected value %d\n", + *cache_level, level); + return -EINVAL; + } + + if (!of_property_read_bool(np, "cache-unified")) { + pr_err("L%d: cache-unified is not specified\n", *cache_level); + return -EINVAL; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (of_property_read_u32(np, "cache-line-size", &data->line_size) || + !is_power_of_2(data->line_size)) { + pr_err("L%d: cache-line-size is unspecified or invalid\n", + *cache_level); + ret = -EINVAL; + goto err; + } + + if (of_property_read_u32(np, "cache-sets", &data->nsets) || + !is_power_of_2(data->nsets)) { + pr_err("L%d: cache-sets is unspecified or invalid\n", + *cache_level); + ret = -EINVAL; + goto err; + } + + if (of_property_read_u32(np, "cache-size", &cache_size) || + cache_size == 0 || cache_size % (data->nsets * data->line_size)) { + pr_err("L%d: cache-size is unspecified or invalid\n", + *cache_level); + ret = -EINVAL; + goto err; + } + + data->way_present_mask = + ((u32)1 << cache_size / data->nsets / 
data->line_size) - 1; + + data->ctrl_base = of_iomap(np, 0); + if (!data->ctrl_base) { + pr_err("L%d: failed to map control register\n", *cache_level); + ret = -ENOMEM; + goto err; + } + + data->rev_base = of_iomap(np, 1); + if (!data->rev_base) { + pr_err("L%d: failed to map revision register\n", *cache_level); + ret = -ENOMEM; + goto err; + } + + data->op_base = of_iomap(np, 2); + if (!data->op_base) { + pr_err("L%d: failed to map operation register\n", *cache_level); + ret = -ENOMEM; + goto err; + } + + if (*cache_level == 2) { + u32 revision = readl(data->rev_base + UNIPHIER_SSCID); + /* + * The size of range operation is limited to (1 << 22) or less + * for PH-sLD8 or older SoCs. + */ + if (revision <= 0x16) + data->range_op_max_size = (u32)1 << 22; + } + + data->range_op_max_size -= data->line_size; + + INIT_LIST_HEAD(&data->list); + list_add_tail(&data->list, &uniphier_cache_list); /* no mutex */ + + /* + * OK, this level has been successfully initialized. Look for the next + * level cache. Do not roll back even if the initialization of the + * next level cache fails because we want to continue with available + * cache levels. + */ + next_np = of_find_next_cache_node(np); + if (next_np) { + (*cache_level)++; + ret = __uniphier_cache_init(next_np, cache_level); + } + of_node_put(next_np); + + return ret; +err: + iounmap(data->op_base); + iounmap(data->rev_base); + iounmap(data->ctrl_base); + kfree(data); + + return ret; +} + +int __init uniphier_cache_init(void) +{ + struct device_node *np = NULL; + unsigned int cache_level; + int ret = 0; + + /* look for level 2 cache */ + while ((np = of_find_matching_node(np, uniphier_cache_match))) + if (!of_property_read_u32(np, "cache-level", &cache_level) && + cache_level == 2) + break; + + if (!np) + return -ENODEV; + + ret = __uniphier_cache_init(np, &cache_level); + of_node_put(np); + + if (ret) { + /* + * Error out iif L2 initialization fails. Continue with any + * error on L3 or outer because they are optional. + */ + if (cache_level == 2) { + pr_err("failed to initialize L2 cache\n"); + return ret; + } + + cache_level--; + ret = 0; + } + + outer_cache.inv_range = uniphier_cache_inv_range; + outer_cache.clean_range = uniphier_cache_clean_range; + outer_cache.flush_range = uniphier_cache_flush_range; + outer_cache.flush_all = uniphier_cache_flush_all; + outer_cache.disable = uniphier_cache_disable; + outer_cache.sync = uniphier_cache_sync; + + uniphier_cache_enable(); + + pr_info("enabled outer cache (cache level: %d)\n", cache_level); + + return ret; +} diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index 845769e41332..c8c8b9ed02e0 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -165,13 +165,28 @@ static void flush_context(unsigned int cpu) __flush_icache_all(); } -static int is_reserved_asid(u64 asid) +static bool check_update_reserved_asid(u64 asid, u64 newasid) { int cpu; - for_each_possible_cpu(cpu) - if (per_cpu(reserved_asids, cpu) == asid) - return 1; - return 0; + bool hit = false; + + /* + * Iterate over the set of reserved ASIDs looking for a match. + * If we find one, then we can update our mm to use newasid + * (i.e. the same ASID in the current generation) but we can't + * exit the loop early, since we need to ensure that all copies + * of the old ASID are updated to reflect the mm. Failure to do + * so could result in us missing the reserved ASID in a future + * generation. 
+ */ + for_each_possible_cpu(cpu) { + if (per_cpu(reserved_asids, cpu) == asid) { + hit = true; + per_cpu(reserved_asids, cpu) = newasid; + } + } + + return hit; } static u64 new_context(struct mm_struct *mm, unsigned int cpu) @@ -181,12 +196,14 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) u64 generation = atomic64_read(&asid_generation); if (asid != 0) { + u64 newasid = generation | (asid & ~ASID_MASK); + /* * If our current ASID was active during a rollover, we * can continue to use it and this was just a false alarm. */ - if (is_reserved_asid(asid)) - return generation | (asid & ~ASID_MASK); + if (check_update_reserved_asid(asid, newasid)) + return newasid; /* * We had a valid ASID in a previous life, so try to re-use @@ -194,7 +211,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) */ asid &= ~ASID_MASK; if (!__test_and_set_bit(asid, asid_map)) - goto bump_gen; + return newasid; } /* @@ -216,11 +233,8 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) __set_bit(asid, asid_map); cur_idx = asid; - -bump_gen: - asid |= generation; cpumask_clear(mm_cpumask(mm)); - return asid; + return asid | generation; } void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1a7815e5421b..deac58d5f1f7 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -42,6 +42,55 @@ #include "dma.h" #include "mm.h" +struct arm_dma_alloc_args { + struct device *dev; + size_t size; + gfp_t gfp; + pgprot_t prot; + const void *caller; + bool want_vaddr; +}; + +struct arm_dma_free_args { + struct device *dev; + size_t size; + void *cpu_addr; + struct page *page; + bool want_vaddr; +}; + +struct arm_dma_allocator { + void *(*alloc)(struct arm_dma_alloc_args *args, + struct page **ret_page); + void (*free)(struct arm_dma_free_args *args); +}; + +struct arm_dma_buffer { + struct list_head list; + void *virt; + struct arm_dma_allocator *allocator; +}; + +static LIST_HEAD(arm_dma_bufs); +static DEFINE_SPINLOCK(arm_dma_bufs_lock); + +static struct arm_dma_buffer *arm_dma_buffer_find(void *virt) +{ + struct arm_dma_buffer *buf, *found = NULL; + unsigned long flags; + + spin_lock_irqsave(&arm_dma_bufs_lock, flags); + list_for_each_entry(buf, &arm_dma_bufs, list) { + if (buf->virt == virt) { + list_del(&buf->list); + found = buf; + break; + } + } + spin_unlock_irqrestore(&arm_dma_bufs_lock, flags); + return found; +} + /* * The DMA API is built upon the notion of "buffer ownership". 
A buffer * is either exclusively owned by the CPU (and therefore may be accessed @@ -592,7 +641,7 @@ static inline pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot) #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv) NULL #define __alloc_from_pool(size, ret_page) NULL #define __alloc_from_contiguous(dev, size, prot, ret, c, wv) NULL -#define __free_from_pool(cpu_addr, size) 0 +#define __free_from_pool(cpu_addr, size) do { } while (0) #define __free_from_contiguous(dev, page, cpu_addr, size, wv) do { } while (0) #define __dma_free_remap(cpu_addr, size) do { } while (0) @@ -610,7 +659,78 @@ static void *__alloc_simple_buffer(struct device *dev, size_t size, gfp_t gfp, return page_address(page); } +static void *simple_allocator_alloc(struct arm_dma_alloc_args *args, + struct page **ret_page) +{ + return __alloc_simple_buffer(args->dev, args->size, args->gfp, + ret_page); +} + +static void simple_allocator_free(struct arm_dma_free_args *args) +{ + __dma_free_buffer(args->page, args->size); +} +static struct arm_dma_allocator simple_allocator = { + .alloc = simple_allocator_alloc, + .free = simple_allocator_free, +}; + +static void *cma_allocator_alloc(struct arm_dma_alloc_args *args, + struct page **ret_page) +{ + return __alloc_from_contiguous(args->dev, args->size, args->prot, + ret_page, args->caller, + args->want_vaddr); +} + +static void cma_allocator_free(struct arm_dma_free_args *args) +{ + __free_from_contiguous(args->dev, args->page, args->cpu_addr, + args->size, args->want_vaddr); +} + +static struct arm_dma_allocator cma_allocator = { + .alloc = cma_allocator_alloc, + .free = cma_allocator_free, +}; + +static void *pool_allocator_alloc(struct arm_dma_alloc_args *args, + struct page **ret_page) +{ + return __alloc_from_pool(args->size, ret_page); +} + +static void pool_allocator_free(struct arm_dma_free_args *args) +{ + __free_from_pool(args->cpu_addr, args->size); +} + +static struct arm_dma_allocator pool_allocator = { + .alloc = pool_allocator_alloc, + .free = pool_allocator_free, +}; + +static void *remap_allocator_alloc(struct arm_dma_alloc_args *args, + struct page **ret_page) +{ + return __alloc_remap_buffer(args->dev, args->size, args->gfp, + args->prot, ret_page, args->caller, + args->want_vaddr); +} + +static void remap_allocator_free(struct arm_dma_free_args *args) +{ + if (args->want_vaddr) + __dma_free_remap(args->cpu_addr, args->size); + + __dma_free_buffer(args->page, args->size); +} + +static struct arm_dma_allocator remap_allocator = { + .alloc = remap_allocator_alloc, + .free = remap_allocator_free, +}; static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pgprot_t prot, bool is_coherent, @@ -619,7 +739,16 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, u64 mask = get_coherent_dma_mask(dev); struct page *page = NULL; void *addr; - bool want_vaddr; + bool allowblock, cma; + struct arm_dma_buffer *buf; + struct arm_dma_alloc_args args = { + .dev = dev, + .size = PAGE_ALIGN(size), + .gfp = gfp, + .prot = prot, + .caller = caller, + .want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs), + }; #ifdef CONFIG_DMA_API_DEBUG u64 limit = (mask + 1) & ~mask; @@ -633,6 +762,10 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, if (!mask) return NULL; + buf = kzalloc(sizeof(*buf), gfp); + if (!buf) + return NULL; + if (mask < 0xffffffffULL) gfp |= GFP_DMA; @@ -644,28 +777,37 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t 
*handle, * platform; see CONFIG_HUGETLBFS. */ gfp &= ~(__GFP_COMP); + args.gfp = gfp; *handle = DMA_ERROR_CODE; - size = PAGE_ALIGN(size); - want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs); - - if (nommu()) - addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT)) - addr = __alloc_from_contiguous(dev, size, prot, &page, - caller, want_vaddr); - else if (is_coherent) - addr = __alloc_simple_buffer(dev, size, gfp, &page); - else if (!(gfp & __GFP_WAIT)) - addr = __alloc_from_pool(size, &page); + allowblock = gfpflags_allow_blocking(gfp); + cma = allowblock ? dev_get_cma_area(dev) : false; + + if (cma) + buf->allocator = &cma_allocator; + else if (nommu() || is_coherent) + buf->allocator = &simple_allocator; + else if (allowblock) + buf->allocator = &remap_allocator; else - addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, - caller, want_vaddr); + buf->allocator = &pool_allocator; + + addr = buf->allocator->alloc(&args, &page); + + if (page) { + unsigned long flags; - if (page) *handle = pfn_to_dma(dev, page_to_pfn(page)); + buf->virt = args.want_vaddr ? addr : page; - return want_vaddr ? addr : page; + spin_lock_irqsave(&arm_dma_bufs_lock, flags); + list_add(&buf->list, &arm_dma_bufs); + spin_unlock_irqrestore(&arm_dma_bufs_lock, flags); + } else { + kfree(buf); + } + + return args.want_vaddr ? addr : page; } /* @@ -741,25 +883,21 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr, bool is_coherent) { struct page *page = pfn_to_page(dma_to_pfn(dev, handle)); - bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs); - - size = PAGE_ALIGN(size); - - if (nommu()) { - __dma_free_buffer(page, size); - } else if (!is_coherent && __free_from_pool(cpu_addr, size)) { + struct arm_dma_buffer *buf; + struct arm_dma_free_args args = { + .dev = dev, + .size = PAGE_ALIGN(size), + .cpu_addr = cpu_addr, + .page = page, + .want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs), + }; + + buf = arm_dma_buffer_find(cpu_addr); + if (WARN(!buf, "Freeing invalid buffer %p\n", cpu_addr)) return; - } else if (!dev_get_cma_area(dev)) { - if (want_vaddr && !is_coherent) - __dma_free_remap(cpu_addr, size); - __dma_free_buffer(page, size); - } else { - /* - * Non-atomic allocations cannot be freed with IRQs disabled - */ - WARN_ON(irqs_disabled()); - __free_from_contiguous(dev, page, cpu_addr, size, want_vaddr); - } + + buf->allocator->free(&args); + kfree(buf); } void arm_dma_free(struct device *dev, size_t size, void *cpu_addr, @@ -1122,6 +1260,9 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping, spin_unlock_irqrestore(&mapping->lock, flags); } +/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */ +static const int iommu_order_array[] = { 9, 8, 4, 0 }; + static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, gfp_t gfp, struct dma_attrs *attrs) { @@ -1129,6 +1270,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, int count = size >> PAGE_SHIFT; int array_size = count * sizeof(struct page *); int i = 0; + int order_idx = 0; if (array_size <= PAGE_SIZE) pages = kzalloc(array_size, GFP_KERNEL); @@ -1154,6 +1296,10 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, return pages; } + /* Go straight to 4K chunks if caller says it's OK. 
*/ + if (dma_get_attr(DMA_ATTR_ALLOC_SINGLE_PAGES, attrs)) + order_idx = ARRAY_SIZE(iommu_order_array) - 1; + /* * IOMMU can map any pages, so himem can also be used here */ @@ -1162,22 +1308,24 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, while (count) { int j, order; - for (order = __fls(count); order > 0; --order) { - /* - * We do not want OOM killer to be invoked as long - * as we can fall back to single pages, so we force - * __GFP_NORETRY for orders higher than zero. - */ - pages[i] = alloc_pages(gfp | __GFP_NORETRY, order); - if (pages[i]) - break; + order = iommu_order_array[order_idx]; + + /* Drop down when we get small */ + if (__fls(count) < order) { + order_idx++; + continue; } - if (!pages[i]) { - /* - * Fall back to single page allocation. - * Might invoke OOM killer as last resort. - */ + if (order) { + /* See if it's easy to allocate a high-order chunk */ + pages[i] = alloc_pages(gfp | __GFP_NORETRY, order); + + /* Go down a notch at first sign of pressure */ + if (!pages[i]) { + order_idx++; + continue; + } + } else { pages[i] = alloc_pages(gfp, 0); if (!pages[i]) goto error; @@ -1200,10 +1348,7 @@ error: while (i--) if (pages[i]) __free_pages(pages[i], 0); - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return NULL; } @@ -1211,7 +1356,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t size, struct dma_attrs *attrs) { int count = size >> PAGE_SHIFT; - int array_size = count * sizeof(struct page *); int i; if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { @@ -1222,10 +1366,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, __free_pages(pages[i], 0); } - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return 0; } @@ -1363,7 +1504,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size, *handle = DMA_ERROR_CODE; size = PAGE_ALIGN(size); - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return __iommu_alloc_atomic(dev, size, handle); /* @@ -1407,12 +1548,19 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, unsigned long uaddr = vma->vm_start; unsigned long usize = vma->vm_end - vma->vm_start; struct page **pages = __iommu_get_pages(cpu_addr, attrs); + unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot); if (!pages) return -ENXIO; + if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off) + return -ENXIO; + + pages += off; + do { int ret = vm_insert_page(vma, uaddr, *pages++); if (ret) { @@ -1514,7 +1662,7 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg, return -ENOMEM; for (count = 0, s = sg; count < (size >> PAGE_SHIFT); s = sg_next(s)) { - phys_addr_t phys = sg_phys(s) & PAGE_MASK; + phys_addr_t phys = page_to_phys(sg_page(s)); unsigned int len = PAGE_ALIGN(s->offset + s->length); if (!is_coherent && diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0d629b8f973f..daafcf121ce0 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -593,6 +593,28 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs) arm_notify_die("", regs, &info, ifsr, 0); } +/* + * Abort handler to be used only during first unmasking of asynchronous aborts + * on the boot CPU. 
This makes sure that the machine will not die if the + * firmware/bootloader left an imprecise abort pending for us to trip over. + */ +static int __init early_abort_handler(unsigned long addr, unsigned int fsr, + struct pt_regs *regs) +{ + pr_warn("Hit pending asynchronous external abort (FSR=0x%08x) during " + "first unmask, this is most likely caused by a " + "firmware/bootloader bug.\n", fsr); + + return 0; +} + +void __init early_abt_enable(void) +{ + fsr_info[22].fn = early_abort_handler; + local_abt_enable(); + fsr_info[22].fn = do_bad; +} + #ifndef CONFIG_ARM_LPAE static int __init exceptions_init(void) { diff --git a/arch/arm/mm/fault.h b/arch/arm/mm/fault.h index cf08bdfbe0d6..05ec5e0df32d 100644 --- a/arch/arm/mm/fault.h +++ b/arch/arm/mm/fault.h @@ -24,5 +24,6 @@ static inline int fsr_fs(unsigned int fsr) void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs); unsigned long search_exception_table(unsigned long addr); +void early_abt_enable(void); #endif /* __ARCH_ARM_FAULT_H */ diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 1ec8e7590fc6..d0ba3551d49a 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page) mapping = page_mapping(page); if (!cache_ops_need_broadcast() && - mapping && !page_mapped(page)) + mapping && !page_mapcount(page)) clear_bit(PG_dcache_clean, &page->flags); else { __flush_dcache_page(mapping, page); @@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l */ __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); } - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifdef CONFIG_HAVE_RCU_TABLE_FREE -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd = pmd_mksplitting(*pmdp); - VM_BUG_ON(address & ~PMD_MASK); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); - - /* dummy IPI to serialise against fast_gup */ - kick_all_cpus_sync(); -} -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 9df5f09585ca..d02f8187b1cc 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -147,13 +147,3 @@ void *kmap_atomic_pfn(unsigned long pfn) return (void *)vaddr; } - -struct page *kmap_atomic_to_page(const void *ptr) -{ - unsigned long vaddr = (unsigned long)ptr; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - return pte_page(get_fixmap_pte(vaddr)); -} diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index e7a81cebbb2e..bd274a05b8ff 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -15,7 +15,7 @@ * page tables. 
*/ pgd_t *idmap_pgd; -phys_addr_t (*arch_virt_to_idmap) (unsigned long x); +unsigned long (*arch_virt_to_idmap)(unsigned long x); #ifdef CONFIG_ARM_LPAE static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, @@ -86,7 +86,7 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; - if (cpu_architecture() <= CPU_ARCH_ARMv5TEJ && !cpu_is_xscale()) + if (cpu_architecture() <= CPU_ARCH_ARMv5TEJ && !cpu_is_xscale_family()) prot |= PMD_BIT4; pgd += pgd_index(addr); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 8a63b4cdc0f2..370581aeb871 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -22,6 +22,7 @@ #include <linux/memblock.h> #include <linux/dma-contiguous.h> #include <linux/sizes.h> +#include <linux/stop_machine.h> #include <asm/cp15.h> #include <asm/mach-types.h> @@ -191,7 +192,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max_low, #ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) { - return memblock_is_memory(__pfn_to_phys(pfn)); + return memblock_is_map_memory(__pfn_to_phys(pfn)); } EXPORT_SYMBOL(pfn_valid); #endif @@ -432,6 +433,9 @@ static void __init free_highpages(void) if (end <= max_low) continue; + if (memblock_is_nomap(mem)) + continue; + /* Truncate partial highmem entries */ if (start < max_low) start = max_low; @@ -568,8 +572,9 @@ void __init mem_init(void) } } -#ifdef CONFIG_ARM_KERNMEM_PERMS +#ifdef CONFIG_DEBUG_RODATA struct section_perm { + const char *name; unsigned long start; unsigned long end; pmdval_t mask; @@ -577,9 +582,13 @@ struct section_perm { pmdval_t clear; }; +/* First section-aligned location at or after __start_rodata. */ +extern char __start_rodata_section_aligned[]; + static struct section_perm nx_perms[] = { /* Make pages tables, etc before _stext RW (set NX). */ { + .name = "pre-text NX", .start = PAGE_OFFSET, .end = (unsigned long)_stext, .mask = ~PMD_SECT_XN, @@ -587,26 +596,26 @@ static struct section_perm nx_perms[] = { }, /* Make init RW (set NX). */ { + .name = "init NX", .start = (unsigned long)__init_begin, .end = (unsigned long)_sdata, .mask = ~PMD_SECT_XN, .prot = PMD_SECT_XN, }, -#ifdef CONFIG_DEBUG_RODATA /* Make rodata NX (set RO in ro_perms below). */ { - .start = (unsigned long)__start_rodata, + .name = "rodata NX", + .start = (unsigned long)__start_rodata_section_aligned, .end = (unsigned long)__init_begin, .mask = ~PMD_SECT_XN, .prot = PMD_SECT_XN, }, -#endif }; -#ifdef CONFIG_DEBUG_RODATA static struct section_perm ro_perms[] = { /* Make kernel code and rodata RX (set RO). */ { + .name = "text/rodata RO", .start = (unsigned long)_stext, .end = (unsigned long)__init_begin, #ifdef CONFIG_ARM_LPAE @@ -619,7 +628,6 @@ static struct section_perm ro_perms[] = { #endif }, }; -#endif /* * Updates section permissions only for the current mm (sections are @@ -627,12 +635,10 @@ static struct section_perm ro_perms[] = { * safe to be called with preemption disabled, as under stop_machine(). 
*/ static inline void section_update(unsigned long addr, pmdval_t mask, - pmdval_t prot) + pmdval_t prot, struct mm_struct *mm) { - struct mm_struct *mm; pmd_t *pmd; - mm = current->active_mm; pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr); #ifdef CONFIG_ARM_LPAE @@ -656,55 +662,86 @@ static inline bool arch_has_strict_perms(void) return !!(get_cr() & CR_XP); } -#define set_section_perms(perms, field) { \ - size_t i; \ - unsigned long addr; \ - \ - if (!arch_has_strict_perms()) \ - return; \ - \ - for (i = 0; i < ARRAY_SIZE(perms); i++) { \ - if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) || \ - !IS_ALIGNED(perms[i].end, SECTION_SIZE)) { \ - pr_err("BUG: section %lx-%lx not aligned to %lx\n", \ - perms[i].start, perms[i].end, \ - SECTION_SIZE); \ - continue; \ - } \ - \ - for (addr = perms[i].start; \ - addr < perms[i].end; \ - addr += SECTION_SIZE) \ - section_update(addr, perms[i].mask, \ - perms[i].field); \ - } \ +void set_section_perms(struct section_perm *perms, int n, bool set, + struct mm_struct *mm) +{ + size_t i; + unsigned long addr; + + if (!arch_has_strict_perms()) + return; + + for (i = 0; i < n; i++) { + if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) || + !IS_ALIGNED(perms[i].end, SECTION_SIZE)) { + pr_err("BUG: %s section %lx-%lx not aligned to %lx\n", + perms[i].name, perms[i].start, perms[i].end, + SECTION_SIZE); + continue; + } + + for (addr = perms[i].start; + addr < perms[i].end; + addr += SECTION_SIZE) + section_update(addr, perms[i].mask, + set ? perms[i].prot : perms[i].clear, mm); + } + +} + +static void update_sections_early(struct section_perm perms[], int n) +{ + struct task_struct *t, *s; + + read_lock(&tasklist_lock); + for_each_process(t) { + if (t->flags & PF_KTHREAD) + continue; + for_each_thread(t, s) + set_section_perms(perms, n, true, s->mm); + } + read_unlock(&tasklist_lock); + set_section_perms(perms, n, true, current->active_mm); + set_section_perms(perms, n, true, &init_mm); } -static inline void fix_kernmem_perms(void) +int __fix_kernmem_perms(void *unused) { - set_section_perms(nx_perms, prot); + update_sections_early(nx_perms, ARRAY_SIZE(nx_perms)); + return 0; +} + +void fix_kernmem_perms(void) +{ + stop_machine(__fix_kernmem_perms, NULL, NULL); +} + +int __mark_rodata_ro(void *unused) +{ + update_sections_early(ro_perms, ARRAY_SIZE(ro_perms)); + return 0; } -#ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - set_section_perms(ro_perms, prot); + stop_machine(__mark_rodata_ro, NULL, NULL); } void set_kernel_text_rw(void) { - set_section_perms(ro_perms, clear); + set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false, + current->active_mm); } void set_kernel_text_ro(void) { - set_section_perms(ro_perms, prot); + set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true, + current->active_mm); } -#endif /* CONFIG_DEBUG_RODATA */ #else static inline void fix_kernmem_perms(void) { } -#endif /* CONFIG_ARM_KERNMEM_PERMS */ +#endif /* CONFIG_DEBUG_RODATA */ void free_tcmmem(void) { diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 0c81056c1dd7..66a978d05958 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -30,6 +30,7 @@ #include <asm/cp15.h> #include <asm/cputype.h> #include <asm/cacheflush.h> +#include <asm/early_ioremap.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -469,3 +470,11 @@ int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr) } EXPORT_SYMBOL_GPL(pci_ioremap_io); #endif + +/* + * Must be called after early_fixmap_init + */ +void __init 
early_ioremap_init(void) +{ + early_ioremap_setup(); +} diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 407dc786583a..4b4058db0781 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -173,8 +173,7 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - /* 8 bits of randomness in 20 address space bits */ - rnd = (unsigned long)get_random_int() % (1 << 8); + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 7cd15143a507..e4b681aafd6d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -38,6 +38,7 @@ #include <asm/mach/pci.h> #include <asm/fixmap.h> +#include "fault.h" #include "mm.h" #include "tcm.h" @@ -389,7 +390,7 @@ void __init early_fixmap_init(void) * The early fixmap range spans multiple pmds, for which * we are not prepared: */ - BUILD_BUG_ON((__fix_to_virt(__end_of_permanent_fixed_addresses) >> PMD_SHIFT) + BUILD_BUG_ON((__fix_to_virt(__end_of_early_ioremap_region) >> PMD_SHIFT) != FIXADDR_TOP >> PMD_SHIFT); pmd = fixmap_pmd(FIXADDR_TOP); @@ -476,7 +477,7 @@ static void __init build_mem_type_table(void) * "update-able on write" bit on ARM610). However, Xscale and * Xscale3 require this bit to be cleared. */ - if (cpu_is_xscale() || cpu_is_xsc3()) { + if (cpu_is_xscale_family()) { for (i = 0; i < ARRAY_SIZE(mem_types); i++) { mem_types[i].prot_sect &= ~PMD_BIT4; mem_types[i].prot_l1 &= ~PMD_BIT4; @@ -571,7 +572,7 @@ static void __init build_mem_type_table(void) * in the Short-descriptor translation table format descriptors. */ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif @@ -723,30 +724,49 @@ static void __init *early_alloc(unsigned long sz) return early_alloc_aligned(sz, sz); } -static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot) +static void *__init late_alloc(unsigned long sz) +{ + void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz)); + + BUG_ON(!ptr); + return ptr; +} + +static pte_t * __init pte_alloc(pmd_t *pmd, unsigned long addr, + unsigned long prot, + void *(*alloc)(unsigned long sz)) { if (pmd_none(*pmd)) { - pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE); + pte_t *pte = alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE); __pmd_populate(pmd, __pa(pte), prot); } BUG_ON(pmd_bad(*pmd)); return pte_offset_kernel(pmd, addr); } +static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, + unsigned long prot) +{ + return pte_alloc(pmd, addr, prot, early_alloc); +} + static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, - const struct mem_type *type) + const struct mem_type *type, + void *(*alloc)(unsigned long sz), + bool ng) { - pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1); + pte_t *pte = pte_alloc(pmd, addr, type->prot_l1, alloc); do { - set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0); + set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), + ng ? 
PTE_EXT_NG : 0); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); } static void __init __map_init_section(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys, - const struct mem_type *type) + const struct mem_type *type, bool ng) { pmd_t *p = pmd; @@ -764,7 +784,7 @@ static void __init __map_init_section(pmd_t *pmd, unsigned long addr, pmd++; #endif do { - *pmd = __pmd(phys | type->prot_sect); + *pmd = __pmd(phys | type->prot_sect | (ng ? PMD_SECT_nG : 0)); phys += SECTION_SIZE; } while (pmd++, addr += SECTION_SIZE, addr != end); @@ -773,7 +793,8 @@ static void __init __map_init_section(pmd_t *pmd, unsigned long addr, static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, - const struct mem_type *type) + const struct mem_type *type, + void *(*alloc)(unsigned long sz), bool ng) { pmd_t *pmd = pmd_offset(pud, addr); unsigned long next; @@ -791,10 +812,10 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, */ if (type->prot_sect && ((addr | next | phys) & ~SECTION_MASK) == 0) { - __map_init_section(pmd, addr, next, phys, type); + __map_init_section(pmd, addr, next, phys, type, ng); } else { alloc_init_pte(pmd, addr, next, - __phys_to_pfn(phys), type); + __phys_to_pfn(phys), type, alloc, ng); } phys += next - addr; @@ -804,21 +825,24 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, - const struct mem_type *type) + const struct mem_type *type, + void *(*alloc)(unsigned long sz), bool ng) { pud_t *pud = pud_offset(pgd, addr); unsigned long next; do { next = pud_addr_end(addr, end); - alloc_init_pmd(pud, addr, next, phys, type); + alloc_init_pmd(pud, addr, next, phys, type, alloc, ng); phys += next - addr; } while (pud++, addr = next, addr != end); } #ifndef CONFIG_ARM_LPAE -static void __init create_36bit_mapping(struct map_desc *md, - const struct mem_type *type) +static void __init create_36bit_mapping(struct mm_struct *mm, + struct map_desc *md, + const struct mem_type *type, + bool ng) { unsigned long addr, length, end; phys_addr_t phys; @@ -858,7 +882,7 @@ static void __init create_36bit_mapping(struct map_desc *md, */ phys |= (((md->pfn >> (32 - PAGE_SHIFT)) & 0xF) << 20); - pgd = pgd_offset_k(addr); + pgd = pgd_offset(mm, addr); end = addr + length; do { pud_t *pud = pud_offset(pgd, addr); @@ -866,7 +890,8 @@ static void __init create_36bit_mapping(struct map_desc *md, int i; for (i = 0; i < 16; i++) - *pmd++ = __pmd(phys | type->prot_sect | PMD_SECT_SUPER); + *pmd++ = __pmd(phys | type->prot_sect | PMD_SECT_SUPER | + (ng ? PMD_SECT_nG : 0)); addr += SUPERSECTION_SIZE; phys += SUPERSECTION_SIZE; @@ -875,33 +900,15 @@ static void __init create_36bit_mapping(struct map_desc *md, } #endif /* !CONFIG_ARM_LPAE */ -/* - * Create the page directory entries and any necessary - * page tables for the mapping specified by `md'. We - * are able to cope here with varying sizes and address - * offsets, and we take full advantage of sections and - * supersections. 
- */ -static void __init create_mapping(struct map_desc *md) +static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md, + void *(*alloc)(unsigned long sz), + bool ng) { unsigned long addr, length, end; phys_addr_t phys; const struct mem_type *type; pgd_t *pgd; - if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) { - pr_warn("BUG: not creating mapping for 0x%08llx at 0x%08lx in user region\n", - (long long)__pfn_to_phys((u64)md->pfn), md->virtual); - return; - } - - if ((md->type == MT_DEVICE || md->type == MT_ROM) && - md->virtual >= PAGE_OFFSET && md->virtual < FIXADDR_START && - (md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) { - pr_warn("BUG: mapping for 0x%08llx at 0x%08lx out of vmalloc space\n", - (long long)__pfn_to_phys((u64)md->pfn), md->virtual); - } - type = &mem_types[md->type]; #ifndef CONFIG_ARM_LPAE @@ -909,7 +916,7 @@ static void __init create_mapping(struct map_desc *md) * Catch 36-bit addresses */ if (md->pfn >= 0x100000) { - create_36bit_mapping(md, type); + create_36bit_mapping(mm, md, type, ng); return; } #endif @@ -924,12 +931,12 @@ static void __init create_mapping(struct map_desc *md) return; } - pgd = pgd_offset_k(addr); + pgd = pgd_offset(mm, addr); end = addr + length; do { unsigned long next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, type); + alloc_init_pud(pgd, addr, next, phys, type, alloc, ng); phys += next - addr; addr = next; @@ -937,6 +944,43 @@ static void __init create_mapping(struct map_desc *md) } /* + * Create the page directory entries and any necessary + * page tables for the mapping specified by `md'. We + * are able to cope here with varying sizes and address + * offsets, and we take full advantage of sections and + * supersections. + */ +static void __init create_mapping(struct map_desc *md) +{ + if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) { + pr_warn("BUG: not creating mapping for 0x%08llx at 0x%08lx in user region\n", + (long long)__pfn_to_phys((u64)md->pfn), md->virtual); + return; + } + + if ((md->type == MT_DEVICE || md->type == MT_ROM) && + md->virtual >= PAGE_OFFSET && md->virtual < FIXADDR_START && + (md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) { + pr_warn("BUG: mapping for 0x%08llx at 0x%08lx out of vmalloc space\n", + (long long)__pfn_to_phys((u64)md->pfn), md->virtual); + } + + __create_mapping(&init_mm, md, early_alloc, false); +} + +void __init create_mapping_late(struct mm_struct *mm, struct map_desc *md, + bool ng) +{ +#ifdef CONFIG_ARM_LPAE + pud_t *pud = pud_alloc(mm, pgd_offset(mm, md->virtual), md->virtual); + if (WARN_ON(!pud)) + return; + pmd_alloc(mm, pud, 0); +#endif + __create_mapping(mm, md, late_alloc, ng); +} + +/* * Create the architecture specific mappings */ void __init iotable_init(struct map_desc *io_desc, int nr) @@ -1209,7 +1253,7 @@ static inline void prepare_page_table(void) #ifdef CONFIG_XIP_KERNEL /* The XIP kernel is mapped in the module area -- skip over it */ - addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK; + addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK; #endif for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE) pmd_clear(pmd_off_k(addr)); @@ -1291,7 +1335,7 @@ static void __init devicemaps_init(const struct machine_desc *mdesc) #ifdef CONFIG_XIP_KERNEL map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK); map.virtual = MODULES_VADDR; - map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK; + map.length = ((unsigned long)_exiprom - map.virtual + 
~SECTION_MASK) & SECTION_MASK; map.type = MT_ROM; create_mapping(&map); #endif @@ -1363,6 +1407,9 @@ static void __init devicemaps_init(const struct machine_desc *mdesc) */ local_flush_tlb_all(); flush_cache_all(); + + /* Enable asynchronous aborts */ + early_abt_enable(); } static void __init kmap_init(void) @@ -1379,7 +1426,11 @@ static void __init kmap_init(void) static void __init map_lowmem(void) { struct memblock_region *reg; +#ifdef CONFIG_XIP_KERNEL + phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE); +#else phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); +#endif phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); /* Map all the lowmem memory banks. */ @@ -1388,6 +1439,9 @@ static void __init map_lowmem(void) phys_addr_t end = start + reg->size; struct map_desc map; + if (memblock_is_nomap(reg)) + continue; + if (end > arm_lowmem_limit) end = arm_lowmem_limit; if (start >= end) diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index cf30daff8932..d19b1ad29b07 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -49,6 +49,9 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } + if (!numpages) + return 0; + if (start < MODULES_VADDR || start >= MODULES_END) return -EINVAL; diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index d65edf717bf7..6f07d2ef4ff2 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -342,11 +342,13 @@ ENTRY(cpu_mohawk_switch_mm) */ .align 5 ENTRY(cpu_mohawk_set_pte_ext) +#ifdef CONFIG_MMU armv3_set_pte_ext mov r0, r0 mcr p15, 0, r0, c7, c10, 1 @ clean D entry mcr p15, 0, r0, c7, c10, 4 @ drain WB ret lr +#endif .globl cpu_mohawk_suspend_size .equ cpu_mohawk_suspend_size, 4 * 6 diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index de2b246fed38..0f8963a7e7d9 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -95,7 +95,7 @@ ENDPROC(cpu_v7_dcache_clean_area) .equ cpu_v7_suspend_size, 4 * 9 #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_v7_do_suspend) - stmfd sp!, {r4 - r10, lr} + stmfd sp!, {r4 - r11, lr} mrc p15, 0, r4, c13, c0, 0 @ FCSE/PID mrc p15, 0, r5, c13, c0, 3 @ User r/o thread ID stmia r0!, {r4 - r5} @@ -112,7 +112,7 @@ ENTRY(cpu_v7_do_suspend) mrc p15, 0, r9, c1, c0, 1 @ Auxiliary control register mrc p15, 0, r10, c1, c0, 2 @ Co-processor access control stmia r0, {r5 - r11} - ldmfd sp!, {r4 - r10, pc} + ldmfd sp!, {r4 - r11, pc} ENDPROC(cpu_v7_do_suspend) ENTRY(cpu_v7_do_resume) @@ -274,10 +274,12 @@ __v7_ca15mp_setup: __v7_b15mp_setup: __v7_ca17mp_setup: mov r10, #0 -1: adr r12, __v7_setup_stack @ the local stack - stmia r12, {r0-r5, lr} @ v7_invalidate_l1 touches r0-r6 +1: adr r0, __v7_setup_stack_ptr + ldr r12, [r0] + add r12, r12, r0 @ the local stack + stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 bl v7_invalidate_l1 - ldmia r12, {r0-r5, lr} + ldmia r12, {r1-r6, lr} #ifdef CONFIG_SMP ALT_SMP(mrc p15, 0, r0, c1, c0, 1) ALT_UP(mov r0, #(1 << 6)) @ fake it for UP @@ -415,10 +417,12 @@ __v7_pj4b_setup: #endif /* CONFIG_CPU_PJ4B */ __v7_setup: - adr r12, __v7_setup_stack @ the local stack - stmia r12, {r0-r5, lr} @ v7_invalidate_l1 touches r0-r6 + adr r0, __v7_setup_stack_ptr + ldr r12, [r0] + add r12, r12, r0 @ the local stack + stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 bl v7_invalidate_l1 - ldmia r12, {r0-r5, lr} + ldmia r12, {r1-r6, lr} __v7_setup_cont: and r0, r9, #0xff000000 @ ARM? 
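
The mmu.c hunks above thread an allocator callback and an "ng" (non-global) flag from __create_mapping() down to the leaf helpers, so one table walker can be fed either from the boot-time allocator (early_alloc) or, via create_mapping_late(), from the normal page allocator (late_alloc), and can mark the resulting entries non-global. What follows is only a minimal userspace sketch of that callback shape, not kernel code; the two-level "table" and the names (boot_alloc, level2_alloc, map_one) are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 4

typedef void *(*alloc_fn)(unsigned long sz);

/* Stand-ins for the kernel's boot-time (memblock) and run-time (page) allocators. */
static void *boot_alloc(unsigned long sz) { return calloc(1, sz); }
static void *late_alloc(unsigned long sz) { return calloc(1, sz); }

/*
 * Populate the second level only if the slot is still empty, using whichever
 * allocator the caller handed down -- the same shape as pte_alloc() above.
 */
static long *level2_alloc(long **level1, int idx, alloc_fn alloc)
{
	if (!level1[idx])
		level1[idx] = alloc(ENTRIES * sizeof(long));
	return level1[idx];
}

static void map_one(long **level1, int idx, int sub, long val,
		    alloc_fn alloc, int ng)
{
	long *l2 = level2_alloc(level1, idx, alloc);

	/* The kernel ORs in PTE_EXT_NG when ng is set; model it as bit 0 here. */
	l2[sub] = val | (ng ? 1 : 0);
}

int main(void)
{
	long *level1[ENTRIES] = { NULL };

	map_one(level1, 0, 1, 0x1000, boot_alloc, 0);	/* "early", global     */
	map_one(level1, 2, 3, 0x2000, late_alloc, 1);	/* "late", non-global  */

	printf("%lx %lx\n", (unsigned long)level1[0][1], (unsigned long)level1[2][3]);
	return 0;
}

Passing the allocator down instead of hard-coding early_alloc() is what lets create_mapping_late() reuse the same walking code once the page allocator is up, and the extra ng flag is what allows those late mappings to be installed non-global (PTE_EXT_NG / PMD_SECT_nG) in a non-init mm.
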
@@ -480,11 +484,16 @@ __errata_finish: orr r0, r0, r6 @ set them THUMB( orr r0, r0, #1 << 30 ) @ Thumb exceptions ret lr @ return to head.S:__ret + + .align 2 +__v7_setup_stack_ptr: + .word PHYS_RELATIVE(__v7_setup_stack, .) ENDPROC(__v7_setup) + .bss .align 2 __v7_setup_stack: - .space 4 * 7 @ 12 registers + .space 4 * 7 @ 7 registers __INITDATA diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 67d9209077c6..7229d8d0be1a 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -12,6 +12,7 @@ */ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/memory.h> #include <asm/v7m.h> #include "proc-macros.S" @@ -97,19 +98,19 @@ __v7m_setup: mov r5, #0x00800000 str r5, [r0, V7M_SCB_SHPR3] @ set PendSV priority - @ SVC to run the kernel in this mode + @ SVC to switch to handler mode. Notice that this requires sp to + @ point to writeable memory because the processor saves + @ some registers to the stack. badr r1, 1f ldr r5, [r12, #11 * 4] @ read the SVC vector entry str r1, [r12, #11 * 4] @ write the temporary SVC vector entry mov r6, lr @ save LR - mov r7, sp @ save SP - ldr sp, =__v7m_setup_stack_top + ldr sp, =init_thread_union + THREAD_START_SP cpsie i svc #0 1: cpsid i str r5, [r12, #11 * 4] @ restore the original SVC vector entry mov lr, r6 @ restore LR - mov sp, r7 @ restore SP @ Special-purpose control register mov r1, #1 @@ -123,11 +124,6 @@ __v7m_setup: ret lr ENDPROC(__v7m_setup) - .align 2 -__v7m_setup_stack: - .space 4 * 8 @ 8 registers -__v7m_setup_stack_top: - define_processor_functions v7m, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1 .section ".rodata" |
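
The proc-v7.S change above moves the temporary setup stack into .bss and reaches it through a self-relative word: __v7_setup_stack_ptr holds PHYS_RELATIVE(__v7_setup_stack, .), and the code adds that offset to the word's own address, so the reference resolves correctly while the code still runs at physical addresses with the MMU off and the stack itself no longer has to live in the text section. Below is a rough C analogue of that self-relative pointer idea, with made-up names (struct blob, blob_data); it illustrates the technique only and is not the kernel code.

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/*
 * A "pointer" stored as an offset relative to its own location: the word stays
 * valid no matter which address the surrounding block is reached through,
 * which is the idea behind __v7_setup_stack_ptr above.
 */
struct blob {
	ptrdiff_t data_off;	/* distance from &data_off to payload */
	char payload[16];
};

static char *blob_data(struct blob *b)
{
	/* Resolve by adding the stored offset to the word's own address. */
	return (char *)&b->data_off + b->data_off;
}

int main(void)
{
	struct blob a;

	a.data_off = a.payload - (char *)&a.data_off;
	strcpy(a.payload, "setup-stack");

	struct blob b = a;	/* copy the whole block to a different address */

	printf("%s %s\n", blob_data(&a), blob_data(&b));
	return 0;
}

The proc-v7m.S hunk is related only in spirit: its private SVC stack is dropped in favour of init_thread_union because, as the new comment in the diff notes, exception entry pushes registers, so sp must point at writable memory rather than at a buffer kept in the text section.
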