From 0b1674638a5c69cbace63278625c199100955490 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 14 Sep 2020 11:23:39 +0300
Subject: ARM: assembler: introduce adr_l, ldr_l and str_l macros

Like arm64, ARM supports position independent code sequences that
produce symbol references with a greater reach than the ordinary
adr/ldr instructions. Since on ARM, the adrl pseudo-instruction is only
supported in ARM mode (and not at all when using Clang), having an
adr_l macro like we do on arm64 is useful, and increases symmetry as
well.

Currently, we use open coded instruction sequences involving literals
and arithmetic operations. Instead, we can use movw/movt pairs on v7
CPUs, circumventing the D-cache entirely.

E.g., on v7+ CPUs, we can emit a PC-relative reference as follows:

       movw         <reg>, #:lower16:<sym> - (1f + 8)
       movt         <reg>, #:upper16:<sym> - (1f + 8)
  1:   add          <reg>, <reg>, pc

For older CPUs, we can emit the literal into a subsection, allowing it
to be emitted out of line while retaining the ability to perform
arithmetic on label offsets. E.g., on pre-v7 CPUs, we can emit a
PC-relative reference as follows:

       ldr          <reg>, 2f
  1:   add          <reg>, <reg>, pc
       .subsection  1
  2:   .long        <sym> - (1b + 8)
       .previous

This is allowed by the assembler because, unlike ordinary sections,
subsections are combined into a single section in the object file, and
so the label references are not true cross-section references that are
visible as relocations. (Subsections have been available in binutils
since 2004 at least, so they should not cause any issues with older
toolchains.)

So use the above to implement the macros mov_l, adr_l, ldr_l and str_l,
all of which will use movw/movt pairs on v7 and later CPUs, and use
PC-relative literals otherwise.

Reviewed-by: Nicolas Pitre
Reviewed-by: Linus Walleij
Signed-off-by: Ard Biesheuvel
---
 arch/arm/include/asm/assembler.h | 84 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
(limited to 'arch/arm/include')

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index feac2c8b86f2..72627c5fb3b2 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -494,4 +494,88 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 #define _ASM_NOKPROBE(entry)
 #endif
 
+	.macro		__adldst_l, op, reg, sym, tmp, c
+	.if		__LINUX_ARM_ARCH__ < 7
+	ldr\c		\tmp, .La\@
+	.subsection	1
+	.align		2
+.La\@:	.long		\sym - .Lpc\@
+	.previous
+	.else
+	.ifnb		\c
+ THUMB(	ittt		\c			)
+	.endif
+	movw\c		\tmp, #:lower16:\sym - .Lpc\@
+	movt\c		\tmp, #:upper16:\sym - .Lpc\@
+	.endif
+
+#ifndef CONFIG_THUMB2_KERNEL
+	.set		.Lpc\@, . + 8		// PC bias
+	.ifc		\op, add
+	add\c		\reg, \tmp, pc
+	.else
+	\op\c		\reg, [pc, \tmp]
+	.endif
+#else
+.Lb\@:	add\c		\tmp, \tmp, pc
+	/*
+	 * In Thumb-2 builds, the PC bias depends on whether we are currently
+	 * emitting into a .arm or a .thumb section. The size of the add opcode
+	 * above will be 2 bytes when emitting in Thumb mode and 4 bytes when
+	 * emitting in ARM mode, so let's use this to account for the bias.
+	 */
+	.set		.Lpc\@, . + (. - .Lb\@)
+
+	.ifnc		\op, add
+	\op\c		\reg, [\tmp]
+	.endif
+#endif
+	.endm
+
+	/*
+	 * mov_l - move a constant value or [relocated] address into a register
+	 */
+	.macro		mov_l, dst:req, imm:req
+	.if		__LINUX_ARM_ARCH__ < 7
+	ldr		\dst, =\imm
+	.else
+	movw		\dst, #:lower16:\imm
+	movt		\dst, #:upper16:\imm
+	.endif
+	.endm
+
+	/*
+	 * adr_l - adr pseudo-op with unlimited range
+	 *
+	 * @dst: destination register
+	 * @sym: name of the symbol
+	 * @cond: conditional opcode suffix
+	 */
+	.macro		adr_l, dst:req, sym:req, cond
+	__adldst_l	add, \dst, \sym, \dst, \cond
+	.endm
+
+	/*
+	 * ldr_l - ldr pseudo-op with unlimited range
+	 *
+	 * @dst: destination register
+	 * @sym: name of the symbol
+	 * @cond: conditional opcode suffix
+	 */
+	.macro		ldr_l, dst:req, sym:req, cond
+	__adldst_l	ldr, \dst, \sym, \dst, \cond
+	.endm
+
+	/*
+	 * str_l - str pseudo-op with unlimited range
+	 *
+	 * @src: source register
+	 * @sym: name of the symbol
+	 * @tmp: mandatory scratch register
+	 * @cond: conditional opcode suffix
+	 */
+	.macro		str_l, src:req, sym:req, tmp:req, cond
+	__adldst_l	str, \src, \sym, \tmp, \cond
+	.endm
+
 #endif /* __ASM_ASSEMBLER_H__ */
-- cgit v1.2.3

From 22f2d23098f7d34fc5142531cffb241d14611684 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 14 Sep 2020 11:24:37 +0300
Subject: ARM: module: add support for place relative relocations

When using the new adr_l/ldr_l/str_l macros to refer to external
symbols from modules, the linker may emit place relative ELF
relocations that need to be fixed up by the module loader. So add
support for these.

Reviewed-by: Nicolas Pitre
Signed-off-by: Ard Biesheuvel
---
 arch/arm/include/asm/elf.h | 5 +++++
 arch/arm/kernel/module.c   | 20 ++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)
(limited to 'arch/arm/include')

diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index b078d992414b..0ac62a54b73c 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -51,6 +51,7 @@ typedef struct user_fp elf_fpregset_t;
 #define R_ARM_NONE		0
 #define R_ARM_PC24		1
 #define R_ARM_ABS32		2
+#define R_ARM_REL32		3
 #define R_ARM_CALL		28
 #define R_ARM_JUMP24		29
 #define R_ARM_TARGET1		38
@@ -58,11 +59,15 @@ typedef struct user_fp elf_fpregset_t;
 #define R_ARM_PREL31		42
 #define R_ARM_MOVW_ABS_NC	43
 #define R_ARM_MOVT_ABS		44
+#define R_ARM_MOVW_PREL_NC	45
+#define R_ARM_MOVT_PREL		46
 #define R_ARM_THM_CALL		10
 #define R_ARM_THM_JUMP24	30
 #define R_ARM_THM_MOVW_ABS_NC	47
 #define R_ARM_THM_MOVT_ABS	48
+#define R_ARM_THM_MOVW_PREL_NC	49
+#define R_ARM_THM_MOVT_PREL	50
 
 /*
  * These are used to set parameters in the core dumps.
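
The module.c hunk that follows applies these new relocation types. As a
rough, self-contained sketch of the same arithmetic (plain user-space C,
little-endian ARM encodings only, no error handling; the function and
variable names are illustrative, not kernel API):

  #include <stdint.h>

  #define R_ARM_REL32           3
  #define R_ARM_MOVW_PREL_NC    45
  #define R_ARM_MOVT_PREL       46

  /*
   * 'place' is the address of the word being fixed up, 'word' is its
   * current contents (which includes the addend), 'sym' is the symbol
   * address. Returns the patched word.
   */
  static uint32_t apply_prel(uint32_t type, uint32_t place, uint32_t word,
                             uint32_t sym)
  {
          uint32_t offset;

          switch (type) {
          case R_ARM_REL32:
                  /* S + A - P: the addend is the value already in place */
                  return word + sym - place;

          case R_ARM_MOVW_PREL_NC:
          case R_ARM_MOVT_PREL:
                  /* reassemble the signed 16-bit addend from imm4:imm12 */
                  offset = ((word & 0xf0000) >> 4) | (word & 0xfff);
                  offset = (offset ^ 0x8000) - 0x8000;    /* sign extend */

                  offset += sym - place;          /* make it place relative */
                  if (type == R_ARM_MOVT_PREL)
                          offset >>= 16;          /* MOVT carries the high half */

                  word &= 0xfff0f000;             /* clear imm4 and imm12 */
                  return word | ((offset & 0xf000) << 4) | (offset & 0x0fff);
          }
          return word;
  }

R_ARM_REL32 is a plain place-relative data word; the MOVW/MOVT pairs need
the immediate split back out of the opcode first, which is why the kernel
hunk below masks with 0xf0000/0xfff and 0xfff0f000.
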
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index e15444b25ca0..beac45e89ba6 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -185,14 +185,24 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, *(u32 *)loc |= offset & 0x7fffffff; break; + case R_ARM_REL32: + *(u32 *)loc += sym->st_value - loc; + break; + case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: + case R_ARM_MOVW_PREL_NC: + case R_ARM_MOVT_PREL: offset = tmp = __mem_to_opcode_arm(*(u32 *)loc); offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); offset = (offset ^ 0x8000) - 0x8000; offset += sym->st_value; - if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) + if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_PREL || + ELF32_R_TYPE(rel->r_info) == R_ARM_MOVW_PREL_NC) + offset -= loc; + if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS || + ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_PREL) offset >>= 16; tmp &= 0xfff0f000; @@ -283,6 +293,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: + case R_ARM_THM_MOVW_PREL_NC: + case R_ARM_THM_MOVT_PREL: upper = __mem_to_opcode_thumb16(*(u16 *)loc); lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); @@ -302,7 +314,11 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, offset = (offset ^ 0x8000) - 0x8000; offset += sym->st_value; - if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) + if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_PREL || + ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVW_PREL_NC) + offset -= loc; + if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS || + ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_PREL) offset >>= 16; upper = (u16)((upper & 0xfbf0) | -- cgit v1.2.3 From 0869f3b9da38889faef2ccafcf675c713d4a3aa8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Sep 2020 10:00:24 +0300 Subject: ARM: p2v: drop redundant 'type' argument from __pv_stub We always pass the same value for 'type' so pull it into the __pv_stub macro itself. Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 99035b5891ef..eb3c8e6e960a 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -183,14 +183,14 @@ extern const void *__pv_table_begin, *__pv_table_end; #define PHYS_OFFSET ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT) #define PHYS_PFN_OFFSET (__pv_phys_pfn_offset) -#define __pv_stub(from,to,instr,type) \ +#define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ " .pushsection .pv_table,\"a\"\n" \ " .long 1b\n" \ " .popsection\n" \ : "=r" (to) \ - : "r" (from), "I" (type)) + : "r" (from), "I" (__PV_BITS_31_24)) #define __pv_stub_mov_hi(t) \ __asm__ volatile("@ __pv_stub_mov\n" \ @@ -217,7 +217,7 @@ static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) phys_addr_t t; if (sizeof(phys_addr_t) == 4) { - __pv_stub(x, t, "add", __PV_BITS_31_24); + __pv_stub(x, t, "add"); } else { __pv_stub_mov_hi(t); __pv_add_carry_stub(x, t); @@ -235,7 +235,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x) * assembler expression receives 32 bit argument * in place where 'r' 32 bit operand is expected. 
*/ - __pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24); + __pv_stub((unsigned long) x, t, "sub"); return t; } -- cgit v1.2.3 From 2730e8eaa4f2baccc03296e0c5ee109c0673fe5f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 18:23:35 +0200 Subject: ARM: p2v: use relative references in patch site arrays Free up a register in the p2v patching code by switching to relative references, which don't require keeping the phys-to-virt displacement live in a register. Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 6 +++--- arch/arm/kernel/phys2virt.S | 18 +++++++----------- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index eb3c8e6e960a..4121662dea5a 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -187,7 +187,7 @@ extern const void *__pv_table_begin, *__pv_table_end; __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "=r" (to) \ : "r" (from), "I" (__PV_BITS_31_24)) @@ -196,7 +196,7 @@ extern const void *__pv_table_begin, *__pv_table_end; __asm__ volatile("@ __pv_stub_mov\n" \ "1: mov %R0, %1\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "=r" (t) \ : "I" (__PV_BITS_7_0)) @@ -206,7 +206,7 @@ extern const void *__pv_table_begin, *__pv_table_end; "1: adds %Q0, %1, %2\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b\n" \ + " .long 1b - .\n" \ " .popsection\n" \ : "+r" (y) \ : "r" (x), "I" (__PV_BITS_31_24) \ diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index 5031e5a2e78b..8e4be15e1559 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -58,9 +58,7 @@ ENDPROC(__fixup_pv_table) .text __fixup_a_pv_table: - adr r0, 3f - ldr r6, [r0] - add r6, r6, r3 + adr_l r6, __pv_offset ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word mov r6, r6, lsr #24 @@ -78,7 +76,8 @@ __fixup_a_pv_table: orr r6, r6, r7, lsl #12 orr r6, #0x4000 b .Lnext -.Lloop: add r7, r3 +.Lloop: add r7, r4 + adds r4, #4 ldrh ip, [r7, #2] ARM_BE8(rev16 ip, ip) tst ip, #0x4000 @@ -108,28 +107,25 @@ ARM_BE8(rev16 ip, ip) moveq r0, #0x400000 @ set bit 22, mov to mvn instruction b .Lnext -.Lloop: ldr ip, [r7, r3] +.Lloop: ldr ip, [r7, r4] bic ip, ip, #PV_IMM8_MASK tst ip, #PV_ROT_MASK @ check the rotation field orrne ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 biceq ip, ip, #PV_BIT22 @ clear bit 22 orreq ip, ip, r0 ARM_BE8(, ror #8) @ mask in offset bits 7-0 (or bit 22) - str ip, [r7, r3] + str ip, [r7, r4] + add r4, r4, #4 #endif .Lnext: cmp r4, r5 - ldrcc r7, [r4], #4 @ use branch for delay slot + ldrcc r7, [r4] @ use branch for delay slot bcc .Lloop ret lr ENDPROC(__fixup_a_pv_table) - .align -3: .long __pv_offset - ENTRY(fixup_pv_table) stmfd sp!, {r4 - r7, lr} - mov r3, #0 @ no offset mov r4, r0 @ r0 = table start add r5, r0, r1 @ r1 = table size bl __fixup_a_pv_table -- cgit v1.2.3 From e8e00f5afb087912fb3edb225ee373aa6499bb79 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 20 Sep 2020 23:02:25 +0200 Subject: ARM: p2v: switch to MOVW for Thumb2 and ARM/LPAE In preparation for reducing the phys-to-virt minimum relative alignment from 16 MiB to 2 MiB, switch to patchable sequences involving MOVW instructions that 
can more easily be manipulated to carry a 12-bit immediate. Note that the non-LPAE ARM sequence is not updated: MOVW may not be supported on non-LPAE platforms, and the sequence itself can be updated more easily to apply the 12 bits of displacement. For Thumb2, which has many more versions of opcodes, switch to a sequence that can be patched by the same patching code for both versions. Note that the Thumb2 opcodes for MOVW and MVN are unambiguous, and have no rotation bits in their immediate fields, so there is no need to use placeholder constants in the asm blocks. While at it, drop the 'volatile' qualifiers from the asm blocks: the code does not have any side effects that are invisible to the compiler, so it is free to omit these sequences if the outputs are not used. Suggested-by: Russell King Acked-by: Nicolas Pitre Reviewed-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/memory.h | 44 +++++++++---- arch/arm/kernel/phys2virt.S | 147 +++++++++++++++++++++++++++++++++--------- 2 files changed, 148 insertions(+), 43 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 4121662dea5a..ccf55cef6ab9 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -183,6 +183,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define PHYS_OFFSET ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT) #define PHYS_PFN_OFFSET (__pv_phys_pfn_offset) +#ifndef CONFIG_THUMB2_KERNEL #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ @@ -192,25 +193,45 @@ extern const void *__pv_table_begin, *__pv_table_end; : "=r" (to) \ : "r" (from), "I" (__PV_BITS_31_24)) -#define __pv_stub_mov_hi(t) \ - __asm__ volatile("@ __pv_stub_mov\n" \ - "1: mov %R0, %1\n" \ +#define __pv_add_carry_stub(x, y) \ + __asm__("@ __pv_add_carry_stub\n" \ + "0: movw %R0, #0\n" \ + " adds %Q0, %1, %R0, lsl #24\n" \ + "1: mov %R0, %2\n" \ + " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 0b - ., 1b - .\n" \ " .popsection\n" \ - : "=r" (t) \ - : "I" (__PV_BITS_7_0)) + : "=&r" (y) \ + : "r" (x), "I" (__PV_BITS_7_0) \ + : "cc") + +#else +#define __pv_stub(from,to,instr) \ + __asm__("@ __pv_stub\n" \ + "0: movw %0, #0\n" \ + " lsl %0, #24\n" \ + " " instr " %0, %1, %0\n" \ + " .pushsection .pv_table,\"a\"\n" \ + " .long 0b - .\n" \ + " .popsection\n" \ + : "=&r" (to) \ + : "r" (from)) #define __pv_add_carry_stub(x, y) \ - __asm__ volatile("@ __pv_add_carry_stub\n" \ - "1: adds %Q0, %1, %2\n" \ + __asm__("@ __pv_add_carry_stub\n" \ + "0: movw %R0, #0\n" \ + " lsls %R0, #24\n" \ + " adds %Q0, %1, %R0\n" \ + "1: mvn %R0, #0\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 0b - ., 1b - .\n" \ " .popsection\n" \ - : "+r" (y) \ - : "r" (x), "I" (__PV_BITS_31_24) \ + : "=&r" (y) \ + : "r" (x) \ : "cc") +#endif static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) { @@ -219,7 +240,6 @@ static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) if (sizeof(phys_addr_t) == 4) { __pv_stub(x, t, "add"); } else { - __pv_stub_mov_hi(t); __pv_add_carry_stub(x, t); } return t; diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index be8fb0d89877..a4e364689663 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 1994-2002 Russell King - * Copyright (c) 2003 ARM Limited + * 
Copyright (c) 2003, 2020 ARM Limited * All Rights Reserved */ @@ -58,55 +58,140 @@ __fixup_a_pv_table: mov r6, r6, lsr #24 cmn r0, #1 #ifdef CONFIG_THUMB2_KERNEL + @ + @ The Thumb-2 versions of the patchable sequences are + @ + @ phys-to-virt: movw , #offset<31:24> + @ lsl , #24 + @ sub , , + @ + @ virt-to-phys (non-LPAE): movw , #offset<31:24> + @ lsl , #24 + @ add , , + @ + @ virt-to-phys (LPAE): movw , #offset<31:24> + @ lsl , #24 + @ adds , , + @ mov , #offset<39:32> + @ adc , , #0 + @ + @ In the non-LPAE case, all patchable instructions are MOVW + @ instructions, where we need to patch in the offset into the + @ second halfword of the opcode (the 16-bit immediate is encoded + @ as imm4:i:imm3:imm8) + @ + @ 15 11 10 9 4 3 0 15 14 12 11 8 7 0 + @ +-----------+---+-------------+------++---+------+----+------+ + @ MOVW | 1 1 1 1 0 | i | 1 0 0 1 0 0 | imm4 || 0 | imm3 | Rd | imm8 | + @ +-----------+---+-------------+------++---+------+----+------+ + @ + @ In the LPAE case, we also need to patch in the high word of the + @ offset into the immediate field of the MOV instruction, or patch it + @ to a MVN instruction if the offset is negative. In this case, we + @ need to inspect the first halfword of the opcode, to check whether + @ it is MOVW or MOV/MVN, and to perform the MOV to MVN patching if + @ needed. The encoding of the immediate is rather complex for values + @ of i:imm3 != 0b0000, but fortunately, we never need more than 8 lower + @ order bits, which can be patched into imm8 directly (and i:imm3 + @ cleared) + @ + @ 15 11 10 9 5 0 15 14 12 11 8 7 0 + @ +-----------+---+---------------------++---+------+----+------+ + @ MOV | 1 1 1 1 0 | i | 0 0 0 1 0 0 1 1 1 1 || 0 | imm3 | Rd | imm8 | + @ MVN | 1 1 1 1 0 | i | 0 0 0 1 1 0 1 1 1 1 || 0 | imm3 | Rd | imm8 | + @ +-----------+---+---------------------++---+------+----+------+ + @ moveq r0, #0x200000 @ set bit 21, mov to mvn instruction - lsls r6, #24 - beq .Lnext - clz r7, r6 - lsr r6, #24 - lsl r6, r7 - bic r6, #0x0080 - lsrs r7, #1 - orrcs r6, #0x0080 - orr r6, r6, r7, lsl #12 - orr r6, #0x4000 b .Lnext .Lloop: add r7, r4 - adds r4, #4 - ldrh ip, [r7, #2] -ARM_BE8(rev16 ip, ip) - tst ip, #0x4000 - and ip, #0x8f00 - orrne ip, r6 @ mask in offset bits 31-24 - orreq ip, r0 @ mask in offset bits 7-0 -ARM_BE8(rev16 ip, ip) - strh ip, [r7, #2] - bne .Lnext + adds r4, #4 @ clears Z flag +#ifdef CONFIG_ARM_LPAE ldrh ip, [r7] ARM_BE8(rev16 ip, ip) - bic ip, #0x20 - orr ip, ip, r0, lsr #16 + tst ip, #0x200 @ MOVW has bit 9 set, MVN has it clear + bne 0f @ skip to MOVW handling (Z flag is clear) + bic ip, #0x20 @ clear bit 5 (MVN -> MOV) + orr ip, ip, r0, lsr #16 @ MOV -> MVN if offset < 0 ARM_BE8(rev16 ip, ip) strh ip, [r7] + @ Z flag is set +0: +#endif + ldrh ip, [r7, #2] +ARM_BE8(rev16 ip, ip) + and ip, #0xf00 @ clear everything except Rd field + orreq ip, r0 @ Z flag set -> MOV/MVN -> patch in high bits + orrne ip, r6 @ Z flag clear -> MOVW -> patch in low bits +ARM_BE8(rev16 ip, ip) + strh ip, [r7, #2] #else #ifdef CONFIG_CPU_ENDIAN_BE8 @ in BE8, we load data in BE, but instructions still in LE -#define PV_BIT22 0x00004000 +#define PV_BIT24 0x00000001 #define PV_IMM8_MASK 0xff000000 -#define PV_ROT_MASK 0x000f0000 #else -#define PV_BIT22 0x00400000 +#define PV_BIT24 0x01000000 #define PV_IMM8_MASK 0x000000ff -#define PV_ROT_MASK 0xf00 #endif + @ + @ The ARM versions of the patchable sequences are + @ + @ phys-to-virt: sub , , #offset<31:24>, lsl #24 + @ + @ virt-to-phys (non-LPAE): add , , #offset<31:24>, lsl #24 + @ + @ virt-to-phys 
(LPAE): movw , #offset<31:24> + @ adds , , , lsl #24 + @ mov , #offset<39:32> + @ adc , , #0 + @ + @ In the non-LPAE case, all patchable instructions are ADD or SUB + @ instructions, where we need to patch in the offset into the + @ immediate field of the opcode, which is emitted with the correct + @ rotation value. (The effective value of the immediate is imm12<7:0> + @ rotated right by [2 * imm12<11:8>] bits) + @ + @ 31 28 27 23 22 20 19 16 15 12 11 0 + @ +------+-----------------+------+------+-------+ + @ ADD | cond | 0 0 1 0 1 0 0 0 | Rn | Rd | imm12 | + @ SUB | cond | 0 0 1 0 0 1 0 0 | Rn | Rd | imm12 | + @ MOV | cond | 0 0 1 1 1 0 1 0 | Rn | Rd | imm12 | + @ MVN | cond | 0 0 1 1 1 1 1 0 | Rn | Rd | imm12 | + @ +------+-----------------+------+------+-------+ + @ + @ In the LPAE case, we use a MOVW instruction to carry the low offset + @ word, and patch in the high word of the offset into the immediate + @ field of the subsequent MOV instruction, or patch it to a MVN + @ instruction if the offset is negative. We can distinguish MOVW + @ instructions based on bits 23:22 of the opcode, and ADD/SUB can be + @ distinguished from MOV/MVN (all using the encodings above) using + @ bit 24. + @ + @ 31 28 27 23 22 20 19 16 15 12 11 0 + @ +------+-----------------+------+------+-------+ + @ MOVW | cond | 0 0 1 1 0 0 0 0 | imm4 | Rd | imm12 | + @ +------+-----------------+------+------+-------+ + @ moveq r0, #0x400000 @ set bit 22, mov to mvn instruction b .Lnext .Lloop: ldr ip, [r7, r4] +#ifdef CONFIG_ARM_LPAE + tst ip, #PV_BIT24 @ ADD/SUB have bit 24 clear + beq 1f +ARM_BE8(rev ip, ip) + tst ip, #0xc00000 @ MOVW has bits 23:22 clear + bic ip, ip, #0x400000 @ clear bit 22 + bfc ip, #0, #12 @ clear imm12 field of MOV[W] instruction + orreq ip, ip, r6 @ MOVW -> mask in offset bits 31-24 + orrne ip, ip, r0 @ MOV -> mask in offset bits 7-0 (or bit 22) +ARM_BE8(rev ip, ip) + b 2f +1: +#endif bic ip, ip, #PV_IMM8_MASK - tst ip, #PV_ROT_MASK @ check the rotation field - orrne ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 - biceq ip, ip, #PV_BIT22 @ clear bit 22 - orreq ip, ip, r0 ARM_BE8(, ror #8) @ mask in offset bits 7-0 (or bit 22) + orr ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 +2: str ip, [r7, r4] add r4, r4, #4 #endif -- cgit v1.2.3 From 9443076e4330a14ae2c6114307668b98a8293b77 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Sep 2020 11:55:42 +0300 Subject: ARM: p2v: reduce p2v alignment requirement to 2 MiB The ARM kernel's linear map starts at PAGE_OFFSET, which maps to a physical address (PHYS_OFFSET) that is platform specific, and is discovered at boot. Since we don't want to slow down translations between physical and virtual addresses by keeping the offset in a variable in memory, we implement this by patching the code performing the translation, and putting the offset between PAGE_OFFSET and the start of physical RAM directly into the instruction opcodes. As we only patch up to 8 bits of offset, yielding 4 GiB >> 8 == 16 MiB of granularity, we have to round up PHYS_OFFSET to the next multiple if the start of physical RAM is not a multiple of 16 MiB. This wastes some physical RAM, since the memory that was skipped will now live below PAGE_OFFSET, making it inaccessible to the kernel. 
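
For a hypothetical illustration (addresses made up for this example): if
the first byte of DRAM sat at physical address 0x6fe00000, the current
16 MiB granularity would round PHYS_OFFSET up to 0x70000000 and hide the
2 MiB below it, whereas 2 MiB granularity lets PHYS_OFFSET be 0x6fe00000
and wastes nothing.
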
We can improve this by changing the patchable sequences and the patching logic to carry more bits of offset: 11 bits gives us 4 GiB >> 11 == 2 MiB of granularity, and so we will never waste more than that amount by rounding up the physical start of DRAM to the next multiple of 2 MiB. (Note that 2 MiB granularity guarantees that the linear mapping can be created efficiently, whereas less than 2 MiB may result in the linear mapping needing another level of page tables) This helps Zhen Lei's scenario, where the start of DRAM is known to be occupied. It also helps EFI boot, which relies on the firmware's page allocator to allocate space for the decompressed kernel as low as possible. And if the KASLR patches ever land for 32-bit, it will give us 3 more bits of randomization of the placement of the kernel inside the linear region. For the ARM code path, it simply comes down to using two add/sub instructions instead of one for the carryless version, and patching each of them with the correct immediate depending on the rotation field. For the LPAE calculation, which has to deal with a carry, it patches the MOVW instruction with up to 12 bits of offset (but we only need 11 bits anyway) For the Thumb2 code path, patching more than 11 bits of displacement would be somewhat cumbersome, but the 11 bits we need fit nicely into the second word of the u16[2] opcode, so we simply update the immediate assignment and the left shift to create an addend of the right magnitude. Suggested-by: Zhen Lei Acked-by: Nicolas Pitre Acked-by: Linus Walleij Signed-off-by: Ard Biesheuvel --- arch/arm/Kconfig | 2 +- arch/arm/include/asm/memory.h | 13 ++++++++----- arch/arm/kernel/phys2virt.S | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 20 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fe2f17eb2b50..d8d62b8e72e4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -243,7 +243,7 @@ config ARM_PATCH_PHYS_VIRT kernel in system memory. This can only be used with non-XIP MMU kernels where the base - of physical memory is at a 16MB boundary. + of physical memory is at a 2 MiB boundary. Only disable this option if you know that you do not require this feature (eg, building a kernel for a single machine) and diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index ccf55cef6ab9..2611be35f26b 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -173,6 +173,7 @@ extern unsigned long vectors_base; * so that all we need to do is modify the 8-bit constant field. 
*/ #define __PV_BITS_31_24 0x81000000 +#define __PV_BITS_23_16 0x810000 #define __PV_BITS_7_0 0x81 extern unsigned long __pv_phys_pfn_offset; @@ -187,16 +188,18 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "1: " instr " %0, %1, %2\n" \ + "2: " instr " %0, %0, %3\n" \ " .pushsection .pv_table,\"a\"\n" \ - " .long 1b - .\n" \ + " .long 1b - ., 2b - .\n" \ " .popsection\n" \ : "=r" (to) \ - : "r" (from), "I" (__PV_BITS_31_24)) + : "r" (from), "I" (__PV_BITS_31_24), \ + "I"(__PV_BITS_23_16)) #define __pv_add_carry_stub(x, y) \ __asm__("@ __pv_add_carry_stub\n" \ "0: movw %R0, #0\n" \ - " adds %Q0, %1, %R0, lsl #24\n" \ + " adds %Q0, %1, %R0, lsl #20\n" \ "1: mov %R0, %2\n" \ " adc %R0, %R0, #0\n" \ " .pushsection .pv_table,\"a\"\n" \ @@ -210,7 +213,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_stub(from,to,instr) \ __asm__("@ __pv_stub\n" \ "0: movw %0, #0\n" \ - " lsl %0, #24\n" \ + " lsl %0, #21\n" \ " " instr " %0, %1, %0\n" \ " .pushsection .pv_table,\"a\"\n" \ " .long 0b - .\n" \ @@ -221,7 +224,7 @@ extern const void *__pv_table_begin, *__pv_table_end; #define __pv_add_carry_stub(x, y) \ __asm__("@ __pv_add_carry_stub\n" \ "0: movw %R0, #0\n" \ - " lsls %R0, #24\n" \ + " lsls %R0, #21\n" \ " adds %Q0, %1, %R0\n" \ "1: mvn %R0, #0\n" \ " adc %R0, %R0, #0\n" \ diff --git a/arch/arm/kernel/phys2virt.S b/arch/arm/kernel/phys2virt.S index a4e364689663..fb53db78fe78 100644 --- a/arch/arm/kernel/phys2virt.S +++ b/arch/arm/kernel/phys2virt.S @@ -21,7 +21,7 @@ /* * __fixup_pv_table - patch the stub instructions with the delta between * PHYS_OFFSET and PAGE_OFFSET, which is assumed to be - * 16MiB aligned. + * 2 MiB aligned. * * Called from head.S, which expects the following registers to be preserved: * r1 = machine no, r2 = atags or dtb, @@ -38,8 +38,8 @@ ENTRY(__fixup_pv_table) strcc ip, [r0, #HIGH_OFFSET] @ save to __pv_offset high bits str r3, [r0, #LOW_OFFSET] @ save to __pv_offset low bits - mov r0, r3, lsr #24 @ constant for add/sub instructions - teq r3, r0, lsl #24 @ must be 16MiB aligned + mov r0, r3, lsr #21 @ constant for add/sub instructions + teq r3, r0, lsl #21 @ must be 2 MiB aligned bne 0f adr_l r4, __pv_table_begin @@ -55,22 +55,21 @@ __fixup_a_pv_table: adr_l r6, __pv_offset ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word - mov r6, r6, lsr #24 cmn r0, #1 #ifdef CONFIG_THUMB2_KERNEL @ @ The Thumb-2 versions of the patchable sequences are @ - @ phys-to-virt: movw , #offset<31:24> - @ lsl , #24 + @ phys-to-virt: movw , #offset<31:21> + @ lsl , #21 @ sub , , @ - @ virt-to-phys (non-LPAE): movw , #offset<31:24> - @ lsl , #24 + @ virt-to-phys (non-LPAE): movw , #offset<31:21> + @ lsl , #21 @ add , , @ - @ virt-to-phys (LPAE): movw , #offset<31:24> - @ lsl , #24 + @ virt-to-phys (LPAE): movw , #offset<31:21> + @ lsl , #21 @ adds , , @ mov , #offset<39:32> @ adc , , #0 @@ -102,6 +101,9 @@ __fixup_a_pv_table: @ +-----------+---+---------------------++---+------+----+------+ @ moveq r0, #0x200000 @ set bit 21, mov to mvn instruction + lsrs r3, r6, #29 @ isolate top 3 bits of displacement + ubfx r6, r6, #21, #8 @ put bits 28:21 into the MOVW imm8 field + bfi r6, r3, #12, #3 @ put bits 31:29 into the MOVW imm3 field b .Lnext .Lloop: add r7, r4 adds r4, #4 @ clears Z flag @@ -129,20 +131,24 @@ ARM_BE8(rev16 ip, ip) @ in BE8, we load data in BE, but instructions still in LE #define PV_BIT24 0x00000001 #define PV_IMM8_MASK 0xff000000 +#define 
PV_IMMR_MSB 0x00080000 #else #define PV_BIT24 0x01000000 #define PV_IMM8_MASK 0x000000ff +#define PV_IMMR_MSB 0x00000800 #endif @ @ The ARM versions of the patchable sequences are @ @ phys-to-virt: sub , , #offset<31:24>, lsl #24 + @ sub , , #offset<23:16>, lsl #16 @ @ virt-to-phys (non-LPAE): add , , #offset<31:24>, lsl #24 + @ add , , #offset<23:16>, lsl #16 @ - @ virt-to-phys (LPAE): movw , #offset<31:24> - @ adds , , , lsl #24 + @ virt-to-phys (LPAE): movw , #offset<31:20> + @ adds , , , lsl #20 @ mov , #offset<39:32> @ adc , , #0 @ @@ -174,6 +180,9 @@ ARM_BE8(rev16 ip, ip) @ +------+-----------------+------+------+-------+ @ moveq r0, #0x400000 @ set bit 22, mov to mvn instruction + mov r3, r6, lsr #16 @ put offset bits 31-16 into r3 + mov r6, r6, lsr #24 @ put offset bits 31-24 into r6 + and r3, r3, #0xf0 @ only keep offset bits 23-20 in r3 b .Lnext .Lloop: ldr ip, [r7, r4] #ifdef CONFIG_ARM_LPAE @@ -183,14 +192,17 @@ ARM_BE8(rev ip, ip) tst ip, #0xc00000 @ MOVW has bits 23:22 clear bic ip, ip, #0x400000 @ clear bit 22 bfc ip, #0, #12 @ clear imm12 field of MOV[W] instruction - orreq ip, ip, r6 @ MOVW -> mask in offset bits 31-24 + orreq ip, ip, r6, lsl #4 @ MOVW -> mask in offset bits 31-24 + orreq ip, ip, r3, lsr #4 @ MOVW -> mask in offset bits 23-20 orrne ip, ip, r0 @ MOV -> mask in offset bits 7-0 (or bit 22) ARM_BE8(rev ip, ip) b 2f 1: #endif + tst ip, #PV_IMMR_MSB @ rotation value >= 16 ? bic ip, ip, #PV_IMM8_MASK - orr ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 + orreq ip, ip, r6 ARM_BE8(, lsl #24) @ mask in offset bits 31-24 + orrne ip, ip, r3 ARM_BE8(, lsl #24) @ mask in offset bits 23-20 2: str ip, [r7, r4] add r4, r4, #4 -- cgit v1.2.3 From 450abd38fe6c6313ce9bdd9dce81c1dd604f6fb0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 14 Sep 2020 11:48:20 +0300 Subject: ARM: kernel: use relative references for UP/SMP alternatives Currently, the .alt.smp.init section contains the virtual addresses of the patch sites. Since patching may occur both before and after switching into virtual mode, this requires some manual handling of the address when applying the UP alternative. Let's simplify this by using relative offsets in the table entries: this allows us to simply add each entry's address to its contents, regardless of whether we are running in virtual mode or not. Reviewed-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/assembler.h | 4 ++-- arch/arm/include/asm/processor.h | 2 +- arch/arm/kernel/head.S | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 72627c5fb3b2..6ed30421f697 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -259,7 +259,7 @@ */ #define ALT_UP(instr...) \ .pushsection ".alt.smp.init", "a" ;\ - .long 9998b ;\ + .long 9998b - . ;\ 9997: instr ;\ .if . - 9997b == 2 ;\ nop ;\ @@ -270,7 +270,7 @@ .popsection #define ALT_UP_B(label) \ .pushsection ".alt.smp.init", "a" ;\ - .long 9998b ;\ + .long 9998b - . ;\ W(b) . 
+ (label - 9998b) ;\ .popsection #else diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index b9241051e5cb..9e6b97286307 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -96,7 +96,7 @@ unsigned long get_wchan(struct task_struct *p); #define __ALT_SMP_ASM(smp, up) \ "9998: " smp "\n" \ " .pushsection \".alt.smp.init\", \"a\"\n" \ - " .long 9998b\n" \ + " .long 9998b - .\n" \ " " up "\n" \ " .popsection\n" #else diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 478506b2d51f..cdc79fcee43e 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -546,14 +546,15 @@ smp_on_up: __do_fixup_smp_on_up: cmp r4, r5 reths lr - ldmia r4!, {r0, r6} - ARM( str r6, [r0, r3] ) - THUMB( add r0, r0, r3 ) + ldmia r4, {r0, r6} + ARM( str r6, [r0, r4] ) + THUMB( add r0, r0, r4 ) + add r4, r4, #8 #ifdef __ARMEB__ THUMB( mov r6, r6, ror #16 ) @ Convert word order for big-endian. #endif THUMB( strh r6, [r0], #2 ) @ For Thumb-2, store as two halfwords - THUMB( mov r6, r6, lsr #16 ) @ to be robust against misaligned r3. + THUMB( mov r6, r6, lsr #16 ) @ to be robust against misaligned r0. THUMB( strh r6, [r0] ) b __do_fixup_smp_on_up ENDPROC(__do_fixup_smp_on_up) @@ -562,7 +563,6 @@ ENTRY(fixup_smp) stmfd sp!, {r4 - r6, lr} mov r4, r0 add r5, r0, r1 - mov r3, #0 bl __do_fixup_smp_on_up ldmfd sp!, {r4 - r6, pc} ENDPROC(fixup_smp) -- cgit v1.2.3
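
The patch site tables touched in this series (.pv_table and .alt.smp.init)
end up using the same pattern: every entry is emitted as ".long target - .",
i.e. the distance from the entry to its patch site. The kernel consumes
these tables in assembly (phys2virt.S, head.S); the following is only a C
sketch of the walk, using an illustrative array type for the section bounds
(asm/memory.h actually declares them as 'const void *'):

  #include <stdint.h>

  /* section bounds around .pv_table, declared in asm/memory.h */
  extern const int32_t __pv_table_begin[], __pv_table_end[];

  static void for_each_patch_site(void (*patch)(uint32_t *insn))
  {
          const int32_t *rel;

          for (rel = __pv_table_begin; rel < __pv_table_end; rel++) {
                  /*
                   * Each entry holds "site - &entry", so adding the entry's
                   * own address recovers the address of the patch site.
                   */
                  uint32_t *site = (uint32_t *)((char *)rel + *rel);

                  patch(site);
          }
  }

Because every entry is self-relative, the same walk works whether the code
is currently running from the physical or the virtual alias of the kernel
image, which is what allows the fixup loops above to drop the extra offset
register they used to carry.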